From 169e01b87f62f01f7bd56c86ee354ea602c8ea64 Mon Sep 17 00:00:00 2001 From: Asaf Yehudai Date: Wed, 11 Feb 2026 10:37:54 +0200 Subject: [PATCH 1/2] Update to v0.2 --- .../623bae1f-19e9-47f9-bc7b-80a859218d07.json | 37 +++++- .../fbba98c5-5d56-4837-9044-d4e5ac610c2c.json | 37 +++++- .../dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json | 37 +++++- .../c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json | 37 +++++- .../3101726d-fd51-436d-8adf-cbdf0d534834.json | 42 ++++++- .../f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json | 32 ++++- .../904c6359-bd7b-4448-9f16-bc115d0629c4.json | 42 ++++++- .../49511052-6881-4151-9b46-686c75f73c22.json | 42 ++++++- .../b289e2e6-d57b-4a2b-aa61-e2974d193909.json | 37 +++++- .../aeeca919-71a1-42a0-a6d0-6779d77750e6.json | 32 ++++- .../db29538d-f40e-42d0-b3c0-e622f92112d2.json | 42 ++++++- .../ab0cdc4f-47dd-4dcc-b506-982ce3924105.json | 37 +++++- .../44da63b6-d934-4330-bc20-33464bae61dd.json | 42 ++++++- .../c930cbe0-f429-4b61-9abe-86dcb7266cf7.json | 42 ++++++- .../c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json | 32 ++++- .../73ee9408-e669-4b8a-9419-76bd6051ce8d.json | 32 ++++- .../0deed2f4-770e-4033-a65d-e1da19e00611.json | 42 ++++++- .../e727cb77-f229-4aaa-909f-99c7aa06676b.json | 37 +++++- .../da9264cd-2fa3-4121-81de-eef994e15993.json | 37 +++++- .../79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json | 42 ++++++- .../28c35831-679d-489a-b2c4-fd2c7f333fbc.json | 42 ++++++- .../9db7907d-7b22-480c-86a5-f88ec2b302e7.json | 37 +++++- .../2faddf79-41e6-47e9-9c26-17bc987bc870.json | 37 +++++- .../20989a47-6556-4e3b-8909-d0a419cb159b.json | 37 +++++- .../f3d0010f-efed-4f87-9582-b9c87b4de99a.json | 37 +++++- .../a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json | 37 +++++- .../d54c4830-23c8-4c12-aea1-4f5b5245464f.json | 37 +++++- .../b5853278-edd9-4bc8-bbeb-d6dab515b562.json | 37 +++++- .../74188e30-1e49-47d8-af01-b80e430dafa0.json | 37 +++++- .../93974286-0497-46a2-a2e8-404c1e89dba0.json | 37 +++++- .../02c0020c-7d69-4701-a606-4bc79ad87afd.json | 37 +++++- .../5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json | 37 +++++- .../4887256e-0545-40dd-9756-ff850e003a29.json | 37 +++++- .../d2b70870-9cbc-4666-bbd4-097fcebe716e.json | 37 +++++- .../f420f432-2291-40a9-8ebd-b91241970113.json | 37 +++++- .../02e68d1b-86f3-4344-ad8d-45df878b744c.json | 42 ++++++- .../f712ab4a-1127-44ba-b6b9-7a40290f3322.json | 42 ++++++- .../b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json | 37 +++++- .../9879e9a7-ddbc-4338-abc7-e3bc394869e9.json | 37 +++++- .../d7d8a5cb-e295-4ced-b528-d99d814ff008.json | 37 +++++- .../bff86a1f-71c3-4f27-aeae-bba6d03635ef.json | 37 +++++- .../723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json | 37 +++++- .../0ce7dc54-f608-4985-9904-75cee09b6288.json | 32 ++++- .../5bb0aaa4-2cc5-4622-8235-993bc4178f12.json | 32 ++++- .../85ab22b8-0587-4e2b-857f-3d6d84d571a4.json | 42 ++++++- .../37aa6702-b2fa-43bf-b5a9-36740f627217.json | 32 ++++- .../57f48d0c-e424-410d-b9ee-4707e2add036.json | 32 ++++- .../8643b4dd-e18c-442c-adb5-84ef756534f8.json | 42 ++++++- .../2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json | 37 +++++- .../4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json | 42 ++++++- .../f9b60945-8b14-4564-9d44-3eb6db675ab9.json | 32 ++++- .../56703c11-eccb-4f66-af13-60f972a5068f.json | 37 +++++- .../fbd8be7e-5670-4729-a77d-83472510b734.json | 37 +++++- .../2e18ee77-9c46-4cf9-9521-303ad15e5be4.json | 37 +++++- .../ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json | 42 ++++++- .../07b61a55-a8e3-4a6f-9806-a4100f8d5297.json | 37 +++++- .../3d534c25-5016-44de-9c47-24b7d7399b0f.json | 42 ++++++- .../4de91433-05b3-4f88-9d0f-66691c671f62.json | 42 ++++++- 
.../dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json | 37 +++++- .../36c4adc9-c2fb-4bc3-81ba-88478d30332e.json | 37 +++++- .../f0827b15-20d0-4986-b5a0-bb4bc9be768e.json | 42 ++++++- .../aeaa8b33-e327-4c65-9641-5dfc63feee3b.json | 42 ++++++- .../c97c79f3-fd92-49db-9131-5e45834a7eaf.json | 37 +++++- .../687099cb-c1bf-49ec-a902-329c2b818369.json | 42 ++++++- .../8da4f5eb-6264-4503-b9bc-fcf843b638be.json | 37 +++++- .../28a68b87-5412-4374-9e61-896b0fff7669.json | 42 ++++++- .../3209c869-03c5-4801-8e4b-4c8bcde3d58f.json | 37 +++++- .../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json | 32 ++++- .../633d499b-58bd-4fca-9b56-0f005a5a21b8.json | 37 +++++- .../5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json | 37 +++++- .../77d1edc1-fb54-4371-bf7c-baebbb351163.json | 37 +++++- .../e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json | 37 +++++- .../3f3915b3-0d6e-451c-9185-fa4372b93f2b.json | 37 +++++- .../e534d37b-3009-4a7d-82d8-d7c85b95649e.json | 37 +++++- .../bd8f0ed1-75fc-48c1-996e-655d205c027c.json | 37 +++++- .../e9effaf6-e48b-4b35-b035-430be81b316b.json | 42 ++++++- .../d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json | 32 ++++- .../ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json | 32 ++++- .../43f0e93d-f0b8-46af-a549-e1ac315d96ea.json | 42 ++++++- .../9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json | 37 +++++- .../c10d4213-f1fa-41e6-92d9-0d5337c1362b.json | 37 +++++- .../63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json | 37 +++++- .../d724076d-509f-4ad4-894c-976b0472de85.json | 37 +++++- .../54d34f25-1cd9-4995-8e56-c36981842fc8.json | 32 ++++- .../63ae1c75-fd4d-4f40-afd0-b9f91d700014.json | 37 +++++- .../1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json | 32 ++++- .../3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json | 42 ++++++- .../680098fb-76cf-47b6-a0ea-a1a06ca46dca.json | 42 ++++++- .../6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json | 32 ++++- .../592ad1e3-8a48-4c39-8013-81d7c731780f.json | 37 +++++- .../5b36f0af-7ff6-4564-9714-08fbf41d261f.json | 42 ++++++- .../04f120c6-b648-4c83-81d8-05118efb0904.json | 37 +++++- .../c907e494-ab2e-4a28-a28d-aeb68eb818ed.json | 42 ++++++- .../d9eed240-ebbe-482f-8dae-c5251ed6d067.json | 32 ++++- .../670865e1-f219-465b-9fbe-6da6f73ac9e6.json | 37 +++++- .../88953298-b63e-499f-a31e-f0f586c4772d.json | 32 ++++- .../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json | 37 +++++- .../6ad2cb6a-f9a3-424e-aed2-9493899872e3.json | 32 ++++- .../1892bf75-916b-4d4f-96ab-fda36872ae5d.json | 32 ++++- .../e06e1863-c28f-4c96-a672-b1073c80aa71.json | 32 ++++- .../d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json | 32 ++++- .../5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json | 32 ++++- .../59299d8c-e468-490f-8a52-eef49b0aaeea.json | 42 ++++++- .../3ce9612f-9b57-476e-9fa4-6e63f14568a7.json | 32 ++++- .../9c605bf1-2533-43db-a610-e71c0aaecdb5.json | 42 ++++++- .../c289f778-92b8-44df-a079-3bced33c8ab5.json | 32 ++++- .../329d4101-e740-490c-9fbc-1708f76a2f61.json | 32 ++++- .../3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json | 42 ++++++- .../62b9adca-db38-46c0-a68a-ed7a8e735035.json | 32 ++++- .../4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json | 42 ++++++- .../830df3fd-d479-4af8-a92b-93d82e804fec.json | 32 ++++- .../0e6d85b8-aa37-448c-adb2-0da2bd13e322.json | 32 ++++- .../45f0bd9c-e939-4b83-a623-1db61f431500.json | 42 ++++++- .../0f710903-7dd8-44ea-914d-d43bbfe894f1.json | 42 ++++++- .../b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json | 32 ++++- .../25a4520b-c780-45fc-a00f-36db1776c6a8.json | 42 ++++++- .../96d7e5c1-2f43-4f09-9702-0af090afa141.json | 42 ++++++- .../5a47f8bd-401a-4b6b-91b0-9593b36e5996.json | 42 ++++++- .../c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json | 42 ++++++- 
.../060bf847-e7b5-4e30-934f-5306d01c499a.json | 42 ++++++- .../e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json | 42 ++++++- .../537e92cb-25db-47f5-916a-6f666e14639a.json | 42 ++++++- .../e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json | 32 ++++- .../fc99848b-82c7-459e-8327-1867a332ff28.json | 42 ++++++- .../357f4f03-9542-495f-b575-4274111bbe1f.json | 32 ++++- .../d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json | 32 ++++- .../c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json | 27 ++++- .../cc2ac405-1710-46fa-aeba-dd86797c666c.json | 27 ++++- .../49fcb3e2-2883-4c3d-b519-d511c6b10162.json | 27 ++++- .../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json | 27 ++++- .../49029c9e-a831-4219-8e26-df20862ad3e1.json | 27 ++++- .../6dedd117-eab0-4c31-b50b-4890099d9904.json | 27 ++++- .../71c20c06-efb8-428e-9e9d-e4fedf11041a.json | 27 ++++- .../862f3d57-8f5f-4372-b6fb-876fb35efba4.json | 27 ++++- .../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json | 27 ++++- .../c1331fa1-7793-4526-b24b-02261bb4437f.json | 27 ++++- .../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json | 27 ++++- .../cd0452a7-0370-4024-a51f-b3deff290db9.json | 27 ++++- .../6fd85045-d600-451f-8d27-da637add4081.json | 27 ++++- .../a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json | 27 ++++- .../5f43832f-14fa-49e1-a851-949163aec826.json | 27 ++++- .../1f8869e7-e434-469e-906d-d34621582cba.json | 42 ++++++- .../8f9d05db-9bb0-4998-bc75-96dbfa695548.json | 37 +++++- .../2681e475-da0a-48a9-ab68-e0bf59240f90.json | 42 ++++++- .../e2986d78-100d-417a-9f38-9a570a335d95.json | 37 +++++- .../1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json | 37 +++++- .../1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json | 42 ++++++- .../4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json | 42 ++++++- .../daebee0b-3856-4270-94c6-c14bd84f5cf5.json | 37 +++++- .../1be99417-352e-4a94-8108-b43123553667.json | 42 ++++++- .../8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json | 37 +++++- .../9533891f-c2f7-4e82-9f39-131768dbc28a.json | 37 +++++- .../b8a47660-f0a5-4136-a743-979863c53e3a.json | 42 ++++++- .../2673bea2-42eb-42a5-9dc2-13d43341c9b2.json | 42 ++++++- .../6f5555c2-588a-48d1-811c-be53634bbdef.json | 37 +++++- .../9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json | 42 ++++++- .../0519d9fb-f220-40ab-8257-f20ed98a8b47.json | 37 +++++- .../ece70375-447f-41e8-aa03-8f4b26abea73.json | 37 +++++- .../7bbaffdd-f822-48cf-a0f2-e66b16db678d.json | 37 +++++- .../27c5c441-64ce-41dd-8384-f84c8f6ccc14.json | 37 +++++- .../38a14e6a-2094-4e0b-be22-45181ede2a63.json | 37 +++++- .../cee37c2c-2766-47b7-9192-a141e5d22f2d.json | 42 ++++++- .../d1d69392-8717-462d-9ce0-c7ddf5faf97d.json | 42 ++++++- .../72071bb1-57c0-4727-8100-ba24d8da10f5.json | 42 ++++++- .../7626c158-edaf-48f3-9ac3-1188be0c6032.json | 42 ++++++- .../c37be7a8-dc10-4fea-962b-202986a4581e.json | 42 ++++++- .../223dc616-b20f-4065-91a7-3c35bfd11c94.json | 42 ++++++- .../4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json | 42 ++++++- .../c8030a87-0cdf-4918-b0d5-d1fb0e284656.json | 42 ++++++- .../e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json | 42 ++++++- .../64872b1a-1eae-4171-95ec-a80c782b69f0.json | 42 ++++++- .../37484401-c7fe-469d-889a-e70f7cadbf82.json | 42 ++++++- .../8cf36288-3add-4fcd-a012-0df9eae2a059.json | 42 ++++++- .../f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json | 42 ++++++- .../de409ce8-fb68-4113-8879-23712769cbde.json | 42 ++++++- .../264f20d7-1574-448c-8917-eb3f20810819.json | 42 ++++++- .../0ebaec42-9190-4326-95dd-5ecb48bf1a72.json | 42 ++++++- .../29515933-c60b-4686-b475-70ef53d75457.json | 42 ++++++- .../414174a9-7e44-4f7b-94ce-0757639f5af7.json | 42 ++++++- .../48513083-f854-455e-8455-ddbd2698ec03.json | 42 ++++++- 
.../0b373560-854f-4482-81d0-6c984e130144.json | 42 ++++++- .../1a021cab-d569-4077-af5e-1643f45de03d.json | 42 ++++++- .../e26e230d-59b3-4243-a6c4-3845ab74b89b.json | 42 ++++++- .../aa0991d0-9c5e-4f94-bc12-3342ca389e99.json | 42 ++++++- .../397abe47-d5e9-487d-b883-ec49db16c584.json | 42 ++++++- .../82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json | 42 ++++++- .../670382ab-a8a1-43f3-a572-b9a5aeae23ef.json | 42 ++++++- .../a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json | 42 ++++++- .../7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json | 42 ++++++- .../4f164e8b-55a1-498f-b586-cf78da7d0b57.json | 42 ++++++- .../a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json | 42 ++++++- .../7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json | 42 ++++++- .../93398c1f-3129-4be4-83b5-62a4a45c6b84.json | 42 ++++++- .../62493784-f899-4736-bdce-2107ec99a752.json | 42 ++++++- .../9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json | 42 ++++++- .../76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json | 42 ++++++- .../2dc5ab6f-2427-42ae-9582-a0e6139f451a.json | 42 ++++++- .../0db97be6-6562-47d8-bd1a-5b469250e54b.json | 42 ++++++- .../228e4dc4-e517-4023-b690-7f0c321286b2.json | 42 ++++++- .../9442b27c-c94d-41c0-a752-3bd82385272d.json | 42 ++++++- .../561039ac-b156-40eb-bf53-21a275b858ca.json | 42 ++++++- .../d801d700-7b4d-4a62-883b-3d85b05385ea.json | 42 ++++++- .../b8f24058-4441-4d19-898e-80470cc7b685.json | 42 ++++++- .../1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json | 42 ++++++- .../0200a1b3-71f1-4633-96a5-4ca9883a67a7.json | 42 ++++++- .../55479901-aec7-4875-b792-ba73b54aa37a.json | 42 ++++++- .../872597b2-4392-4f23-b5b2-41d418b6cf89.json | 42 ++++++- .../5cb437b5-5993-418d-bd9f-81dea71d9edf.json | 42 ++++++- .../c471cdf7-73f9-48c9-a970-baa66b609093.json | 42 ++++++- .../794a71b4-8a43-4c69-a663-369eea6a84a3.json | 42 ++++++- .../2ad22375-4ed8-4be6-a012-a6f6799581e2.json | 42 ++++++- .../a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json | 42 ++++++- .../ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json | 42 ++++++- .../5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json | 42 ++++++- .../10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json | 42 ++++++- .../a550663c-2a04-4dfb-8663-b177a7181f3d.json | 42 ++++++- .../72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json | 42 ++++++- .../5e41f068-f009-4e32-bac1-9de5220a2ce2.json | 42 ++++++- .../eca1331f-6503-481a-b77b-3d96791f54e8.json | 42 ++++++- .../69def7de-a916-4d23-984b-e676e91e1d8c.json | 42 ++++++- .../679c6e0b-9e0b-4224-b1e3-59df149739a0.json | 42 ++++++- .../2335433d-37c6-47f0-ad3b-5e0a42e9488f.json | 42 ++++++- .../fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json | 42 ++++++- .../70d2697e-0df5-40ae-9268-b906c9cabd9d.json | 42 ++++++- .../0a30fd70-2381-4a4b-89aa-dbd169c856f0.json | 42 ++++++- .../b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json | 42 ++++++- .../bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json | 42 ++++++- .../44b20109-d534-4aa9-867d-fa59935ef6d0.json | 42 ++++++- .../d1196312-4153-4a38-aa46-2940d63d7924.json | 42 ++++++- .../4b1e3070-04ef-47e7-b720-739320194e7b.json | 42 ++++++- .../247f400e-dca8-4dab-bebf-092f778f02c9.json | 42 ++++++- .../d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json | 42 ++++++- .../d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json | 42 ++++++- .../05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json | 42 ++++++- .../a6ef712e-014e-470e-8d5b-f3b51f677aee.json | 42 ++++++- .../35a039ba-06be-4ec2-9bde-a6a6db2eefec.json | 42 ++++++- .../97cb96f8-ce4c-403f-bfbc-386d3c611c81.json | 42 ++++++- .../3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json | 42 ++++++- .../237218ac-4c74-4647-82b1-700360ddfdbd.json | 42 ++++++- .../2858d126-d2ef-4512-8fc8-c39faf24b908.json | 42 ++++++- 
.../d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json | 42 ++++++- .../379ec82f-a6a7-4976-a4a6-ab80cb9da293.json | 42 ++++++- .../c4df42d1-a838-4717-a814-40559fcd7342.json | 42 ++++++- .../f022d826-3252-4def-b37b-3ce44d78f4ce.json | 42 ++++++- .../cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json | 42 ++++++- .../278c2132-3415-48f4-a839-ed09d71e9240.json | 42 ++++++- .../92bbda1a-ecb1-493d-aa39-a29522c1a11e.json | 42 ++++++- .../f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json | 42 ++++++- .../59a98f5d-d017-4b1a-a563-5abd113337e9.json | 42 ++++++- .../a41597ed-fbab-41af-9625-c277ca988546.json | 42 ++++++- .../e311eb59-f217-4bc2-b69b-dcea434797a8.json | 42 ++++++- .../69b037c3-bae2-4889-b10d-e732c45851e9.json | 42 ++++++- .../adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json | 42 ++++++- .../4464d588-62b2-440b-8188-2450bd7a94c5.json | 42 ++++++- .../bf358648-a41d-43ee-8c14-f8b8eef41871.json | 42 ++++++- .../afd99f12-f739-40d3-aa11-ef3a45316931.json | 42 ++++++- .../49b4a24b-ddf1-47f0-ba39-9366892a1213.json | 42 ++++++- .../ea14a487-39c3-488b-b52b-998e57135487.json | 42 ++++++- .../02f74b6a-7f63-484e-a7c1-0c53bd801b87.json | 42 ++++++- .../e492c59d-4b03-4dce-983e-a8724de35a60.json | 42 ++++++- .../53de0394-8516-4882-b2bc-c7e62e3d8ef0.json | 42 ++++++- .../56d4c1c5-5238-45dc-8331-64a14b830779.json | 42 ++++++- .../7003c9d4-c758-4373-a7a3-04822978bf35.json | 42 ++++++- .../75a7dcb6-789c-49de-b209-4cf7d27465e4.json | 42 ++++++- .../e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json | 42 ++++++- .../f18bfd44-3097-4eb8-a09c-2372c3ecd738.json | 42 ++++++- .../9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json | 42 ++++++- .../fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json | 42 ++++++- .../60ba1f0d-7e85-49e4-8c73-330d74de6707.json | 42 ++++++- .../29d1c194-8b87-466c-8701-e0fcf267665c.json | 42 ++++++- .../31e8f616-7b64-4d1a-b395-20bf8bb4629c.json | 42 ++++++- .../cc3f315d-3cea-47e4-83b4-b5045e778c5e.json | 42 ++++++- .../5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json | 42 ++++++- .../06f2cb33-3937-4fde-84e2-6b5467f051c6.json | 42 ++++++- .../f35c4efa-3767-4a0e-8769-06230cda2512.json | 42 ++++++- .../6cb65d6a-6c46-4991-8154-f28b101954f6.json | 42 ++++++- .../6e15a49b-7dc4-4d69-965e-cb962c084e4a.json | 42 ++++++- .../9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json | 42 ++++++- .../b609c002-fa0a-46a8-b5a1-9213ee89606c.json | 42 ++++++- .../b147fc7f-0e31-49ca-abfd-ba990a925097.json | 42 ++++++- .../e4fbfe23-2b70-459e-821b-db0116d43d8c.json | 42 ++++++- .../2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json | 42 ++++++- .../aca2c665-79f2-4226-b806-307be277ed08.json | 42 ++++++- .../d37a63df-6d38-4083-bf87-11064162efde.json | 42 ++++++- .../16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json | 42 ++++++- .../47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json | 37 +++++- .../7199c8b3-8346-4200-b07e-4362ad13a7db.json | 37 +++++- .../de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json | 37 +++++- .../17e011c3-1a53-40ae-b7b4-cb24c23df3de.json | 37 +++++- .../1125dd05-2f0d-48ca-825c-f5efa18564aa.json | 37 +++++- .../88014e0d-e89b-4fed-9eb6-5276bd7658df.json | 37 +++++- .../7cc9bfc2-570d-456c-918f-68fd4b711f05.json | 37 +++++- .../77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json | 37 +++++- .../ba0ce7ce-a755-4337-bfec-0391680d3625.json | 32 ++++- .../4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json | 32 ++++- .../6868a1e5-ee86-4f89-8452-5e939ac19169.json | 32 ++++- .../4a151d43-5fac-4afe-9c23-ba0e86a60849.json | 32 ++++- .../5f16d574-adef-4016-abcf-9e7936771ba7.json | 32 ++++- .../f3e0300f-39ed-4cfd-bd03-218904836037.json | 37 +++++- .../42c82c00-b74e-4152-a222-15d481a13e0c.json | 42 ++++++- 
.../68096be8-c49f-4a23-824e-1275248369f7.json | 32 ++++- .../c91270bd-3731-452a-b429-6cd4943d1194.json | 32 ++++- .../337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json | 42 ++++++- .../3b00f881-8f73-4608-8cbb-846fe7d1cfea.json | 42 ++++++- .../2821dfdc-291b-405e-bd81-cf536c802885.json | 42 ++++++- .../7d441240-7e85-4776-b51c-3c1bc84456ba.json | 42 ++++++- .../840d35d9-441e-4ba3-bbc3-1f4ff2627517.json | 32 ++++- .../0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json | 37 +++++- .../b72e2988-75e4-4d26-9a47-daae4786b02f.json | 42 ++++++- .../643cf5a3-8992-4126-87c9-814887314266.json | 42 ++++++- .../f81f1f67-6506-481f-87ce-a17a6a7578f3.json | 32 ++++- .../32b35218-a099-410e-8a65-a0d6e2f380a6.json | 42 ++++++- .../deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json | 32 ++++- .../e42a9986-4dcc-4017-be97-8135646c7424.json | 32 ++++- .../ffc92063-606a-4f31-bfdd-5683aa748ccc.json | 42 ++++++- .../23a5398c-0911-4a66-930d-abada12bf985.json | 42 ++++++- .../80b0bbcb-a57a-453c-8fff-502646520b1d.json | 32 ++++- .../e383c939-b952-4fdd-94e3-eb3716691860.json | 37 +++++- .../daf873f9-ab03-49df-96cb-a0f5a8613048.json | 37 +++++- .../f4cff132-3b2f-4e03-bb49-098b16d87cef.json | 32 ++++- .../f80685de-058c-4ab8-aa35-dc7321d1cea6.json | 37 +++++- .../c8e4349d-a084-4eb5-990f-403ba930a9ad.json | 37 +++++- .../729ca9c0-0680-49f1-97b9-5581be17a352.json | 32 ++++- .../fdd4add5-b44d-46f9-8c98-da3120df4161.json | 32 ++++- .../6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json | 32 ++++- .../95271b8c-4135-48bf-bbad-ae94baa37640.json | 32 ++++- .../f437e790-efe1-4dc5-8ccc-5b0bfd800069.json | 32 ++++- .../7d0f761a-2650-4029-b1e9-13af2f0cc69d.json | 37 +++++- .../49fc601e-4ac6-4672-a53d-0e89f19959c1.json | 37 +++++- .../6195e81a-d5a5-40af-96f6-259252009ad7.json | 32 ++++- .../2dec0f50-d374-4af3-9d27-80fcf50dac2c.json | 42 ++++++- .../96722888-0cc9-4dfd-b38d-91f4118c0be2.json | 32 ++++- .../683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json | 32 ++++- .../121344ec-61ef-49c5-a74b-b86f605d513e.json | 42 ++++++- .../8594f86b-a7f2-4046-a3a7-830d7ac20690.json | 32 ++++- .../c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json | 32 ++++- .../0411ac30-1536-4639-8350-fc11d53298e3.json | 32 ++++- .../92281e58-4160-4d76-9119-b38fb47ffd8f.json | 32 ++++- .../43687871-2e19-4d2b-9754-1cb6527496c1.json | 32 ++++- .../1debe1de-b394-4856-a946-9d14bd867bf6.json | 37 +++++- .../80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json | 37 +++++- .../62478772-bb85-4d3f-a916-c3d17db3ee61.json | 37 +++++- .../a070bae2-c927-418b-91cc-161781c4f5b7.json | 42 ++++++- .../b884c919-a272-4f67-9a09-3d232f56d083.json | 42 ++++++- .../deac33dd-187b-4406-a76a-b33caf417380.json | 42 ++++++- .../185bd742-d7d4-4600-86bd-bcda75ed2ebc.json | 37 +++++- .../901e4de6-3ef6-4c2a-873c-cdcc47201974.json | 32 ++++- .../a051d5d6-18e6-483d-a000-4a52a06de676.json | 42 ++++++- .../94d77182-8952-4a63-b02b-3d8bd8a8dead.json | 32 ++++- .../9a48d808-0280-4175-a28a-7e9ba8ac6deb.json | 42 ++++++- .../f0d9f57d-d552-44ea-a91c-751854133316.json | 37 +++++- .../561cfba1-856d-4809-b5c7-41481735e1d6.json | 42 ++++++- .../995d1caf-b735-44dd-adff-875e3203aa46.json | 37 +++++- .../81767043-23c2-4229-b3b5-1c24e470d52a.json | 37 +++++- .../4f6344bc-af30-46f9-b6f8-41ff925d064e.json | 37 +++++- .../abac8640-40be-4eb5-9035-2bf6fd436a7a.json | 42 ++++++- .../6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json | 32 ++++- .../8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json | 32 ++++- .../5ad53725-ed5a-41f3-8ff6-7404f3f981db.json | 32 ++++- .../ae2d05b4-5e80-4b00-af67-b94609b073eb.json | 32 ++++- .../592f2811-c197-423e-89d4-e25ee5a324fb.json | 32 ++++- 
.../17795e7b-e912-440f-a80e-63233d3b6d8c.json | 32 ++++- .../375cf55f-64f6-42f6-a947-1487feffb196.json | 37 +++++- .../94d2eddd-f7db-4360-ac58-0af39ce66935.json | 42 ++++++- .../996ca604-e01c-4a95-9286-60b6dc04f67d.json | 37 +++++- .../b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json | 37 +++++- .../83e15cba-4fec-48f2-9be4-78decbd96f66.json | 37 +++++- .../493617c0-37eb-4c83-b175-2507a3647b5e.json | 37 +++++- .../97f494ce-3c9c-4a19-a237-d458be611a0a.json | 37 +++++- .../f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json | 37 +++++- .../5bf73fba-520f-4a2f-9296-8240847eb8ec.json | 37 +++++- .../3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json | 37 +++++- .../ef987556-7277-48d8-ac07-532586773a3a.json | 32 ++++- .../add7eddb-7a8b-4c78-9864-c4316a97ce5e.json | 37 +++++- .../caf02954-1eed-44eb-b5f4-df47c90828d7.json | 37 +++++- .../00798930-daa2-4e79-82c6-2cccf1c3a0cb.json | 37 +++++- .../71658cf8-0189-49dc-847f-b9a9b5faee4a.json | 42 ++++++- .../3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json | 37 +++++- .../04c71231-2025-4e1a-b7ed-56b245868089.json | 37 +++++- .../08b2edd0-f8e9-47cd-b19d-53fdc7209917.json | 42 ++++++- .../79a43841-4032-4a20-8b5a-83b4b446d107.json | 37 +++++- .../a2c16ab8-1098-490a-8d0a-392d835427e0.json | 42 ++++++- .../0aa12860-7ebe-49c2-a5af-1926d23e34f8.json | 37 +++++- .../796d3ec1-9c26-4ead-87cb-4eb866209120.json | 42 ++++++- scripts/rewardbench/adapter.py | 24 +++- scripts/rewardbench/migrate_to_v020.py | 113 ++++++++++++++++++ scripts/utils/__init__.py | 8 ++ scripts/utils/schema.py | 97 +++++++++++++-- 388 files changed, 13016 insertions(+), 1934 deletions(-) create mode 100644 scripts/rewardbench/migrate_to_v020.py diff --git a/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json b/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json index a27298ad0..cef912137 100644 --- a/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json +++ b/data/reward-bench/0-hero/Matter-0.1-7B-DPO-preview/623bae1f-19e9-47f9-bc7b-80a859218d07.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/0-hero_Matter-0.1-7B-DPO-preview/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7247 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8939 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5768 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6378 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8854 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5348 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json b/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json index c4e185274..2e9c3f43d 100644 --- a/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json +++ b/data/reward-bench/0-hero/Matter-0.1-7B-boost-DPO-preview/fbba98c5-5d56-4837-9044-d4e5ac610c2c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/0-hero_Matter-0.1-7B-boost-DPO-preview/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7448 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9106 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6096 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7135 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8395 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5566 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json b/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json index b43b13f02..89456cf7f 100644 --- a/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json +++ b/data/reward-bench/Ahjeong/MMPO_Gemma_7b/dc6e1164-c9d7-4dd5-b8dc-fbc4e3f45011.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ahjeong_MMPO_Gemma_7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7587 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.614 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7135 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ 
-84,6 +101,11 @@ }, "score_details": { "score": 0.7756 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6831 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json b/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json index 0e64d13ad..f147a68de 100644 --- a/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json +++ b/data/reward-bench/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3/c62a913b-3101-4ce3-a5c5-a1ac844e55f8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ahjeong_MMPO_Gemma_7b_gamma1.1_epoch3/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7652 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9721 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6338 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7635 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7284 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6913 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json index 619ba0c5a..9aaa4ec32 100644 --- a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json +++ b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/3101726d-fd51-436d-8adf-cbdf0d534834.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/anthropic_claude-3-5-sonnet-20240620/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6466 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5683 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8519 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8697 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json index e8c404a58..4e001bb6c 100644 --- a/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json +++ b/data/reward-bench/Anthropic/claude-3-5-sonnet-20240620/f878a52a-fa80-4113-ae7d-0cb11e3ef9fd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Anthropic_claude-3-5-sonnet-20240620/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8417 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7401 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json b/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json index 74591cc2b..47b1297ca 100644 --- a/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json +++ b/data/reward-bench/Anthropic/claude-3-7-sonnet-20250219/904c6359-bd7b-4448-9f16-bc115d0629c4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/anthropic_claude-3-7-sonnet-20250219/1766412838.146816", "retrieved_timestamp": 
"1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7539 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.75 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9033 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9212 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6723 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json b/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json index 1e5b98c04..6e0f3e1b3 100644 --- a/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json +++ b/data/reward-bench/Anthropic/claude-3-haiku-20240307/49511052-6881-4151-9b46-686c75f73c22.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/anthropic_claude-3-haiku-20240307/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.3711 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4042 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3552 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.595 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.501 + }, + "source_data": { + "dataset_name": 
"RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.0899 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json b/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json index 03e1c4f0a..16656cf8a 100644 --- a/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json +++ b/data/reward-bench/Anthropic/claude-3-haiku-20240307/b289e2e6-d57b-4a2b-aa61-e2974d193909.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Anthropic_claude-3-haiku-20240307/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7289 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9274 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5197 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7953 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.706 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6635 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json b/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json index 551660420..dd51285f1 100644 --- a/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json +++ b/data/reward-bench/Anthropic/claude-3-opus-20240229/aeeca919-71a1-42a0-a6d0-6779d77750e6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Anthropic_claude-3-opus-20240229/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8008 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6031 + }, + 
"source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8662 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.7868 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json b/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json index 6a4ce0e30..1c912c7c4 100644 --- a/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json +++ b/data/reward-bench/Anthropic/claude-3-opus-20240229/db29538d-f40e-42d0-b3c0-e622f92112d2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/anthropic_claude-3-opus-20240229/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5744 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5389 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5137 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8378 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6646 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5601 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json b/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json index b3e532d32..3721b0f48 100644 --- a/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json +++ b/data/reward-bench/Anthropic/claude-3-sonnet-20240229/ab0cdc4f-47dd-4dcc-b506-982ce3924105.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Anthropic_claude-3-sonnet-20240229/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": 
"RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7458 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9344 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5658 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8169 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6907 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6963 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json b/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json index cc0ce43c9..8dffb9139 100644 --- a/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json +++ b/data/reward-bench/Anthropic/claude-opus-4-20250514/44da63b6-d934-4330-bc20-33464bae61dd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/anthropic_claude-opus-4-20250514/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7648 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8267 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4188 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7491 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8954 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8616 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json b/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json index 7029d7203..45d756c32 100644 --- a/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json +++ b/data/reward-bench/Anthropic/claude-sonnet-4-20250514/c930cbe0-f429-4b61-9abe-86dcb7266cf7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/anthropic_claude-sonnet-4-20250514/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7117 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3594 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7049 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8909 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7939 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json b/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json index 3b18998ef..e1c13041b 100644 --- a/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json +++ b/data/reward-bench/AtlaAI/Selene-1-Mini-Llama-3.1-8B/c84b27b2-2dd9-48ee-9a53-ec27ae62ae7a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/AtlaAI_Selene-1-Mini-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8913 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9358 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7939 + }, + "source_data": { + "dataset_name": "RewardBench", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8926 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9429 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json b/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json index 0b508fc12..e90407d26 100644 --- a/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json +++ b/data/reward-bench/AtlaAI/Selene-1/73ee9408-e669-4b8a-9419-76bd6051ce8d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/AtlaAI_Selene-1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9241 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9777 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8399 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9216 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9572 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json index 606de4ddf..dd6dc0bf7 100644 --- a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json +++ b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/0deed2f4-770e-4033-a65d-e1da19e00611.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5736 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5347 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, 
{ @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5737 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6527 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json index c8ff53cce..7179fab0e 100644 --- a/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json +++ b/data/reward-bench/CIR-AMS/BTRM_Qwen2_7b_0613/e727cb77-f229-4aaa-909f-99c7aa06676b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/CIR-AMS_BTRM_Qwen2_7b_0613/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8172 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9749 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5724 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9014 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8775 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7029 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json b/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json index 4b71e3b45..10fb8ed6c 100644 --- a/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json +++ b/data/reward-bench/CohereForAI/c4ai-command-r-plus/da9264cd-2fa3-4121-81de-eef994e15993.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/CohereForAI_c4ai-command-r-plus/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": 
"documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7057 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9511 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5986 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.704 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6924 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json b/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json index 40b37fe16..ff7ea07a0 100644 --- a/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json +++ b/data/reward-bench/ContextualAI/LMUnit-llama3.1-70b/79cc5cd4-bfed-466d-9fbe-2f27e8aab175.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/ContextualAI_LMUnit-llama3.1-70b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8054 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8463 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7158 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9067 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9697 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.9063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json 
b/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json index 64bdcfef0..8597afb51 100644 --- a/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json +++ b/data/reward-bench/ContextualAI/LMUnit-qwen2.5-72b/28c35831-679d-489a-b2c4-fd2c7f333fbc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/ContextualAI_LMUnit-qwen2.5-72b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8208 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8716 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7268 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9133 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9677 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.9014 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json index ab96244b5..4aa411ea6 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama13b/9db7907d-7b22-480c-86a5-f88ec2b302e7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.54 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7123 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4298 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 
+83,11 @@ }, "score_details": { "score": 0.5649 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4401 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5656 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json index 215ac5202..fefd98c33 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama30b/2faddf79-41e6-47e9-9c26-17bc987bc870.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama30b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5618 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6927 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4474 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4745 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5705 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json index f40114fab..69dc72884 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_llama7b/20989a47-6556-4e3b-8909-d0a419cb159b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_llama7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5304 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5782 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4452 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5203 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5658 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5544 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json index ca3483447..c640a7456 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia1-4b/f3d0010f-efed-4f87-9582-b9c87b4de99a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia1-4b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5233 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6397 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3728 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5041 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5672 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5427 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json index 24d3c5885..ed72e23b9 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia12-0b/a0ce3ed6-2a2c-46ad-be86-6f6701533e36.json @@ -1,10 +1,7 @@ { - "schema_version": 
"0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia12-0b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5009 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6676 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.364 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5432 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4139 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5303 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json index d90b44daa..10908a053 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia2-8b/d54c4830-23c8-4c12-aea1-4f5b5245464f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia2-8b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5286 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8073 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3355 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4473 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5135 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5501 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json index 998861312..40f3a091a 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json +++ b/data/reward-bench/ContextualAI/archangel_sft-dpo_pythia6-9b/b5853278-edd9-4bc8-bbeb-d6dab515b562.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-dpo_pythia6-9b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5263 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7486 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3421 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5176 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4847 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.551 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json index 3ce4e1d0e..22b4b63bd 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_llama13b/74188e30-1e49-47d8-af01-b80e430dafa0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5952 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8408 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3772 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4649 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7077 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.576 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json index 03921df15..ca6ff0f55 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_llama30b/93974286-0497-46a2-a2e8-404c1e89dba0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama30b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5901 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8436 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4057 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6054 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5075 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5862 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json b/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json index 556053c1d..ac5acffb2 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_llama7b/02c0020c-7d69-4701-a606-4bc79ad87afd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_llama7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5388 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5587 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4364 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4568 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6941 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5575 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json index 04509bcea..36044c710 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia1-4b/5dcb7c54-64e7-4f76-8903-8f57b35cdb0c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia1-4b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6844 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3794 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5257 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6447 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5546 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json index 07e49a4d2..16ed21233 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia12-0b/4887256e-0545-40dd-9756-ff850e003a29.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia12-0b/1766412838.146816", "retrieved_timestamp": 
"1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5053 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7486 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3618 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4127 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.55 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json index fc9b9fbf2..4c1047aa1 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia2-8b/d2b70870-9cbc-4666-bbd4-097fcebe716e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia2-8b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3421 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4743 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6216 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.557 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json 
b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json index 132d06441..521c30c11 100644 --- a/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json +++ b/data/reward-bench/ContextualAI/archangel_sft-kto_pythia6-9b/f420f432-2291-40a9-8ebd-b91241970113.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ContextualAI_archangel_sft-kto_pythia6-9b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5561 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7765 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3618 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5365 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5415 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5723 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json b/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json index 4e96fe9f0..5d22f5c2e 100644 --- a/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json +++ b/data/reward-bench/Databricks-Mosaic-Research/PGRM/02e68d1b-86f3-4344-ad8d-45df878b744c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Databricks-Mosaic-Research_PGRM/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8002 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7404 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 
0.9289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9424 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8893 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json b/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json index 15402671e..7ccfc23f1 100644 --- a/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json +++ b/data/reward-bench/HFXM/RAMO-Llama3.1-8B/f712ab4a-1127-44ba-b6b9-7a40290f3322.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/HFXM_RAMO-Llama3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6917 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6547 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5628 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9756 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9071 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6752 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json b/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json index 4d710efce..a6b1abca8 100644 --- a/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json +++ b/data/reward-bench/HuggingFaceH4/starchat2-15b-v0.1/b4175f0f-f9f4-4418-b4aa-a31e7f1f93f4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/HuggingFaceH4_starchat2-15b-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", 
"source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7322 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9385 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5548 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7095 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8159 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json index 86a8b8344..b313fb87e 100644 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json +++ b/data/reward-bench/HuggingFaceH4/zephyr-7b-alpha/9879e9a7-ddbc-4338-abc7-e3bc394869e9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-alpha/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7392 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.625 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7662 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7514 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5353 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json index 36c56448c..7d0709109 100644 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json +++ 
b/data/reward-bench/HuggingFaceH4/zephyr-7b-beta/d7d8a5cb-e295-4ced-b528-d99d814ff008.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-beta/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7281 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6272 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6568 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7789 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5216 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json b/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json index ea611719b..89a96432c 100644 --- a/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json +++ b/data/reward-bench/HuggingFaceH4/zephyr-7b-gemma-v0.1/bff86a1f-71c3-4f27-aeae-bba6d03635ef.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/HuggingFaceH4_zephyr-7b-gemma-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6758 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4956 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5824 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7463 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5171 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } 
] -} \ No newline at end of file +} diff --git a/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json b/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json index e8bc23b6a..73199f618 100644 --- a/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json +++ b/data/reward-bench/IDEA-CCNL/Ziya-LLaMA-7B-Reward/723281f8-54b7-4db6-8253-5a6dcf4f3d4a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/IDEA-CCNL_Ziya-LLaMA-7B-Reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6378 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8687 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4605 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6405 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5775 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6461 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json b/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json index 9f7301f3d..2f1093e30 100644 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json +++ b/data/reward-bench/LxzGordon/URM-LLaMa-3-8B/0ce7dc54-f608-4985-9904-75cee09b6288.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8991 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7873 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8824 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 
0.9574 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json index b34ec8f07..d56005468 100644 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json +++ b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/5bb0aaa4-2cc5-4622-8235-993bc4178f12.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9294 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9553 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8816 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9108 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9698 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json index 7ce19f3b5..a57e5caa3 100644 --- a/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json +++ b/data/reward-bench/LxzGordon/URM-LLaMa-3.1-8B/85ab22b8-0587-4e2b-857f-3d6d84d571a4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/LxzGordon_URM-LLaMa-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7394 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6884 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.45 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6393 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 
0.9178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9758 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7653 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json index d03295a6a..a9c1eb53c 100644 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json +++ b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-8B/37aa6702-b2fa-43bf-b5a9-36740f627217.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/NCSOFT_Llama-3-OffsetBias-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8397 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9246 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8026 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8676 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.7639 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json index ab1c474bd..f38bf29f7 100644 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json +++ b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/57f48d0c-e424-410d-b9ee-4707e2add036.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/NCSOFT_Llama-3-OffsetBias-RM-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8942 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9721 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { 
"score": 0.818 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8676 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9192 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json index fd43b8778..ec0f3756c 100644 --- a/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json +++ b/data/reward-bench/NCSOFT/Llama-3-OffsetBias-RM-8B/8643b4dd-e18c-442c-adb5-84ef756534f8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/NCSOFT_Llama-3-OffsetBias-RM-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.648 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6084 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5191 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6786 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json b/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json index 8e269f55a..3b7921590 100644 --- a/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json +++ b/data/reward-bench/Nexusflow/Starling-RM-34B/2f3d2e46-1f9e-4b1c-9729-ab0a93cc245c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Nexusflow_Starling-RM-34B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": 
"documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8133 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5724 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.877 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8845 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7137 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json b/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json index 6a782cd05..a2c665080 100644 --- a/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json +++ b/data/reward-bench/Nexusflow/Starling-RM-34B/4aec78d3-a38c-48e0-b9e2-b6dc063bd37e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Nexusflow_Starling-RM-34B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4553 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4589 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3187 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7556 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4808 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.1004 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json 
b/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json index 1bd4f9222..734675894 100644 --- a/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json +++ b/data/reward-bench/NousResearch/Hermes-3-Llama-3.1-70B/f9b60945-8b14-4564-9d44-3eb6db675ab9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/NousResearch_Hermes-3-Llama-3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7847 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9623 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5669 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.823 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.7867 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json b/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json index 28eb5c34d..0e770043c 100644 --- a/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json +++ b/data/reward-bench/NousResearch/Nous-Hermes-2-Mistral-7B-DPO/56703c11-eccb-4f66-af13-60f972a5068f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/NousResearch_Nous-Hermes-2-Mistral-7B-DPO/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7481 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9218 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6053 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8243 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7375 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.555 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json b/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json index f190b81a1..99623d8ff 100644 --- a/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json +++ b/data/reward-bench/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO/fbd8be7e-5670-4729-a77d-83472510b734.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/NousResearch_Nous-Hermes-2-Mixtral-8x7B-DPO/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7138 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6053 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8149 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6126 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5266 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json index 4c1ad8e64..31e05eb58 100644 --- a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json +++ b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/2e18ee77-9c46-4cf9-9521-303ad15e5be4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.615 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9246 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3728 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5446 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5855 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6801 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json index 4d37887c0..dfed08cd8 100644 --- a/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json +++ b/data/reward-bench/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1/ec5b296e-03e8-4371-a8c1-eca0b0b9759d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2-pythia-6.9b-epoch-1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.2653 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3979 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.377 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.1535 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.047 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json index 23d7514d9..85e007109 100644 --- a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json +++ b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/07b61a55-a8e3-4a6f-9806-a4100f8d5297.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6901 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8855 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4868 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6311 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7752 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6533 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json index bad24200c..38eca68e4 100644 --- a/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json +++ b/data/reward-bench/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5/3d534c25-5016-44de-9c47-24b7d7399b0f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/OpenAssistant_oasst-rm-2.1-pythia-1.4b-epoch-2.5/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.2648 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3179 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3934 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3244 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2707 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ 
-110,7 +137,12 @@ }, "score_details": { "score": 0.0198 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json index 88c6cf6ea..365667594 100644 --- a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json +++ b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/4de91433-05b3-4f88-9d0f-66691c671f62.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.32 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3853 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2687 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5027 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2768 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.12 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json index a8597cef2..4712b7f92 100644 --- a/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json +++ b/data/reward-bench/OpenAssistant/reward-model-deberta-v3-large-v2/dc71f1ba-f4b8-4231-ac72-0acf9a22d73e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/OpenAssistant_reward-model-deberta-v3-large-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6126 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8939 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4518 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7338 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3855 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5836 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json index 77c839a13..ae927c59e 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/36c4adc9-c2fb-4bc3-81ba-88478d30332e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5798 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6173 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4232 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7351 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5482 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.57 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json index 1d30468c7..9e4c95ee2 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-cost/f0827b15-20d0-4986-b5a0-bb4bc9be768e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/PKU-Alignment_beaver-7b-v1.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.3332 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2313 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3989 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7589 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2939 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": -0.01 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json index 36bcdfdc9..fc41926be 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/aeaa8b33-e327-4c65-9641-5dfc63feee3b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.1606 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.2105 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2938 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.2623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.1422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } 
}, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.0646 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": -0.01 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json index 2100e18c1..d06d08a3c 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v1.0-reward/c97c79f3-fd92-49db-9131-5e45834a7eaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v1.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4727 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8184 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2873 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.346 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5993 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json index e0c222d4d..3868ab64e 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/687099cb-c1bf-49ec-a902-329c2b818369.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.3326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3789 + }, + "source_data": { + "dataset_name": "RewardBench 2", 
+ "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.275 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7356 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2828 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": -0.01 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json index 5908eca1e..4af7bc7a5 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-cost/8da4f5eb-6264-4503-b9bc-fcf843b638be.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-cost/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5957 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5726 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4561 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7608 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6211 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5397 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json index 45166e401..376b686c0 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/28a68b87-5412-4374-9e61-896b0fff7669.json @@ -1,10 +1,7 @@ { - 
"schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.2544 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.2168 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2562 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3825 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3156 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2606 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.0944 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json index 39d9d8ddf..49fb24c89 100644 --- a/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json +++ b/data/reward-bench/PKU-Alignment/beaver-7b-v2.0-reward/3209c869-03c5-4801-8e4b-4c8bcde3d58f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/PKU-Alignment_beaver-7b-v2.0-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6366 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8994 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.364 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6041 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6887 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, 
{ @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6171 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json b/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json index 88bc18eb8..f9da09026 100644 --- a/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json +++ b/data/reward-bench/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../9d1e124c-e133-41d3-8ac7-5c8c5027aa02.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/PoLL_gpt-3.5-turbo-0125_claude-3-sonnet-2024022.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7578 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5406 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8034 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.7346 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json b/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json index 41b56ed43..8753b5ea8 100644 --- a/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json +++ b/data/reward-bench/Qwen/Qwen1.5-0.5B-Chat/633d499b-58bd-4fca-9b56-0f005a5a21b8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-0.5B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5298 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3547 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6294 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5703 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + 
"hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5984 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4629 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json b/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json index ef965fde9..48dfa65fd 100644 --- a/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json +++ b/data/reward-bench/Qwen/Qwen1.5-1.8B-Chat/5c4f3caf-6af3-48c6-83e2-4710d31e6acf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-1.8B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.589 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5615 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6031 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4838 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7793 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4453 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json b/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json index d5fc3341b..f34eee3d4 100644 --- a/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json +++ b/data/reward-bench/Qwen/Qwen1.5-14B-Chat/77d1edc1-fb54-4371-bf7c-baebbb351163.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-14B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6864 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5726 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7018 + }, + 
"source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7122 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8961 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4123 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json b/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json index ce4ff66dd..85d507824 100644 --- a/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json +++ b/data/reward-bench/Qwen/Qwen1.5-4B-Chat/e7eecdb0-bc17-4d9f-b3e8-9ee777d2f595.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-4B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5477 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3883 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6272 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5568 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6689 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.447 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json b/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json index c36fc9a9f..f3cc894c3 100644 --- a/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json +++ b/data/reward-bench/Qwen/Qwen1.5-72B-Chat/3f3915b3-0d6e-451c-9185-fa4372b93f2b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-72B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6723 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } 
}, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6229 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6601 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8554 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4226 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json b/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json index 62722eeef..2373972cd 100644 --- a/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json +++ b/data/reward-bench/Qwen/Qwen1.5-7B-Chat/e534d37b-3009-4a7d-82d8-d7c85b95649e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-7B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.675 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5363 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6908 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6919 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9041 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4288 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json b/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json index b694ee08b..7daa3735e 100644 --- a/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json +++ b/data/reward-bench/Qwen/Qwen1.5-MoE-A2.7B-Chat/bd8f0ed1-75fc-48c1-996e-655d205c027c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Qwen_Qwen1.5-MoE-A2.7B-Chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - 
"https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6644 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7291 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6316 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.774 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4536 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json b/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json index 55bff16c3..6ee54b6e7 100644 --- a/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json +++ b/data/reward-bench/Qwen/WorldPM-72B/e9effaf6-e48b-4b35-b035-430be81b316b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Qwen_WorldPM-72B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7074 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6557 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8533 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9172 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3535 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json index 5cad58eb3..d48e6bfec 100644 --- a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json +++ b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-32B/d2132eea-eb88-41e5-b8e6-2e8e8a623ed1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/R-I-S-E_RISE-Judge-Qwen2.5-32B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9266 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8333 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9189 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9877 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json index ac71f5f5f..2418db79b 100644 --- a/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json +++ b/data/reward-bench/R-I-S-E/RISE-Judge-Qwen2.5-7B/ffd05bc7-3724-40ba-85b9-c25ebe71fba2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/R-I-S-E_RISE-Judge-Qwen2.5-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8819 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9218 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7654 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8797 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9608 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json index 4788e3bbf..7370b5a14 100644 --- a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json +++ b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/43f0e93d-f0b8-46af-a549-e1ac315d96ea.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/RLHFlow_ArmoRM-Llama3-8B-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6646 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6568 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4188 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7657 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6629 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json index e46af9896..41532e2cf 100644 --- a/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json +++ b/data/reward-bench/RLHFlow/ArmoRM-Llama3-8B-v0.1/9ccab7bd-d2ed-4ab3-ad81-656650c29a3b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/RLHFlow_ArmoRM-Llama3-8B-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.886 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7675 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9054 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9735 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7429 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json b/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json index 4f2ac0d9d..e8c6cfc5b 100644 --- a/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json +++ b/data/reward-bench/RLHFlow/LLaMA3-iterative-DPO-final/c10d4213-f1fa-41e6-92d9-0d5337c1362b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/RLHFlow_LLaMA3-iterative-DPO-final/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6783 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.838 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5921 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7865 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6161 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4392 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json b/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json index 271dde1c8..35d7a9ebf 100644 --- a/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json +++ b/data/reward-bench/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1/63b08ba0-eeb9-48ae-a5d1-d7d3792aa1c0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/RLHFlow_RewardModel-Mistral-7B-for-DPA-v1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6633 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8799 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4978 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7068 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5971 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6068 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json b/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json index db8b6e695..ee0769a51 100644 --- a/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json +++ b/data/reward-bench/RLHFlow/pair-preference-model-LLaMA3-8B/d724076d-509f-4ad4-894c-976b0472de85.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/RLHFlow_pair-preference-model-LLaMA3-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8575 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9832 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6579 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8973 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9473 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7458 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json b/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json index ea2e28411..05c7b0ff2 100644 --- a/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json +++ b/data/reward-bench/Ray2333/GRM-Gemma-2B-rewardmodel-ft/54d34f25-1cd9-4995-8e56-c36981842fc8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", 
+ "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-Gemma-2B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8447 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8939 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7522 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8446 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8881 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json b/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json index dedd0ad29..8a5814b51 100644 --- a/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json +++ b/data/reward-bench/Ray2333/GRM-Gemma-2B-sftreg/63ae1c75-fd4d-4f40-afd0-b9f91d700014.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-Gemma-2B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7451 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9553 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4868 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7932 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7684 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6983 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json index 61a889811..ae24803ba 100644 --- 
a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json +++ b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/1d5ebbce-8cfe-446b-82c0-a227d4e9247f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-Llama3-8B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9154 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9553 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8618 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9081 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9362 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json index 1e19c6890..2f035232b 100644 --- a/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json +++ b/data/reward-bench/Ray2333/GRM-Llama3-8B-rewardmodel-ft/3f9c81ac-5c76-43b4-a27d-7eaa055139c4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Ray2333_GRM-Llama3-8B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6766 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6274 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5847 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8929 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6824 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json index 1f04a08e6..fa7e8dccb 100644 --- a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json +++ b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/680098fb-76cf-47b6-a0ea-a1a06ca46dca.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5966 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5305 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7455 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4788 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json index 03ffaccd8..79c0a560c 100644 --- a/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json +++ b/data/reward-bench/Ray2333/GRM-gemma2-2B-rewardmodel-ft/6ec21338-9908-4ce4-a1f2-dac14c5e27ab.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-gemma2-2B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8839 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, 
{ @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9302 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7719 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9216 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.912 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json index 499d8ac36..3c94abda8 100644 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json +++ b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/592ad1e3-8a48-4c39-8013-81d7c731780f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-distill/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8464 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9832 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6842 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8676 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9133 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7209 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json index 79518e254..7a518a591 100644 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json +++ b/data/reward-bench/Ray2333/GRM-llama3-8B-distill/5b36f0af-7ff6-4564-9714-08fbf41d261f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-distill/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ 
}, "score_details": { "score": 0.589 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5874 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6727 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5743 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json index 65a99ac02..fd63fed4b 100644 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json +++ b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/04f120c6-b648-4c83-81d8-05118efb0904.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8542 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.986 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6776 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8919 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9229 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7309 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json 
index 4e9d051e4..c42486675 100644 --- a/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json +++ b/data/reward-bench/Ray2333/GRM-llama3-8B-sftreg/c907e494-ab2e-4a28-a28d-aeb68eb818ed.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Ray2333_GRM-llama3-8B-sftreg/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6089 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6189 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7867 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6828 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5981 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json b/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json index 394d5ae65..ef4f104a0 100644 --- a/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json +++ b/data/reward-bench/Ray2333/GRM-llama3.2-3B-rewardmodel-ft/d9eed240-ebbe-482f-8dae-c5251ed6d067.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_GRM-llama3.2-3B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9092 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8487 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.927 + }, + "source_data": { + "dataset_name": "RewardBench", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.945 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json index 94d947a18..429660d5b 100644 --- a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json +++ b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-baseline/670865e1-f219-465b-9fbe-6da6f73ac9e6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_Gemma-2B-rewardmodel-baseline/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.729 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9413 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7865 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7384 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6897 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json index ca7b25606..f7eece540 100644 --- a/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json +++ b/data/reward-bench/Ray2333/Gemma-2B-rewardmodel-ft/88953298-b63e-499f-a31e-f0f586c4772d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_Gemma-2B-rewardmodel-ft/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8048 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7793 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7478 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8527 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8393 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json b/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json index 17c691fe8..9a4e578d4 100644 --- a/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json +++ b/data/reward-bench/Ray2333/reward-model-Mistral-7B-instruct-Unifie.../3acb690c-ffc0-4e67-8ae1-e79bcee4f824.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Ray2333_reward-model-Mistral-7B-instruct-Unifie.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7661 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9777 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5066 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8527 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7389 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7434 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json b/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json index cac72f2a0..c50e15fdc 100644 --- a/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json +++ b/data/reward-bench/SF-Foundation/TextEval-Llama3.1-70B/6ad2cb6a-f9a3-424e-aed2-9493899872e3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/SF-Foundation_TextEval-Llama3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9348 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { 
@@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9413 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.9013 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9324 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9641 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json b/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json index 5fca94bb9..b71080064 100644 --- a/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json +++ b/data/reward-bench/SF-Foundation/TextEval-OffsetBias-12B/1892bf75-916b-4d4f-96ab-fda36872ae5d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/SF-Foundation_TextEval-OffsetBias-12B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9105 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.919 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8662 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9203 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9365 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json index 02ed42a46..49c043587 100644 --- a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json +++ b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-70B-Judge-r/e06e1863-c28f-4c96-a672-b1073c80aa71.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Salesforce_SFR-LLaMa-3.1-70B-Judge-r/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9272 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + 
"hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8476 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json index cbebf4cf5..deced96e1 100644 --- a/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json +++ b/data/reward-bench/Salesforce/SFR-LLaMa-3.1-8B-Judge-r/d923f7aa-a9d4-406a-b5d7-bdab508f04f7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Salesforce_SFR-LLaMa-3.1-8B-Judge-r/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8865 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9553 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7774 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8622 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9513 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json b/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json index 4b8e440cd..616e9bc30 100644 --- a/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json +++ b/data/reward-bench/Salesforce/SFR-nemo-12B-Judge-r/5c5e40b1-e86a-4d30-b93c-f8f9e73cdca8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Salesforce_SFR-nemo-12B-Judge-r/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9027 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9721 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8224 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8649 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9513 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json b/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json index f8eea126b..4492a4262 100644 --- a/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json +++ b/data/reward-bench/Schrieffer/Llama-SARM-4B/59299d8c-e468-490f-8a52-eef49b0aaeea.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Schrieffer_Llama-SARM-4B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7379 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6874 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4281 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9556 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7939 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json index 4322979ce..6723992e6 100644 --- a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json +++ b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/3ce9612f-9b57-476e-9fa4-6e63f14568a7.json @@ -1,10 +1,7 @@ { - "schema_version": 
"0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9499 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.9079 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9378 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9903 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json index 66d1a28bd..e51beb588 100644 --- a/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json +++ b/data/reward-bench/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1/9c605bf1-2533-43db-a610-e71c0aaecdb5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/ShikaiChen_LDL-Reward-Gemma-2-27B-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7249 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7558 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9131 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7633 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] 
-} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json index 399d00b39..7f469a316 100644 --- a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json +++ b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-70B/c289f778-92b8-44df-a079-3bced33c8ab5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-Critic-Llama-3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9331 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8794 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9311 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9554 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json index f6484f7e1..cf3327493 100644 --- a/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json +++ b/data/reward-bench/Skywork/Skywork-Critic-Llama-3.1-8B/329d4101-e740-490c-9fbc-1708f76a2f61.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-Critic-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8896 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9358 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8136 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9108 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.898 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json index 4fcc4b504..4ac0f4414 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/3e87f52e-b136-4cb3-8cbb-d8d8a8571051.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7531 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6721 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9689 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9172 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8182 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json index 890821552..5c04a1152 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B-v0.2/62b9adca-db38-46c0-a68a-ed7a8e735035.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9426 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9609 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" 
} }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8991 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9297 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9807 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json index 5809afc66..08b4c8323 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/4d2f43eb-e6f3-4686-a9d9-6b6c6b68b86c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7576 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7368 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4031 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7049 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9323 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8261 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json index 021c98b9c..22de7e431 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Gemma-2-27B/830df3fd-d479-4af8-a92b-93d82e804fec.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - 
"https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.938 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.9145 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9189 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9606 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json index a309aa94b..e5a811527 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/0e6d85b8-aa37-448c-adb2-0da2bd13e322.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9313 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8838 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.927 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9675 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json index 2de246c7d..1941ebc04 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2/45f0bd9c-e939-4b83-a623-1db61f431500.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B-v0.2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6968 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9414 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7169 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json index 0b2359ed4..ba30f4f9f 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/0f710903-7dd8-44ea-914d-d43bbfe894f1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7314 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6989 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.425 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9616 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.741 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json index 6bf4c02d1..03903b4ec 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json +++ b/data/reward-bench/Skywork/Skywork-Reward-Llama-3.1-8B/b9ddd960-f6f7-4962-8297-88ec7fbbbd1f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-Reward-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9252 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8728 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9081 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.962 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json index 7c195cad3..b19a61534 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.1-8B/25a4520b-c780-45fc-a00f-36db1776c6a8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8413 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8463 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { 
"score": 0.6625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.776 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9838 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8124 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json index d9d0560be..2ff90cff2 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-1B/96d7e5c1-2f43-4f09-9702-0af090afa141.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.2-1B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6084 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4562 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8733 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8929 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4306 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json index 3d9cbc212..9f8069f50 100644 --- 
a/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Llama-3.2-3B/5a47f8bd-401a-4b6b-91b0-9593b36e5996.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Llama-3.2-3B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7466 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7621 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4562 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.694 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9311 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6768 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json index 3d71cef8d..44ea9887d 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-0.6B/c27e98d4-f5ea-48f9-babc-3ccda2d21d2a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-0.6B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.58 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7158 + }, + "source_data": { + 
"dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8444 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7949 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3397 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json index 708a6390b..f670ad051 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-1.7B/060bf847-e7b5-4e30-934f-5306d01c499a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-1.7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6818 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6568 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7268 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8911 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8848 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4872 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json index 47d640d45..6f6900c68 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-4B/e648e6c2-18bb-49d7-b08f-47ce41a67d4f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", 
"evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-4B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7551 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7737 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7322 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9657 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6743 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json index 769d2cb8b..1c01babb8 100644 --- a/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json +++ b/data/reward-bench/Skywork/Skywork-Reward-V2-Qwen3-8B/537e92cb-25db-47f5-916a-6f666e14639a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-Reward-V2-Qwen3-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7837 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7989 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7705 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.94 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9636 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7294 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json index cce15d82f..47757e3b6 100644 --- a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json +++ b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/e59ca33f-c6ce-44d4-9cb4-2fd65608313b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Skywork_Skywork-VL-Reward-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9007 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8994 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.875 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9108 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9176 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json index acad04a74..adb50e622 100644 --- a/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json +++ b/data/reward-bench/Skywork/Skywork-VL-Reward-7B/fc99848b-82c7-459e-8327-1867a332ff28.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/Skywork_Skywork-VL-Reward-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6885 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8911 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8909 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7586 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json b/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json index 8e503b342..16bfc7b82 100644 --- a/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json +++ b/data/reward-bench/SultanR/SmolTulu-1.7b-RM/357f4f03-9542-495f-b575-4274111bbe1f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/SultanR_SmolTulu-1.7b-RM/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5094 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.743 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4408 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5716 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.2821 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json b/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json index f06598ac7..5ae0638d6 100644 --- a/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json +++ b/data/reward-bench/ZiyiYe/Con-J-Qwen2-7B/d78c42d6-fc0d-4719-bbb6-7a53dbb0d017.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ZiyiYe_Con-J-Qwen2-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8712 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.919 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8026 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8824 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8808 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json b/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json index 71e6f10bb..400642b9b 100644 --- a/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json +++ b/data/reward-bench/ai2/llama-2-chat-7b-nectar-3.8m.json/c94ddbe5-2bc0-4a33-b06b-10671fb22b70.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_llama-2-chat-7b-nectar-3.8m.json/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5843 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8631 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2654 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.6243 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json b/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json index 02fb796d2..26ba58fae 100644 --- a/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json +++ b/data/reward-bench/ai2/llama-2-chat-nectar-180k.json/cc2ac405-1710-46fa-aeba-dd86797c666c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_llama-2-chat-nectar-180k.json/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5235 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8827 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2851 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.4027 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json b/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json index 73085ecb0..4b539edf5 100644 --- a/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json +++ b/data/reward-bench/ai2/llama-2-chat-ultrafeedback-60k.jsonl/49fcb3e2-2883-4c3d-b519-d511c6b10162.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_llama-2-chat-ultrafeedback-60k.jsonl/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.644 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9441 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4539 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.5338 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json index 60e788067..277116c3d 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../0ba5ce6c-f311-4b02-a67a-d49539119a8e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7058 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3947 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 
+83,12 @@ }, "score_details": { "score": 0.7703 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json index 966dbbe7a..dfc9008b5 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../49029c9e-a831-4219-8e26-df20862ad3e1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7004 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9413 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3882 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.7716 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json index 20188aabb..974cd9980 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../6dedd117-eab0-4c31-b50b-4890099d9904.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6905 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9441 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3596 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.7676 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of 
file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json index f233bfae7..fb2652a50 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../71c20c06-efb8-428e-9e9d-e4fedf11041a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6945 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9385 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3706 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.7743 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json index 8a33df5ad..a8d6993af 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../862f3d57-8f5f-4372-b6fb-876fb35efba4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6808 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9302 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3596 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.7527 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json 
b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json index 0717d8e99..4d645ea3b 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../93ea2bfa-e058-42d5-afac-0d3fc50fce91.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6895 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9385 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3706 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.7595 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json index fb237ec91..f0096c309 100644 --- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json +++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c1331fa1-7793-4526-b24b-02261bb4437f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7019 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,7 +83,12 @@ }, "score_details": { "score": 0.7811 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json index 6d1bc4865..7ccfca2e6 100644 --- 
a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json
+++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../c3cab72a-47b3-47ec-bb2d-986903ab8c26.json
@@ -1,10 +1,7 @@
 {
-    "schema_version": "0.1.0",
+    "schema_version": "0.2.0",
     "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816",
     "retrieved_timestamp": "1766412838.146816",
-    "source_data": [
-        "https://huggingface.co/spaces/allenai/reward-bench"
-    ],
     "source_metadata": {
         "source_name": "RewardBench",
         "source_type": "documentation",
@@ -32,6 +29,11 @@
             },
             "score_details": {
                 "score": 0.7008
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -45,6 +47,11 @@
             },
             "score_details": {
                 "score": 0.9385
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -58,6 +65,11 @@
             },
             "score_details": {
                 "score": 0.3882
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -71,7 +83,12 @@
             },
             "score_details": {
                 "score": 0.7757
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json
index f3d24d1cb..dddd173cb 100644
--- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json
+++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../cd0452a7-0370-4024-a51f-b3deff290db9.json
@@ -1,10 +1,7 @@
 {
-    "schema_version": "0.1.0",
+    "schema_version": "0.2.0",
     "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-3.8m-check.../1766412838.146816",
     "retrieved_timestamp": "1766412838.146816",
-    "source_data": [
-        "https://huggingface.co/spaces/allenai/reward-bench"
-    ],
     "source_metadata": {
         "source_name": "RewardBench",
         "source_type": "documentation",
@@ -32,6 +29,11 @@
             },
             "score_details": {
                 "score": 0.6924
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -45,6 +47,11 @@
             },
             "score_details": {
                 "score": 0.9441
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -58,6 +65,11 @@
             },
             "score_details": {
                 "score": 0.3575
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -71,7 +83,12 @@
             },
             "score_details": {
                 "score": 0.7757
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json
index c9f57ea2b..fbf11359b 100644
--- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json
+++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json/6fd85045-d600-451f-8d27-da637add4081.json
@@ -1,10 +1,7 @@
 {
-    "schema_version": "0.1.0",
+    "schema_version": "0.2.0",
     "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized-700k.json/1766412838.146816",
     "retrieved_timestamp": "1766412838.146816",
-    "source_data": [
-        "https://huggingface.co/spaces/allenai/reward-bench"
-    ],
     "source_metadata": {
         "source_name": "RewardBench",
         "source_type": "documentation",
@@ -32,6 +29,11 @@
             },
             "score_details": {
                 "score": 0.7127
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -45,6 +47,11 @@
             },
             "score_details": {
                 "score": 0.9358
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -58,6 +65,11 @@
             },
             "score_details": {
                 "score": 0.4079
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -71,7 +83,12 @@
             },
             "score_details": {
                 "score": 0.7946
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json
index 7af294ed7..3770ce48e 100644
--- a/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json
+++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0-nectar-binarized.json/a15ca8c3-fd90-4ef9-80c5-40eeac60d785.json
@@ -1,10 +1,7 @@
 {
-    "schema_version": "0.1.0",
+    "schema_version": "0.2.0",
     "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0-nectar-binarized.json/1766412838.146816",
     "retrieved_timestamp": "1766412838.146816",
-    "source_data": [
-        "https://huggingface.co/spaces/allenai/reward-bench"
-    ],
     "source_metadata": {
         "source_name": "RewardBench",
         "source_type": "documentation",
@@ -32,6 +29,11 @@
             },
             "score_details": {
                 "score": 0.6756
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -45,6 +47,11 @@
             },
             "score_details": {
                 "score": 0.9134
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -58,6 +65,11 @@
             },
             "score_details": {
                 "score": 0.3904
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -71,7 +83,12 @@
             },
             "score_details": {
                 "score": 0.723
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json b/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json
index d81909091..ffe63e3e4 100644
--- a/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json
+++ b/data/reward-bench/ai2/tulu-2-7b-rm-v0.json/5f43832f-14fa-49e1-a851-949163aec826.json
@@ -1,10 +1,7 @@
 {
-    "schema_version": "0.1.0",
+    "schema_version": "0.2.0",
     "evaluation_id": "reward-bench/ai2_tulu-2-7b-rm-v0.json/1766412838.146816",
     "retrieved_timestamp": "1766412838.146816",
-    "source_data": [
-        "https://huggingface.co/spaces/allenai/reward-bench"
-    ],
     "source_metadata": {
         "source_name": "RewardBench",
         "source_type": "documentation",
@@ -32,6 +29,11 @@
             },
             "score_details": {
                 "score": 0.6655
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -45,6 +47,11 @@
             },
             "score_details": {
                 "score": 0.933
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -58,6 +65,11 @@
             },
             "score_details": {
                 "score": 0.4539
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         },
         {
@@ -71,7 +83,12 @@
             },
             "score_details": {
                 "score": 0.6095
+            },
+            "source_data": {
+                "dataset_name": "RewardBench",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json
index 83967095d..fef53f2e4 100644
--- a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json
+++ b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/1f8869e7-e434-469e-906d-d34621582cba.json
@@ -1,10 +1,7 @@
 {
-    "schema_version": "0.1.0",
+    "schema_version": "0.2.0",
     "evaluation_id": "reward-bench-2/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816",
     "retrieved_timestamp": "1766412838.146816",
-    "source_data": [
-        "https://huggingface.co/datasets/allenai/reward-bench-2-results"
-    ],
     "source_metadata": {
         "source_name": "RewardBench 2",
         "source_type": "documentation",
@@ -32,6 +29,11 @@
             },
             "score_details": {
                 "score": 0.7606
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         },
         {
@@ -45,6 +47,11 @@
             },
             "score_details": {
                 "score": 0.8126
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         },
         {
@@ -58,6 +65,11 @@
             },
             "score_details": {
                 "score": 0.4188
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         },
         {
@@ -71,6 +83,11 @@
             },
             "score_details": {
                 "score": 0.6995
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         },
         {
@@ -84,6 +101,11 @@
             },
             "score_details": {
                 "score": 0.8844
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         },
         {
@@ -97,6 +119,11 @@
             },
             "score_details": {
                 "score": 0.8646
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         },
         {
@@ -110,7 +137,12 @@
             },
             "score_details": {
                 "score": 0.8835
+            },
+            "source_data": {
+                "dataset_name": "RewardBench 2",
+                "source_type": "hf_dataset",
+                "hf_repo": "allenai/reward-bench-2-results"
             }
         }
     ]
-}
\ No newline at end of file
+}
diff --git a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json
index ba3b8f2bf..419aa0a24 100644
--- a/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json
+++ b/data/reward-bench/allenai/Llama-3.1-70B-Instruct-RM-RB2/8f9d05db-9bb0-4998-bc75-96dbfa695548.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-70B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9021 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8355 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9095 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8969 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json index d5327ebdc..77a854ced 100644 --- a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json +++ b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/2681e475-da0a-48a9-ab68-e0bf59240f90.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.649 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.72 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8267 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8323 + }, + "source_data": { + "dataset_name": 
"RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5406 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json index b7c605956..a01839a2f 100644 --- a/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json +++ b/data/reward-bench/allenai/Llama-3.1-8B-Base-RM-RB2/e2986d78-100d-417a-9f38-9a570a335d95.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Base-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8463 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.933 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7785 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8851 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7886 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json index 50aa7aae8..7175068fd 100644 --- a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json +++ b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1bc5cd51-5a3a-46ea-bc78-56f9b3081f69.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8885 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8158 + }, + 
"source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8932 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.887 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json index f7e4dfc34..095adf95a 100644 --- a/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json +++ b/data/reward-bench/allenai/Llama-3.1-8B-Instruct-RM-RB2/1d1127ee-7a0e-4915-b8bf-0b22f8ba338b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_Llama-3.1-8B-Instruct-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7285 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7432 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9071 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7638 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json index de7dc82bc..f3bf51149 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/4bb55ff5-5adf-407f-a9d6-910c6c9d2770.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.722 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8084 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6776 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8689 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8308 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json index 451b6f9b6..d0492cb5e 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-70B-SFT-RM-RB2/daebee0b-3856-4270-94c6-c14bd84f5cf5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-70B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8892 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8268 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9027 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8583 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { 
"score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json index 5788822b2..042aac2cb 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1be99417-352e-4a94-8108-b43123553667.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.687 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7516 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.86 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8545 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6397 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json index 748ceff12..8adafbc18 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-DPO-RM-RB2/8d3fbc68-2ee7-4989-a40c-f4a45e579b5c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-DPO-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8431 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9553 + }, + "source_data": 
{ + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.761 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8662 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7898 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json index 7db85157a..98a6ce817 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/9533891f-c2f7-4e82-9f39-131768dbc28a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8369 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7588 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8703 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7715 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json index 7d48f70bb..d3b513f5d 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RL-RM-RB2/b8a47660-f0a5-4136-a743-979863c53e3a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-RL-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - 
"source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6871 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7642 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8644 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8485 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6281 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json index 14e9efed4..e3e043728 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-RM/2673bea2-42eb-42a5-9dc2-13d43341c9b2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-RM/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.59 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7453 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3469 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5364 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5243 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json index 6f121b539..44e1a6e59 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/6f5555c2-588a-48d1-811c-be53634bbdef.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8551 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7917 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8784 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8005 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.0 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json index d0adf9b8d..674d59e88 100644 --- a/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json +++ b/data/reward-bench/allenai/Llama-3.1-Tulu-3-8B-SFT-RM-RB2/9c96fa7b-52e8-4aed-9fdd-f389091d5e6f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_Llama-3.1-Tulu-3-8B-SFT-RM-RB2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6821 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 
@@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8889 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json b/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json index f511228aa..f8ff8a104 100644 --- a/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json +++ b/data/reward-bench/allenai/OLMo-7B-Instruct/0519d9fb-f220-40ab-8257-f20ed98a8b47.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_OLMo-7B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6727 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8966 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5066 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6486 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7168 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5173 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json b/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json index 2571fe4e1..5110dfc2e 100644 --- a/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json +++ b/data/reward-bench/allenai/llama-3-tulu-2-70b-uf-mean-rm/ece70375-447f-41e8-aa03-8f4b26abea73.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench/allenai_llama-3-tulu-2-70b-uf-mean-rm/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7019 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8631 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5614 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6095 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8268 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5957 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json b/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json index 607b1292d..e1917bdfd 100644 --- a/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json +++ b/data/reward-bench/allenai/llama-3-tulu-2-8b-uf-mean-rm/7bbaffdd-f822-48cf-a0f2-e66b16db678d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-8b-uf-mean-rm/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7342 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5921 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8212 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6434 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json 
b/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json index 38ddc2375..bef93ec9e 100644 --- a/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json +++ b/data/reward-bench/allenai/llama-3-tulu-2-dpo-70b/27c5c441-64ce-41dd-8384-f84c8f6ccc14.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-dpo-70b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7496 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5746 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7486 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.802 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5687 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json b/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json index b4a96b007..a54ed9cc6 100644 --- a/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json +++ b/data/reward-bench/allenai/llama-3-tulu-2-dpo-8b/38a14e6a-2094-4e0b-be22-45181ede2a63.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_llama-3-tulu-2-dpo-8b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7275 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5351 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6649 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8663 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5097 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json index 5511a688f..264f422e1 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739590997/cee37c2c-2766-47b7-9192-a141e5d22f2d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739590997/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6004 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7032 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7867 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.598 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5165 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json index 4d1d727af..1d7e43d9e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739871066/d1d69392-8717-462d-9ce0-c7ddf5faf97d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739871066/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": 
"documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6012 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6989 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.425 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.604 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4527 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json index f0d0e8725..ccb6f9252 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739925892/72071bb1-57c0-4727-8100-ba24d8da10f5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739925892/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6345 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7432 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8111 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7131 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5606 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json index e113a319b..8e0cbdf9e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943850/7626c158-edaf-48f3-9ac3-1188be0c6032.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943850/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5726 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5191 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6489 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3114 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json index d1d507707..b0fd4a9be 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943881/c37be7a8-dc10-4fea-962b-202986a4581e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943881/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", 
"source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5998 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7032 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3187 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6727 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5025 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json index 6efc37a05..c0ad13c6c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739943972/223dc616-b20f-4065-91a7-3c35bfd11c94.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739943972/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6168 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5738 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6844 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5657 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + 
"hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3577 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json index 15342b481..6eb775d20 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739957701/4236b0a9-9d1e-41f6-8364-a7e8ebf51635.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739957701/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6194 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6779 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8022 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.697 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5822 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json index db24dc879..c409d89eb 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971507/c8030a87-0cdf-4918-b0d5-d1fb0e284656.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739971507/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": 
"RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5717 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.68 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5475 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4545 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json index 76286022b..c8518a2e2 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739971529/e6ecc1eb-7ff1-46aa-bf03-37bad1b391b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739971529/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5564 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6568 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7533 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5737 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4027 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json index 77dd9be0e..b41f9844e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1739998765/64872b1a-1eae-4171-95ec-a80c782b69f0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1739998765/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6008 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7095 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8022 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5859 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4883 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json index c78258032..04e0eceb6 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740005072/37484401-c7fe-469d-889a-e70f7cadbf82.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1740005072/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { 
"source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6097 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7137 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6343 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5047 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json index 660d601e2..a39194f60 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1740129284/8cf36288-3add-4fcd-a012-0df9eae2a059.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1740129284/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6129 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7116 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8022 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6101 + }, + "source_data": { + "dataset_name": "RewardBench 2", 
+ "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4652 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json index f109fdac1..6657f9ea6 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741286813/f2c8f979-c331-4b9b-b0a7-5efa82c17d3b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741286813/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6557 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6295 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4188 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9111 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5365 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json index 2320f7feb..56ed0daae 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741287363/de409ce8-fb68-4113-8879-23712769cbde.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741287363/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], 
"source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6672 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6295 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.88 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9374 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5748 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json index 68df2fbb6..d217f0d3d 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1741292911/264f20d7-1574-448c-8917-eb3f20810819.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1741292911/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6607 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6589 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9089 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8869 + }, + "source_data": { + "dataset_name": 
"RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5028 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json index a29c40de9..4897a8825 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742338142/0ebaec42-9190-4326-95dd-5ecb48bf1a72.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742338142/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6344 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7049 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.88 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6323 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.475 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json index 7b2861f0f..e2f564fd7 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519610/29515933-c60b-4686-b475-70ef53d75457.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742519610/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - 
], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6361 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7074 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6721 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.82 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6444 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5915 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json index ad3f3b414..e92fdfc16 100644 --- a/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json +++ b/data/reward-bench/allenai/open_instruct_dev-reward_modeling__1__1742519628/414174a9-7e44-4f7b-94ce-0757639f5af7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-reward_modeling__1__1742519628/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5609 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5179 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8356 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5071 + }, + "source_data": { + 
"dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5254 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json index 8cc753229..c75b5a4dd 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/48513083-f854-455e-8455-ddbd2698ec03.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_100pctflipped__1__1744241455/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.0576 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.04 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.1313 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.0546 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.0489 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.0808 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": -0.01 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json index 25e15cee3..d30d94d18 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/0b373560-854f-4482-81d0-6c984e130144.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_10pctflipped__1__1743295511/1766412838.146816", "retrieved_timestamp": 
"1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5499 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6821 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7356 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5212 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3711 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json index b9b8966dc..d5744d0a2 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1a021cab-d569-4077-af5e-1643f45de03d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_20pctflipped__1__1743295406/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5054 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6358 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6867 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + 
"hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4424 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.2922 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json index 7ae1c6a61..1263be4bc 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/e26e230d-59b3-4243-a6c4-3845ab74b89b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_30pctflipped__1__1743325136/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.478 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6442 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6356 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2707 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3496 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json index ea8c8eb0d..8bd48e77c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/aa0991d0-9c5e-4f94-bc12-3342ca389e99.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_50pctflipped__1__1744241398/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.219 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.2484 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.2623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.1717 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.008 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json index ce94d3dd6..a81128f70 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/397abe47-d5e9-487d-b883-ec49db16c584.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_5pctflipped__1__1743444535/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6821 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, 
"score_details": { "score": 0.7511 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5313 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.403 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json index 8f5f2f727..53333e181 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/82f52a35-41b5-4b9c-bb3e-4bf18eed0b92.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo__1__1743550054/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5759 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7074 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7578 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.459 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json index 3560ba862..ba5b508f6 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/670382ab-a8a1-43f3-a572-b9a5aeae23ef.json @@ -1,10 +1,7 @@ { - 
"schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworks__1__1744530271/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6057 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5053 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7798 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5419 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json index f5393bdb8..55a7568d2 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/a4b3c031-7c01-4f7a-8cfe-52b3260d6ecc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_dpo_skyworkstulufull__1__1743550181/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6535 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7137 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 
2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8244 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7737 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6101 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json index 03b3b1315..b781deca8 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl__1__1743551221/7fcd3fce-2296-4b5c-8362-24b1c70ccb8f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl__1__1743551221/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5799 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7116 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.76 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5374 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.461 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json index ed8838f86..9f032475e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json +++ 
b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/4f164e8b-55a1-498f-b586-cf78da7d0b57.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworks__1__1744530262/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5903 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4863 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5738 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8489 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4926 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json index ae1b169a8..9d860b297 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/a84d3d61-6e05-4d4d-bc89-7f663e9667fb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_rl_skyworkstulufull__1__1743551523/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6483 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7074 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7758 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6044 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json index 7679c69db..7961d2bec 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/7aa98f71-8262-4c1f-a71c-1ef36f2ef04c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_1_skyworkstulumix__1__1743205750/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5157 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6084 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7089 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3791 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json 
b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json index 95de0b2a6..9a7aa8751 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/93398c1f-3129-4be4-83b5-62a4a45c6b84.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_10pctflipped__1__1743295427/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6009 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7273 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3931 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json index dd6172be6..0629539f9 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/62493784-f899-4736-bdce-2107ec99a752.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_20pctflipped__1__1743295446/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5716 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6779 + }, + "source_data": { + "dataset_name": "RewardBench 
2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5464 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7533 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3534 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json index 13df8e692..bca56ed2b 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/9b68ecaa-cf9d-414e-9cf1-c662c765bb5c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_30pctflipped__1__1743325094/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5151 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6484 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5574 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4889 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3357 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json index 8cfb3dab5..54fc2bfbe 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/76f3d0bd-2b71-4406-a0d4-b01b6c91c4ff.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_5pctflipped__1__1743444636/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6119 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.72 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8067 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6889 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.421 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json index 91011ae74..55ddbdfe8 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/2dc5ab6f-2427-42ae-9582-a0e6139f451a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_dpo__1__1743549325/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6008 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": 
{ "score": 0.7179 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6707 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4707 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json index 3b7dac4aa..4b4ef4368 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_rl__1__1743551238/0db97be6-6562-47d8-bd1a-5b469250e54b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_rl__1__1743551238/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5965 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7095 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8044 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6566 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.453 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff 
--git a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json index 9f627ce74..2cf126b25 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/228e4dc4-e517-4023-b690-7f0c321286b2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_1e-6_2_skyworkstulumix__1__1743205906/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5574 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6526 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7711 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4208 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json index 776e5e4aa..8dc1a9073 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/9442b27c-c94d-41c0-a752-3bd82385272d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_100pctflipped__1__1744241529/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.0719 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.0421 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.0601 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.0378 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.0949 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": -0.01 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json index bce0553bd..5f62f67ab 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/561039ac-b156-40eb-bf53-21a275b858ca.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_10pctflipped__1__1743295305/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.553 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6733 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5697 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4227 + }, + "source_data": { + "dataset_name": 
"RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json index e616ad328..431df7a47 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/d801d700-7b4d-4a62-883b-3d85b05385ea.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_20pctflipped__1__1743324778/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4955 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6189 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.325 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6378 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5657 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.2466 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json index 258844fef..6552eeea3 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/b8f24058-4441-4d19-898e-80470cc7b685.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_30pctflipped__1__1743326459/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { 
"score": 0.4198 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5747 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5464 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.3596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.2073 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json index 6f242885e..6c4f2b7cc 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1f372e00-e7a8-43ef-8e14-ef1b08e5e957.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_5pctflipped__1__1743443747/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5465 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6821 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" 
} }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3713 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json index 6377b0646..aabf71c3b 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/0200a1b3-71f1-4633-96a5-4ca9883a67a7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_1_skyworkstulumix__1__1743205935/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5197 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6126 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5847 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4646 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3855 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json index fa1623133..a1a7dd25e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/55479901-aec7-4875-b792-ba73b54aa37a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_10pctflipped__1__1743295360/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], 
"source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4555 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5495 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4262 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5711 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6101 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.2696 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json index 9b0a80d97..f5aa5f436 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/872597b2-4392-4f23-b5b2-41d418b6cf89.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_20pctflipped__1__1743295366/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5053 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4044 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6646 + 
}, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.1991 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json index 81139c06d..a0af647f5 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/5cb437b5-5993-418d-bd9f-81dea71d9edf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_30pctflipped__1__1743326352/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.341 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3333 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3711 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.3919 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.195 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json index cf6f1e202..c1a864b6a 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/c471cdf7-73f9-48c9-a970-baa66b609093.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_5pctflipped__1__1743444634/1766412838.146816", "retrieved_timestamp": 
"1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4698 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5853 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2562 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5027 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6489 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5697 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.2562 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json index 2f9ef1c59..4f624253c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/794a71b4-8a43-4c69-a663-369eea6a84a3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_2e-5_2_skyworkstulumix__1__1743205988/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4791 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6421 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.541 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6911 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4182 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.27 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json index 656ff3ace..da7381bc1 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/2ad22375-4ed8-4be6-a012-a6f6799581e2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_100pctflipped__1__1744242103/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.0607 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.0274 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.1625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.0656 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.04 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.0788 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": -0.01 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json index 15ddbb19b..a04867707 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/a8df0dc2-d16c-4e1a-b0b5-abe2a4a1d803.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", 
"evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_10pctflipped__1__1743324835/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6089 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7622 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6444 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4686 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json index fd550ff4e..de5939755 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/ca0a010a-fe3a-4b87-8c80-4a8d3e2597fb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_1pctflipped__1__1743445221/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6032 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7158 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 
+101,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5859 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json index cb78f43a1..7f0d231a3 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/5d1c166c-6a22-4afb-b1b1-f7db9ec38bd8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_20pctflipped__1__1743324826/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5831 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6947 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4188 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.74 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5758 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4465 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json index b0a09b23e..25ae9470b 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json +++ 
b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/10a432fa-dfef-4c9c-bdf7-ce0f81fd1895.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_30pctflipped__1__1743326363/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5268 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.68 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.4343 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3809 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json index e0aa38363..0676aafc3 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/a550663c-2a04-4dfb-8663-b177a7181f3d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_5pctflipped__1__1743444498/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6093 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4313 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 
+83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7578 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5859 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5143 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json index 5f55252f9..42e9ce0e8 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__2__1743897475/72b6196e-0a2b-4ec9-80a3-a7eb14f7be09.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1__2__1743897475/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6122 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7368 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8044 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.602 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5071 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json index 7a3fade87..c9eb27faa 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json 
+++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1__3__1744311421/5e41f068-f009-4e32-bac1-9de5220a2ce2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1__3__1744311421/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7179 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6323 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.503 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json index 44c242478..86624d603 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/eca1331f-6503-481a-b77b-3d96791f54e8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo__1__1743549903/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6154 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + 
"source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6061 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5043 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json index 04069c084..ade9d6695 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/69def7de-a916-4d23-984b-e676e91e1d8c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworks__1__1744530368/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6604 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6316 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9044 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8929 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5604 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json index 60fb1cc4c..c5d2892d2 100644 --- 
a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/679c6e0b-9e0b-4224-b1e3-59df149739a0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_dpo_skyworkstulufull__1__1743550182/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6783 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7705 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.84 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8101 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6427 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json index c7922a644..e2e75967a 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/2335433d-37c6-47f0-ad3b-5e0a42e9488f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_no_if__2__1744316012/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5911 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7347 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + 
"dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.74 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.604 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4392 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json index a845558dc..2d3cbe13a 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/fe84f8a3-5fe9-4385-b6d4-0436fb7e5197.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_no_if__3__1744315765/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5926 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7889 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5879 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4733 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json 
b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json index 007b55ac9..d305acb78 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl__1__1743551527/70d2697e-0df5-40ae-9268-b906c9cabd9d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl__1__1743551527/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6126 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7411 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.425 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7822 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5939 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5104 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json index f2f9d18b0..bf0b750c2 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/0a30fd70-2381-4a4b-89aa-dbd169c856f0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworks__1__1744530236/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6525 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6021 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8626 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.59 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json index 15a69a666..bf056a3a5 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/b9c787f9-3bcd-4215-a157-7fcfa2df82cc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_rl_skyworkstulufull__1__1743551530/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6849 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7453 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8404 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6885 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json index 64c48beaa..b8d8f17e6 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/bdd98f27-fbfd-4de7-bd4e-3b8c3e4e7cc0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulu75__1__1743534417/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.586 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6632 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.425 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6557 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5172 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.477 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json index 0c4f3ed5a..5bc8800b1 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/44b20109-d534-4aa9-867d-fa59935ef6d0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__1__1743446486/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6773 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7432 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.804 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6626 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json index 211703e24..620c1403f 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/d1196312-4153-4a38-aa46-2940d63d7924.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__2__1744314745/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6793 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7558 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8311 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8061 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6485 + }, + "source_data": { + 
"dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json index 2ebaeb360..93ad54ca5 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/4b1e3070-04ef-47e7-b720-739320194e7b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulufull__3__1744311661/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6611 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.72 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6393 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8444 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7636 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6428 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json index 42b80157b..12de82f8f 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/247f400e-dca8-4dab-bebf-092f778f02c9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_1_skyworkstulumix__1__1743204472/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": 
"documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5172 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5003 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json index 85cca367e..7f64660cc 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/d043ad21-102b-49f0-9e8e-6daef7cc3a2e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_10pctflipped__1__1743295267/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5746 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6505 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5082 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7844 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7414 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4128 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json index 07424bed4..cd842e3b9 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/d45ec8b8-1ee6-49bb-9237-a7271ba9d13c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_1pctflipped__1__1743445759/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6065 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7116 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5792 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7152 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.465 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json index e27b5cfcd..eb5fdd21c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/05a4c6aa-9af2-44f0-8c55-8aeed2e75eaf.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_20pctflipped__1__1743324905/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - 
"https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5305 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5832 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.459 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7071 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3849 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json index 68eaa49b9..f42f0f831 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/a6ef712e-014e-470e-8d5b-f3b51f677aee.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_30pctflipped__1__1743326363/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4436 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5411 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3115 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6267 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5414 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.31 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json index f0c8fa826..f6b4dff52 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/35a039ba-06be-4ec2-9bde-a6a6db2eefec.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_5pctflipped__1__1743444505/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5925 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.68 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5519 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.78 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7434 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.431 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json index 275d60107..d291b202f 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/97cb96f8-ce4c-403f-bfbc-386d3c611c81.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_dpo__1__1743550180/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6198 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8133 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7232 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4908 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json index 1949117b5..0d4d4902f 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/3a1621e9-75ee-4b34-9c0d-ae15399b1dab.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_dpo_skyworkstulufull__1__1743550187/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6763 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7411 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, 
{ @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8844 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8545 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5908 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json index f848eb22f..6717d04eb 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl__1__1743551509/237218ac-4c74-4647-82b1-700360ddfdbd.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_rl__1__1743551509/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6245 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7242 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8178 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7253 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5124 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json index 19a835907..7e1595151 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json +++ 
b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/2858d126-d2ef-4512-8fc8-c39faf24b908.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_rl_skyworkstulufull__1__1743551498/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6673 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7326 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8622 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8566 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5911 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json index 54f6f5b7e..599698f0e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/d118ddb1-aafc-4ddf-b5c7-f3ff921bbe0c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulu75__1__1743548926/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5863 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5515 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4768 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json index ff4fe8a13..aa704ecb8 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/379ec82f-a6a7-4976-a4a6-ab80cb9da293.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_3e-6_2_skyworkstulumix__1__1743205661/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.589 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6842 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6393 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7867 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6081 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.447 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json 
b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json index 239e14075..1c2c98ef1 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/c4df42d1-a838-4717-a814-40559fcd7342.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__4__1747266598/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7306 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7474 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.694 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8622 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8061 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8992 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json index dd8ead8fd..a60528170 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/f022d826-3252-4def-b37b-3ce44d78f4ce.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama70b_skyworkstulufull__8__1745387923/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7573 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8168 + 
}, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7049 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8733 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8545 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8814 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json index 1b6ae8e92..7523cf126 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1__1__1743896628/cecc321b-efbd-434e-8a31-a97bbb8bbb3b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1__1__1743896628/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6637 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6947 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7273 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6834 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} 
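Note: every hunk in this patch applies one and the same transformation per file — bump `schema_version` from `0.1.0` to `0.2.0`, drop the top-level `source_data` list of URLs, and attach a `source_data` object (`dataset_name` / `source_type` / `hf_repo`) to each individual score entry. Below is a minimal Python sketch of that migration, not the script actually used; it assumes the per-score entries live under a top-level `"results"` key (the key name is not visible in these hunks) and that the files are serialized with two-space indentation.

```python
# Minimal sketch of the v0.1.0 -> v0.2.0 migration these hunks apply.
# Assumptions (not visible in the hunks): the per-score entries live under a
# top-level "results" key, and the files use 2-space JSON indentation.
import json
from pathlib import Path

PER_SCORE_SOURCE = {
    "dataset_name": "RewardBench 2",
    "source_type": "hf_dataset",
    "hf_repo": "allenai/reward-bench-2-results",
}

def migrate(path: Path) -> None:
    doc = json.loads(path.read_text())
    if doc.get("schema_version") != "0.1.0":
        return  # skip files already on v0.2.0 (or on an unexpected version)
    doc["schema_version"] = "0.2.0"
    # v0.2.0 retires the top-level list of source URLs ...
    doc.pop("source_data", None)
    # ... and instead records provenance alongside each individual score.
    for entry in doc.get("results", []):  # "results" is an assumed key name
        entry["source_data"] = dict(PER_SCORE_SOURCE)
    # Appending a trailing newline also resolves the
    # "\ No newline at end of file" state the v0.1.0 files were in.
    path.write_text(json.dumps(doc, indent=2) + "\n")

if __name__ == "__main__":
    for p in Path("data/reward-bench").rglob("*.json"):
        migrate(p)
```

Moving `source_data` from a single document-level URL list into each score entry presumably allows per-score provenance to differ in future records; in this patch all entries within a file carry the identical `allenai/reward-bench-2-results` source.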
diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json index a91a3b5a1..884f2c1d7 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/278c2132-3415-48f4-a839-ed09d71e9240.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworks__1__1744062999/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6665 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5979 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8606 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json index 34e3c97b7..58d203adf 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/92bbda1a-ecb1-493d-aa39-a29522c1a11e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_1_skyworkstulufull__1__1743712777/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7038 + }, + "source_data": { + "dataset_name": "RewardBench 2", 
+ "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6947 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6557 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8867 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8586 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7331 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json index efc7adbb8..eb2f9451c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2__1__1743896638/f43b2dff-9e73-4779-86e0-b2cc30ae8b40.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_2__1__1743896638/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6754 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6716 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8756 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7737 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6976 + }, + "source_data": { + 
"dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json index 370fb153c..b741da56e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/59a98f5d-d017-4b1a-a563-5abd113337e9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_1e-6_2_skyworkstulufull__1__1743800938/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7241 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7305 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9414 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6635 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json index 389f17a6d..4c6ec4ab3 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/a41597ed-fbab-41af-9625-c277ca988546.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-5_1_skyworkstulufull__1__1743712885/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], 
"source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6716 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6632 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.82 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8303 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.719 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json index 03485eca0..92fb6052e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/e311eb59-f217-4bc2-b69b-dcea434797a8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-5_2_skyworkstulufull__1__1743800773/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6207 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6358 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.375 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8267 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 
+119,11 @@ }, "score_details": { "score": 0.802 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4948 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json index 6484f285a..5c7a0911d 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/69b037c3-bae2-4889-b10d-e732c45851e9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_2e-6_1_skyworkstulufull__1__1743893867/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.719 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6393 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9273 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.738 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json index 2e7c644e7..8b895898b 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__1__1743929424/adeee000-0b62-4a0c-afaa-5e8c5f29ff6d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__1__1743929424/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6572 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7305 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.703 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6837 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json index 9bf9f5677..cba9daba8 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__2__1744311395/4464d588-62b2-440b-8188-2450bd7a94c5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__2__1744311395/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6938 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7537 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.45 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6393 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8667 + }, + 
"source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7616 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6913 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json index f600554b8..acca1c710 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1__3__1744311491/bf358648-a41d-43ee-8c14-f8b8eef41871.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1__3__1744311491/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6754 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7242 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8422 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7535 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6976 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json index ba7ec7691..d8abd1886 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/afd99f12-f739-40d3-aa11-ef3a45316931.json @@ -1,10 +1,7 @@ { - "schema_version": 
"0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworks__1__1744062787/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7045 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6253 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.92 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9232 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7109 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json index aed65ca51..68019510c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/49b4a24b-ddf1-47f0-ba39-9366892a1213.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__2__1744311461/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7189 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7305 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9374 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7475 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json index 9b879bd49..cb7ea678a 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/ea14a487-39c3-488b-b52b-998e57135487.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_1_skyworkstulufull__3__1744311780/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7172 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7242 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4313 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.897 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7555 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json index 7b6ba9d0a..462dccb75 100644 --- 
a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2__1__1743896489/02f74b6a-7f63-484e-a7c1-0c53bd801b87.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_2__1__1743896489/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6813 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7137 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8644 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6781 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json index 32bb74ed1..1f6674a88 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/e492c59d-4b03-4dce-983e-a8724de35a60.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_3e-6_2_skyworkstulufull__1__1743800713/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7209 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7116 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { 
"score": 0.3875 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9067 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9172 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7414 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json b/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json index 7a4cda11c..f774c416d 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/53de0394-8516-4882-b2bc-c7e62e3d8ef0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llama_4e-6_1_skyworkstulufull__1__1743893911/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7266 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7347 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4313 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6339 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.897 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7697 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json index d5a03d7e7..41b8b3c61 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/56d4c1c5-5238-45dc-8331-64a14b830779.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_1e-6_1_skyworkstulufull__1__1745386412/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5342 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6042 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.275 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.5818 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3935 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json index b721ec24f..19e1f2b82 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/7003c9d4-c758-4373-a7a3-04822978bf35.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_1e-6_2_skyworkstulufull__1__1745441922/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { 
"score": 0.6111 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6884 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7576 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4628 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json index 7349185f8..504f0108c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/75a7dcb6-789c-49de-b209-4cf7d27465e4.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_2e-5_1_skyworkstulufull__1__1745386495/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5825 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6379 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.325 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5355 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4691 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json index 8b98a2800..b50170dc8 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/e91d3910-4f20-4e82-b1fb-8605f5d2b8ac.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_llamabase_2e-5_2_skyworkstulufull__1__1745386507/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5598 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5495 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.76 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7273 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3754 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json index 32bdafd8d..2ce4a6752 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/f18bfd44-3097-4eb8-a09c-2372c3ecd738.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_llamabase_3e-6_1_skyworkstulufull__1__1745386507/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6101 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6632 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6175 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7778 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7111 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5408 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json index 308f6d0b2..aef503803 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/9ca974b9-c5fb-4fc4-ab3e-1246e31ecdb2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen32b_1e-6_skyworkstulufull__8__1748235917/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7185 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7305 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7158 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8545 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.804 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json index a7319cf44..3c5aab68c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/fb1ab5e0-18db-4e5f-add3-2352d9a1f260.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen32b_3e-6_skyworkstulufull__8__1748288961/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7325 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7474 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4437 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7158 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8141 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8763 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json index 03156b7ff..4f30313c2 100644 --- 
a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/60ba1f0d-7e85-49e4-8c73-330d74de6707.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__1__1744062830/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6022 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.325 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.694 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7556 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7616 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5486 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json index 2acbfdf87..e82661177 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/29d1c194-8b87-466c-8701-e0fcf267665c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworks__2__1744576024/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5948 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5579 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2875 + }, + 
"source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6776 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.72 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7394 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5863 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json index 61118bf8f..0ba320e56 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/31e8f616-7b64-4d1a-b395-20bf8bb4629c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_1e-6_1_skyworkstulufull__1__1743712914/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6492 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6084 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.35 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6776 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.76 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.699 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json index f8d27de74..4d27acb60 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/cc3f315d-3cea-47e4-83b4-b5045e778c5e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_2e-5_1_skyworkstulufull__1__1743713091/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6764 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7074 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6885 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8622 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.802 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6984 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json index f7eccad79..68ba40d15 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/5d20dbf8-bb14-46af-adcd-b7ba05f8352c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__1__1744062829/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6408 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6337 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6831 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8467 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5529 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json index 54dfa9a73..69c6a7c36 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/06f2cb33-3937-4fde-84e2-6b5467f051c6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworks__2__1744576050/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6452 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3187 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7158 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8356 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8343 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5603 + }, + 
"source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json index 3fd04e107..3817a0a83 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/f35c4efa-3767-4a0e-8769-06230cda2512.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_1_skyworkstulufull__1__1743712916/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7013 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7263 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8222 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8444 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7714 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json index 958f44c2b..05ff3f527 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/6cb65d6a-6c46-4991-8154-f28b101954f6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_2__1__1743023576/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", 
@@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6369 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6905 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3187 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7844 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6236 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json index 7b5c77287..2295c0011 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/6e15a49b-7dc4-4d69-965e-cb962c084e4a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwen_3e-6_3__1__1743023619/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6221 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.325 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7455 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5852 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json index b67cb4765..afd3a2b0e 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/9f5591f4-751d-48d3-a348-4bb59f6bb1a3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_1e-6_1_skyworkstulufull__1__1745388583/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5735 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5895 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6889 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6727 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5823 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json index 93b0e4d7c..ce70a17a6 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/b609c002-fa0a-46a8-b5a1-9213ee89606c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_1e-6_2_skyworkstulufull__1__1745388604/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6336 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6337 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6885 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7244 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.802 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6465 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json index 303899a02..23d89ab11 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/b147fc7f-0e31-49ca-abfd-ba990a925097.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_2e-5_1_skyworkstulufull__1__1745388738/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6824 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6989 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6831 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + 
"hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8311 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8081 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7107 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json index de6749902..b6f1f989c 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/e4fbfe23-2b70-459e-821b-db0116d43d8c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_2e-5_2_skyworkstulufull__1__1745388191/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6392 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6589 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7933 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7717 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5804 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json index 00bf97d41..9f16f2dae 100644 --- 
a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/2ab7dc14-af3e-4fb2-8c0c-fe0e14100321.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_3e-6_1_skyworkstulufull__1__1745388737/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.664 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6821 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8133 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8061 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json index 24ccc05c6..d9c996fb0 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/aca2c665-79f2-4226-b806-307be277ed08.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_qwenbase_3e-6_2_skyworkstulufull__1__1745388138/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6678 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6505 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6831 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7978 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8808 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6632 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json index 27cd1cbd9..61f3caac6 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_1__8__1742924455/d37a63df-6d38-4083-bf87-11064162efde.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_tulu3_70b_1__8__1742924455/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6618 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7958 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.325 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6557 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8311 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6323 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7311 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git 
a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json index d97c5ebcc..2b5278d3d 100644 --- a/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json +++ b/data/reward-bench/allenai/open_instruct_dev-rm_tulu3_70b_2__8__1742982964/16e550cc-e59d-4aaa-b221-8cf71e1b26d2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/allenai_open_instruct_dev-rm_tulu3_70b_2__8__1742982964/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6605 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7789 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6448 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8844 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6195 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json b/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json index f588b1e6b..16dc6eef6 100644 --- a/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json +++ b/data/reward-bench/allenai/tulu-2-dpo-13b/47058e2a-dc41-45f8-8c32-bc496a8d3bc5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_tulu-2-dpo-13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7368 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5833 + }, + 
"source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7946 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7323 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4947 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json b/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json index 8c042ac24..d68ff3ff8 100644 --- a/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json +++ b/data/reward-bench/allenai/tulu-2-dpo-70b/7199c8b3-8346-4200-b07e-4362ad13a7db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_tulu-2-dpo-70b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7621 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9749 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6053 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8446 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7407 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5278 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json b/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json index 04b1542bf..bef43cd19 100644 --- a/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json +++ b/data/reward-bench/allenai/tulu-2-dpo-7b/de7e59d5-e2ce-4479-bbd9-ab9deb3beed3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_tulu-2-dpo-7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7212 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9749 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5614 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7527 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7176 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4774 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json b/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json index f43f7de4d..15c29fb58 100644 --- a/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json +++ b/data/reward-bench/allenai/tulu-v2.5-13b-preference-mix-rm/17e011c3-1a53-40ae-b7b4-cb24c23df3de.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_tulu-v2.5-13b-preference-mix-rm/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8027 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9358 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.682 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.773 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.885 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6724 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json b/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json index 0c9d35f42..817d26686 100644 --- a/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json +++ b/data/reward-bench/allenai/tulu-v2.5-13b-uf-rm/1125dd05-2f0d-48ca-825c-f5efa18564aa.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": 
"reward-bench/allenai_tulu-v2.5-13b-uf-rm/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4806 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3939 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4232 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5554 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4737 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6326 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json b/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json index 13869a984..9fc720998 100644 --- a/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json +++ b/data/reward-bench/allenai/tulu-v2.5-70b-preference-mix-rm/88014e0d-e89b-4fed-9eb6-5276bd7658df.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_tulu-v2.5-70b-preference-mix-rm/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6516 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7737 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5921 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8486 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4138 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6079 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json 
b/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json index 2b34c68aa..b30d36361 100644 --- a/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json +++ b/data/reward-bench/allenai/tulu-v2.5-70b-uf-rm/7cc9bfc2-570d-456c-918f-68fd4b711f05.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/allenai_tulu-v2.5-70b-uf-rm/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7398 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8659 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7171 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7014 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json b/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json index a31abcbec..66d40e95d 100644 --- a/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json +++ b/data/reward-bench/berkeley-nest/Starling-RM-7B-alpha/77b0957f-8779-4dbe-a6ea-cff50c4ee73b.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/berkeley-nest_Starling-RM-7B-alpha/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7113 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9804 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4561 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8446 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.58 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6794 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json b/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json index 4bd82d8a1..96835a8ce 100644 --- a/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json +++ b/data/reward-bench/facebook/Self-taught-Llama-3-70B/ba0ce7ce-a755-4337-bfec-0391680d3625.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/facebook_Self-taught-Llama-3-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8863 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8399 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9108 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8251 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json b/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json index 9dd3e9229..a0b337292 100644 --- a/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json +++ b/data/reward-bench/facebook/Self-taught-evaluator-llama3.1-70B/4eb460eb-b3ad-4e0d-b131-5b59ef54015c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/facebook_Self-taught-evaluator-llama3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9001 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8509 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8959 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8844 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json b/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json index c1f8c90fa..30c26da89 100644 --- a/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json +++ b/data/reward-bench/general-preference/GPM-Gemma-2B/6868a1e5-ee86-4f89-8452-5e939ac19169.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/general-preference_GPM-Gemma-2B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7449 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7151 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6974 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8122 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.755 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json b/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json index ec4043ed2..d66a7ae70 100644 --- a/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json +++ b/data/reward-bench/general-preference/GPM-Llama-3.1-8B/4a151d43-5fac-4afe-9c23-ba0e86a60849.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/general-preference_GPM-Llama-3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9224 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.933 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.886 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9108 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9597 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json b/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json index debba95cf..4f1439052 100644 --- a/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json +++ b/data/reward-bench/google/flame-1.0-24B-july-2024/5f16d574-adef-4016-abcf-9e7936771ba7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/google_flame-1.0-24B-july-2024/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8781 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9218 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7566 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8959 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.938 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json b/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json index 47953379d..0c59d6494 100644 --- a/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json +++ b/data/reward-bench/google/gemini-1.5-flash-001/f3e0300f-39ed-4cfd-bd03-218904836037.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/google_gemini-1.5-flash-001/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8054 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9218 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6349 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8696 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8512 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6937 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json b/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json index d3c96b98e..11fe9a9c7 100644 --- a/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json +++ b/data/reward-bench/google/gemini-1.5-flash-8b/42c82c00-b74e-4152-a222-15d481a13e0c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/google_gemini-1.5-flash-8b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4851 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4611 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5082 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6622 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6747 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.2421 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json b/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json index ebd01fc22..1faa0442f 100644 --- a/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json +++ b/data/reward-bench/google/gemini-1.5-pro-0514/68096be8-c49f-4a23-824e-1275248369f7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/google_gemini-1.5-pro-0514/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.882 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9232 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8059 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8791 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9199 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json b/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json index d0d312a19..2eb44d882 100644 --- a/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json +++ b/data/reward-bench/google/gemini-1.5-pro-0924/c91270bd-3731-452a-b429-6cd4943d1194.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/google_gemini-1.5-pro-0924/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8678 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9413 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7697 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9022 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json b/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json index a742b1b32..8abf7e861 100644 --- a/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json +++ b/data/reward-bench/google/gemini-2.5-flash-preview-04-17/337c7a43-46a7-4acb-b7f1-936e1f2cf46f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/google_gemini-2.5-flash-preview-04-17/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7721 + }, + "source_data": 
{ + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6574 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5531 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8115 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9094 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8672 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8341 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json b/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json index 6446b47a9..5a4a0577c 100644 --- a/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json +++ b/data/reward-bench/google/gemini-2.5-flash/3b00f881-8f73-4608-8cbb-846fe7d1cfea.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/google_gemini-2.5-flash/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7767 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.674 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.575 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.852 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.909 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.841 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.809 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of 
file +} diff --git a/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json b/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json index e1e5d510c..a4d4ee1dd 100644 --- a/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json +++ b/data/reward-bench/google/gemini-2.5-pro-preview-05-06/2821dfdc-291b-405e-bd81-cf536c802885.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/google_gemini-2.5-pro-preview-05-06/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6775 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6532 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4688 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5342 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8806 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8308 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6973 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json b/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json index a31100099..f67d63bbb 100644 --- a/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json +++ b/data/reward-bench/google/gemini-2.5-pro/7d441240-7e85-4776-b51c-3c1bc84456ba.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/google_gemini-2.5-pro/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7948 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.755 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.619 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.898 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.881 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.805 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.811 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json b/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json index 20d232e8c..5bc50f14c 100644 --- a/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json +++ b/data/reward-bench/google/gemma-2-27b-it/840d35d9-441e-4ba3-bbc3-1f4ff2627517.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/google_gemma-2-27b-it/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.809 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9483 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.591 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8635 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.833 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json index 8909aa067..b2d697ddd 100644 --- a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json +++ b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/0127f3c5-9657-4eb6-a77a-5a6476a8fc79.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7847 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9832 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5789 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.85 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7434 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7508 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json index 9d40f5e06..53579af56 100644 --- a/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json +++ b/data/reward-bench/hendrydong/Mistral-RM-for-RAFT-GSHF-v0/b72e2988-75e4-4d26-9a47-daae4786b02f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/hendrydong_Mistral-RM-for-RAFT-GSHF-v0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5851 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5779 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6011 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6747 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5988 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json 
index b55666165..4bce79497 100644 --- a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json +++ b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/643cf5a3-8992-4126-87c9-814887314266.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/infly_INF-ORM-Llama3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7648 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7411 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4188 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9644 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.903 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8622 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json index f1498d55c..d0e17fefb 100644 --- a/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json +++ b/data/reward-bench/infly/INF-ORM-Llama3.1-70B/f81f1f67-6506-481f-87ce-a17a6a7578f3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/infly_INF-ORM-Llama3.1-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9511 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.9101 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9365 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { 
@@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9912 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json b/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json index ca734a8e5..009a9841f 100644 --- a/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json +++ b/data/reward-bench/internlm/internlm2-1_8b-reward/32b35218-a099-410e-8a65-a0d6e2f380a6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/internlm_internlm2-1_8b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.3902 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.2758 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4426 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4711 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.1934 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json b/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json index a8db9512e..c3fbd28d8 100644 --- a/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json +++ b/data/reward-bench/internlm/internlm2-1_8b-reward/deec1e7c-0cb8-4e6f-b3ac-d37790b709f3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/internlm_internlm2-1_8b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8217 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9358 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6623 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8162 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8724 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json b/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json index 7c100da57..332851441 100644 --- a/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json +++ b/data/reward-bench/internlm/internlm2-20b-reward/e42a9986-4dcc-4017-be97-8135646c7424.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/internlm_internlm2-20b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9016 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9888 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7654 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8946 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9576 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json b/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json index be1a99b56..ceaeec27a 100644 --- a/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json +++ b/data/reward-bench/internlm/internlm2-20b-reward/ffc92063-606a-4f31-bfdd-5683aa748ccc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/internlm_internlm2-20b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5628 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5558 + }, + "source_data": { + "dataset_name": 
"RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3625 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5738 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6111 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7253 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5483 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json b/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json index 9bfc27874..3c136cd53 100644 --- a/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json +++ b/data/reward-bench/internlm/internlm2-7b-reward/23a5398c-0911-4a66-930d-abada12bf985.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/internlm_internlm2-7b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5335 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4211 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5628 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5164 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json 
b/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json index 36efccc5e..2273cb29a 100644 --- a/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json +++ b/data/reward-bench/internlm/internlm2-7b-reward/80b0bbcb-a57a-453c-8fff-502646520b1d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/internlm_internlm2-7b-reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8759 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9916 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6952 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8716 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9453 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json b/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json index 6343b6db0..090fb16d8 100644 --- a/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json +++ b/data/reward-bench/jondurbin/bagel-dpo-34b-v0.5/e383c939-b952-4fdd-94e3-eb3716691860.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/jondurbin_bagel-dpo-34b-v0.5/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7215 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9385 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5504 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6446 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8889 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4487 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" 
} } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json b/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json index d9c5f6c62..ed400926f 100644 --- a/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json +++ b/data/reward-bench/llm-blender/PairRM-hf/daf873f9-ab03-49df-96cb-a0f5a8613048.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/llm-blender_PairRM-hf/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6087 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9022 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5219 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.477 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4898 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6961 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json b/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json index c24b6696c..48058a174 100644 --- a/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json +++ b/data/reward-bench/mattshumer/Reflection-70B/f4cff132-3b2f-4e03-bb49-098b16d87cef.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/mattshumer_Reflection-70B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8422 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9749 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7061 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8318 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8562 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json b/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json index d392ad9e8..d29586253 100644 --- a/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json +++ b/data/reward-bench/meta-llama/Meta-Llama-3-70B-Instruct/f80685de-058c-4ab8-aa35-dc7321d1cea6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3-70B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7627 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9763 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5888 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7297 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7854 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7035 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json b/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json index 9fb252ef8..21b28918d 100644 --- a/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json +++ b/data/reward-bench/meta-llama/Meta-Llama-3-8B-Instruct/c8e4349d-a084-4eb5-990f-403ba930a9ad.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3-8B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.645 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8547 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4156 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 
0.6797 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6482 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6082 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json index 0b144abbe..2f6cdcbbc 100644 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json +++ b/data/reward-bench/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo/729ca9c0-0680-49f1-97b9-5581be17a352.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8412 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9721 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7456 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8715 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json index 3716eb41b..1181fd3d1 100644 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json +++ b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo/fdd4add5-b44d-46f9-8c98-da3120df4161.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-70B-Instruct-Turbo/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7808 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8757 + }, + "source_data": { + "dataset_name": "RewardBench", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6689 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7507 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.828 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json index 854546b01..2f41e3adc 100644 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json +++ b/data/reward-bench/meta-llama/Meta-Llama-3.1-70B-Instruct/6b5ef643-30dd-4381-b66f-e9ecd6b0d06e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-70B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8405 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9721 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7018 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8284 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8599 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json b/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json index 30a79283f..47e395707 100644 --- a/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json +++ b/data/reward-bench/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo/95271b8c-4135-48bf-bbad-ae94baa37640.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-llama_Meta-Llama-3.1-8B-Instruct-Turbo/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6565 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": 
{ "score": 0.8073 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4978 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6399 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.6811 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json b/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json index aa32c142b..5a185daab 100644 --- a/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json +++ b/data/reward-bench/meta-metrics/MetaMetrics-RM-v1.0/f437e790-efe1-4dc5-8ccc-5b0bfd800069.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/meta-metrics_MetaMetrics-RM-v1.0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9342 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9832 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.864 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9081 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9816 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json b/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json index 01de5c6fa..46baee169 100644 --- a/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json +++ b/data/reward-bench/mightbe/Better-PairRM/7d0f761a-2650-4029-b1e9-13af2f0cc69d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/mightbe_Better-PairRM/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.673 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9553 + }, + "source_data": { + "dataset_name": 
"RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3925 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8203 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4983 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.724 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json b/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json index 078f1bc19..83daa1359 100644 --- a/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json +++ b/data/reward-bench/mistralai/Mixtral-8x7B-Instruct-v0.1/49fc601e-4ac6-4672-a53d-0e89f19959c1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/mistralai_Mixtral-8x7B-Instruct-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7455 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6404 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7257 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7872 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5033 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json b/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json index e482bb5fe..366e80763 100644 --- a/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json +++ b/data/reward-bench/my_model/6195e81a-d5a5-40af-96f6-259252009ad7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/my_model_/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 
@@ }, "score_details": { "score": 0.5267 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4553 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5592 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4392 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.6532 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json index 661fe8b53..e2fe3a9e8 100644 --- a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json +++ b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/2dec0f50-d374-4af3-9d27-80fcf50dac2c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/nicolinho_QRM-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.7853 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3719 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6995 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9578 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.9535 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8321 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json index cb05973d0..d61d3e0af 100644 --- a/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json +++ 
b/data/reward-bench/nicolinho/QRM-Gemma-2-27B/96722888-0cc9-4dfd-b38d-91f4118c0be2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nicolinho_QRM-Gemma-2-27B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9444 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.9013 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.927 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9826 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json b/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json index 627fa84e2..52e654851 100644 --- a/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json +++ b/data/reward-bench/nicolinho/QRM-Llama3-8B/683abc2a-fce0-4d3d-bdcc-5cac2c76a46a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nicolinho_QRM-Llama3-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.911 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8114 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8986 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9758 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json index 182b54745..aef21f0ec 100644 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json +++ 
b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/121344ec-61ef-49c5-a74b-b86f605d513e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7074 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6653 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4062 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.612 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9467 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.8909 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7234 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json index ecf41624c..45f32ccdc 100644 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json +++ b/data/reward-bench/nicolinho/QRM-Llama3.1-8B-v2/8594f86b-a7f2-4046-a3a7-830d7ac20690.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B-v2/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9314 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8684 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9257 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9677 + }, + "source_data": { + "dataset_name": "RewardBench", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json b/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json index 7663fc8b5..d0517b2db 100644 --- a/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json +++ b/data/reward-bench/nicolinho/QRM-Llama3.1-8B/c0c5e5e1-801c-48fd-a994-a4a69c0b1213.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nicolinho_QRM-Llama3.1-8B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9306 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9441 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8969 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.923 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9583 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json b/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json index c6cfcb55e..639ea033b 100644 --- a/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json +++ b/data/reward-bench/nvidia/Llama-3.1-Nemotron-70B-Reward/0411ac30-1536-4639-8350-fc11d53298e3.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nvidia_Llama-3.1-Nemotron-70B-Reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.9411 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9749 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8575 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9514 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9807 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json b/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json index 4d74cb3a3..34cb0b116 100644 --- a/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json +++ b/data/reward-bench/nvidia/Llama3-70B-SteerLM-RM/92281e58-4160-4d76-9119-b38fb47ffd8f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nvidia_Llama3-70B-SteerLM-RM/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8877 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9134 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8026 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9284 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9064 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json b/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json index e9305d7aa..81a8ec028 100644 --- a/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json +++ b/data/reward-bench/nvidia/Nemotron-4-340B-Reward/43687871-2e19-4d2b-9754-1cb6527496c1.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/nvidia_Nemotron-4-340B-Reward/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.92 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9581 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.8706 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.9149 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9363 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json b/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json index a806badc9..53333618b 100644 --- a/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json +++ b/data/reward-bench/openai/gpt-3.5-turbo-0125/1debe1de-b394-4856-a946-9d14bd867bf6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openai_gpt-3.5-turbo-0125/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6534 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9218 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4452 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6547 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5912 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6548 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json b/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json index f7eab0bc7..24b1269d5 100644 --- a/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json +++ b/data/reward-bench/openai/gpt-4-0125-preview/80c589d2-c1eb-4dcf-8be8-042f4f66b7eb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openai_gpt-4-0125-preview/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8434 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7434 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { 
"score": 0.8692 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7085 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json b/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json index de6b17e12..8fbd65118 100644 --- a/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json +++ b/data/reward-bench/openai/gpt-4-turbo-2024-04-09/62478772-bb85-4d3f-a916-c3d17db3ee61.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openai_gpt-4-turbo-2024-04-09/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8395 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7544 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8757 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.827 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7363 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json b/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json index d66bbee74..7ad659dc4 100644 --- a/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json +++ b/data/reward-bench/openai/gpt-4.1-2025-04-14/a070bae2-c927-418b-91cc-161781c4f5b7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openai_gpt-4.1-2025-04-14/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7232 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8289 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3974 
+ }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6521 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8726 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7338 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.8542 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json b/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json index c6287632a..fe081bf47 100644 --- a/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json +++ b/data/reward-bench/openai/gpt-4.1-mini-2025-04-14/b884c919-a272-4f67-9a09-3d232f56d083.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openai_gpt-4.1-mini-2025-04-14/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6573 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6084 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4125 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7213 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7265 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7354 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.74 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json b/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json index b8c07d779..9236ca4a4 100644 --- a/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json +++ 
b/data/reward-bench/openai/gpt-4.1-nano-2025-04-14/deac33dd-187b-4406-a76a-b33caf417380.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openai_gpt-4.1-nano-2025-04-14/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4849 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4646 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2578 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5041 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7156 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.466 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5015 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json b/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json index 8d3b140c6..e598746ee 100644 --- a/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json +++ b/data/reward-bench/openai/gpt-4o-2024-05-13/185bd742-d7d4-4600-86bd-bcda75ed2ebc.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openai_gpt-4o-2024-05-13/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8327 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7039 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8649 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8487 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7262 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json b/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json index 23f51655d..92a4b0914 100644 --- a/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json +++ b/data/reward-bench/openai/gpt-4o-2024-08-06/901e4de6-3ef6-4c2a-873c-cdcc47201974.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openai_gpt-4o-2024-08-06/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8673 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9609 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.761 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8811 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8661 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json b/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json index e3cbd5105..44c5bcc27 100644 --- a/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json +++ b/data/reward-bench/openai/gpt-4o-2024-08-06/a051d5d6-18e6-483d-a000-4a52a06de676.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openai_gpt-4o-2024-08-06/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6493 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5684 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.623 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8619 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7293 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.7819 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json index bc9ec44b3..653bf1c3b 100644 --- a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json +++ b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/94d77182-8952-4a63-b02b-3d8bd8a8dead.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openai_gpt-4o-mini-2024-07-18/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8007 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6075 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8081 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8374 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json index 9e94f068b..4e0668e59 100644 --- a/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json +++ b/data/reward-bench/openai/gpt-4o-mini-2024-07-18/9a48d808-0280-4175-a28a-7e9ba8ac6deb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openai_gpt-4o-mini-2024-07-18/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5796 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4105 + }, + "source_data": { + "dataset_name": "RewardBench 2", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5191 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7414 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6962 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json b/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json index 6052e166d..f4c92b674 100644 --- a/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json +++ b/data/reward-bench/openbmb/Eurus-7b-kto/f0d9f57d-d552-44ea-a91c-751854133316.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openbmb_Eurus-7b-kto/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.69 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9525 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5373 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6054 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7467 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5261 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json b/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json index bbe1d3987..2ceca85c7 100644 --- a/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json +++ b/data/reward-bench/openbmb/Eurus-RM-7b/561cfba1-856d-4809-b5c7-41481735e1d6.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openbmb_Eurus-RM-7b/1766412838.146816", 
"retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5806 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.6 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5683 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6267 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7475 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.5972 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json b/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json index 29c8dd2fe..83d393244 100644 --- a/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json +++ b/data/reward-bench/openbmb/Eurus-RM-7b/995d1caf-b735-44dd-adff-875e3203aa46.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openbmb_Eurus-RM-7b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8159 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9804 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6557 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8135 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8633 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7172 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} 
diff --git a/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json b/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json index 896ebfae3..94f6c97ac 100644 --- a/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json +++ b/data/reward-bench/openbmb/MiniCPM-2B-dpo-fp32/81767043-23c2-4229-b3b5-1c24e470d52a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openbmb_MiniCPM-2B-dpo-fp32/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.673 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8911 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4934 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.573 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8233 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4958 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json b/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json index 3c1927c24..2c68b9ca5 100644 --- a/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json +++ b/data/reward-bench/openbmb/UltraRM-13b/4f6344bc-af30-46f9-b6f8-41ff925d064e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/openbmb_UltraRM-13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6903 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5548 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5986 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6244 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7294 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json b/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json index 4a8e3095f..0b4c10b89 100644 --- a/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json +++ b/data/reward-bench/openbmb/UltraRM-13b/abac8640-40be-4eb5-9035-2bf6fd436a7a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/openbmb_UltraRM-13b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4683 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5063 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3312 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5519 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5089 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.6081 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.3036 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json b/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json index b19fd4605..4c9d57828 100644 --- a/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json +++ b/data/reward-bench/opencompass/CompassJudger-1-1.5B-Instruct/6fd972ab-c45f-4ccd-a5cf-4aac5e703342.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/opencompass_CompassJudger-1-1.5B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7344 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 
0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4923 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7818 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.6999 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json b/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json index d6cbfe2cd..4299d154c 100644 --- a/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json +++ b/data/reward-bench/opencompass/CompassJudger-1-14B-Instruct/8eb1bcf2-a6bd-467c-bc37-090fdb7a9460.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/opencompass_CompassJudger-1-14B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8409 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9749 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6228 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8392 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9268 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json b/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json index a63b6c3a9..49134a927 100644 --- a/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json +++ b/data/reward-bench/opencompass/CompassJudger-1-32B-Instruct/5ad53725-ed5a-41f3-8ff6-7404f3f981db.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/opencompass_CompassJudger-1-32B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8522 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": 
"allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9804 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6513 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8527 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.9244 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json b/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json index 5e86fdf7e..145d5b3e9 100644 --- a/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json +++ b/data/reward-bench/opencompass/CompassJudger-1-7B-Instruct/ae2d05b4-5e80-4b00-af67-b94609b073eb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/opencompass_CompassJudger-1-7B-Instruct/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8317 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9777 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6096 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8446 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.8948 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json b/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json index 3165413d5..e934ef88d 100644 --- a/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json +++ b/data/reward-bench/prometheus-eval/prometheus-7b-v2.0/592f2811-c197-423e-89d4-e25ee5a324fb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/prometheus-eval_prometheus-7b-v2.0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7204 + }, + "source_data": { + "dataset_name": "RewardBench", + 
"source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8547 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4912 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7709 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.7648 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json b/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json index 21cf27bec..d2deb1a71 100644 --- a/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json +++ b/data/reward-bench/prometheus-eval/prometheus-8x7b-v2.0/17795e7b-e912-440f-a80e-63233d3b6d8c.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/prometheus-eval_prometheus-8x7b-v2.0/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7451 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9302 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4715 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8047 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.774 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json index 8570295e2..494a96669 100644 --- a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json +++ b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/375cf55f-64f6-42f6-a947-1487feffb196.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8338 + }, + "source_data": { + "dataset_name": "RewardBench", 
+ "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9944 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6513 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8676 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8644 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7492 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json index 4d7ceaaeb..8dad45261 100644 --- a/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json +++ b/data/reward-bench/sfairXC/FsfairX-LLaMA3-RM-v0.1/94d2eddd-f7db-4360-ac58-0af39ce66935.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/sfairXC_FsfairX-LLaMA3-RM-v0.1/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6292 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5916 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4188 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6284 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7667 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7051 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6647 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json b/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json index a33ed8633..74acfeed3 100644 --- 
a/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json +++ b/data/reward-bench/stabilityai/stable-code-instruct-3b/996ca604-e01c-4a95-9286-60b6dc04f67d.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/stabilityai_stable-code-instruct-3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6216 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5782 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5855 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6554 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7528 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4506 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json b/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json index 267923a0a..491861cbd 100644 --- a/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json +++ b/data/reward-bench/stabilityai/stablelm-2-12b-chat/b6f0089f-d04b-4bcd-be84-ce3bc0d6c2b9.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/stabilityai_stablelm-2-12b-chat/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7642 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5548 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7811 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.8945 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4839 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json b/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json index 96b9cccee..3406eee45 100644 --- a/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json +++ b/data/reward-bench/stabilityai/stablelm-2-zephyr-1_6b/83e15cba-4fec-48f2-9be4-78decbd96f66.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/stabilityai_stablelm-2-zephyr-1_6b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6574 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4671 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6027 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6784 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4868 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json b/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json index 86b5e6605..47f135b1d 100644 --- a/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json +++ b/data/reward-bench/stabilityai/stablelm-zephyr-3b/493617c0-37eb-4c83-b175-2507a3647b5e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/stabilityai_stablelm-zephyr-3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7146 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8631 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6009 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7405 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7573 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.5075 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json index 6959fb49d..47e44c557 100644 --- a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json +++ b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-large/97f494ce-3c9c-4a19-a237-d458be611a0a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/stanfordnlp_SteamSHP-flan-t5-large/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4962 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8575 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3311 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3743 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3563 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6273 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json index 05a9b1b28..6a0de9161 100644 --- a/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json +++ b/data/reward-bench/stanfordnlp/SteamSHP-flan-t5-xl/f8bf1e92-3cc3-4c7e-9770-485a3074e85f.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/stanfordnlp_SteamSHP-flan-t5-xl/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5135 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8547 + }, + "source_data": { + 
"dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3684 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3784 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3841 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6498 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json b/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json index 9bd7881e2..b2e8c1248 100644 --- a/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json +++ b/data/reward-bench/unknown/Cohere March 2024/5bf73fba-520f-4a2f-9296-8240847eb8ec.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Cohere March 2024/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8511 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9469 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6513 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.877 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9817 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7458 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json b/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json index 0945d9e1d..a50e32313 100644 --- a/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json +++ b/data/reward-bench/unknown/Cohere May 2024/3dd2c89f-64f5-4bbc-a621-791a9f0538b2.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/Cohere May 2024/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": 
"documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.8816 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.7127 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.923 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.9768 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.782 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json b/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json index 7a9d537f7..dd83d6018 100644 --- a/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json +++ b/data/reward-bench/unknown/gemini-1.5-flash-8b/ef987556-7277-48d8-ac07-532586773a3a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/gemini-1.5-flash-8b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7601 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9441 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5987 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.7399 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,7 +101,12 @@ }, "score_details": { "score": 0.7575 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json b/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json index 959f86ef9..5df4ce7b8 100644 --- a/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json +++ b/data/reward-bench/upstage/SOLAR-10.7B-Instruct-v1.0/add7eddb-7a8b-4c78-9864-c4316a97ce5e.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/upstage_SOLAR-10.7B-Instruct-v1.0/1766412838.146816", "retrieved_timestamp": 
"1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7391 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8156 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6864 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8514 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7252 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.4949 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json b/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json index 1ed12c67e..78244f3ff 100644 --- a/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json +++ b/data/reward-bench/wenbopan/Faro-Yi-9B-DPO/caf02954-1eed-44eb-b5f4-df47c90828d7.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/wenbopan_Faro-Yi-9B-DPO/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6461 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9218 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5307 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5514 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.5839 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6395 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json b/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json index b2a906172..8856c5e7a 100644 --- 
a/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json +++ b/data/reward-bench/weqweasdas/RM-Gemma-2B/00798930-daa2-4e79-82c6-2cccf1c3a0cb.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-2B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6549 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9441 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4079 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4986 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7637 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6652 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json b/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json index 2ca9f1e1b..c3e2d4a3c 100644 --- a/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json +++ b/data/reward-bench/weqweasdas/RM-Gemma-2B/71658cf8-0189-49dc-847f-b9a9b5faee4a.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-2B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.3057 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3705 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.2812 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4317 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3311 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2343 + }, + "source_data": { + 
"dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.1851 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json b/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json index e041d4d42..81934bf6a 100644 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json +++ b/data/reward-bench/weqweasdas/RM-Gemma-7B-4096/3d506b91-5b0d-47e3-a3a0-bc09808bf5b5.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-7B-4096/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6922 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9497 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.5022 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5608 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7511 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7024 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json b/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json index 5c579948e..02ba525c2 100644 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json +++ b/data/reward-bench/weqweasdas/RM-Gemma-7B/04c71231-2025-4e1a-b7ed-56b245868089.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/weqweasdas_RM-Gemma-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.6967 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9693 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.4978 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": 
"hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5784 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7362 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.7069 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json b/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json index 07fd2fbc9..ed03af39d 100644 --- a/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json +++ b/data/reward-bench/weqweasdas/RM-Gemma-7B/08b2edd0-f8e9-47cd-b19d-53fdc7209917.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/weqweasdas_RM-Gemma-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.4826 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.4926 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.6066 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.4822 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.497 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.4232 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json b/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json index 6afc5be13..7abde633d 100644 --- a/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json +++ b/data/reward-bench/weqweasdas/RM-Mistral-7B/79a43841-4032-4a20-8b5a-83b4b446d107.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/weqweasdas_RM-Mistral-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": 
{ "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.7982 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.9665 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.6053 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.8703 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.7736 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.753 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json b/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json index 7abeeaa13..d53f1986e 100644 --- a/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json +++ b/data/reward-bench/weqweasdas/RM-Mistral-7B/a2c16ab8-1098-490a-8d0a-392d835427e0.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/weqweasdas_RM-Mistral-7B/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.596 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.5937 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3438 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.5956 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.6911 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.7293 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.6226 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json 
b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json index 765de8cd1..dbe32c629 100644 --- a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json +++ b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/0aa12860-7ebe-49c2-a5af-1926d23e34f8.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/spaces/allenai/reward-bench" - ], "source_metadata": { "source_name": "RewardBench", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.5027 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.8184 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.3728 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.4149 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.3281 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } }, { @@ -97,7 +119,12 @@ }, "score_details": { "score": 0.6564 + }, + "source_data": { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench" } } ] -} \ No newline at end of file +} diff --git a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json index ca35ed864..7e050faee 100644 --- a/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json +++ b/data/reward-bench/weqweasdas/hh_rlhf_rm_open_llama_3b/796d3ec1-9c26-4ead-87cb-4eb866209120.json @@ -1,10 +1,7 @@ { - "schema_version": "0.1.0", + "schema_version": "0.2.0", "evaluation_id": "reward-bench-2/weqweasdas_hh_rlhf_rm_open_llama_3b/1766412838.146816", "retrieved_timestamp": "1766412838.146816", - "source_data": [ - "https://huggingface.co/datasets/allenai/reward-bench-2-results" - ], "source_metadata": { "source_name": "RewardBench 2", "source_type": "documentation", @@ -32,6 +29,11 @@ }, "score_details": { "score": 0.2498 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -45,6 +47,11 @@ }, "score_details": { "score": 0.3642 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -58,6 +65,11 @@ }, "score_details": { "score": 0.275 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -71,6 +83,11 @@ }, "score_details": { "score": 0.3497 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -84,6 +101,11 @@ }, "score_details": { "score": 0.24 
+ }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -97,6 +119,11 @@ }, "score_details": { "score": 0.2384 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } }, { @@ -110,7 +137,12 @@ }, "score_details": { "score": 0.0315 + }, + "source_data": { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results" } } ] -} \ No newline at end of file +} diff --git a/scripts/rewardbench/adapter.py b/scripts/rewardbench/adapter.py index eb3133905..e450b3835 100644 --- a/scripts/rewardbench/adapter.py +++ b/scripts/rewardbench/adapter.py @@ -21,12 +21,15 @@ sys.path.insert(0, str(Path(__file__).parent.parent)) from utils import ( + SCHEMA_VERSION, fetch_csv, fetch_json, get_developer, make_evaluation_result, make_source_metadata, make_model_info, + make_source_data_hf, + make_source_data_url, save_evaluation_log, ) @@ -38,6 +41,18 @@ OUTPUT_DIR = "data/reward-bench" +# RewardBench v1 source data (shared across all v1 evaluation results) +V1_SOURCE_DATA = make_source_data_hf( + dataset_name="RewardBench", + hf_repo="allenai/reward-bench", +) + +# RewardBench v2 source data (shared across all v2 evaluation results) +V2_SOURCE_DATA = make_source_data_hf( + dataset_name="RewardBench 2", + hf_repo="allenai/reward-bench-2-results", +) + # RewardBench v1 metrics with descriptions V1_METRICS = { "Score": "Overall RewardBench Score", @@ -111,6 +126,7 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: name=metric_name, score=score, description=description, + source_data=V1_SOURCE_DATA, ) ) @@ -127,10 +143,9 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: # Build evaluation log evaluation_id = f"reward-bench/{model_info.id.replace('/', '_')}/{retrieved_timestamp}" eval_log = EvaluationLog( - schema_version="0.1.0", + schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, - source_data=["https://huggingface.co/spaces/allenai/reward-bench"], source_metadata=make_source_metadata( source_name="RewardBench", organization_name="Allen Institute for AI", @@ -208,6 +223,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: name=metric_name, score=score, description=description, + source_data=V2_SOURCE_DATA, ) ) except (ValueError, TypeError): @@ -225,6 +241,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: name="Score", score=mean_score, description="Overall RewardBench 2 Score (mean of all metrics)", + source_data=V2_SOURCE_DATA, ), ) @@ -238,10 +255,9 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: # Build evaluation log evaluation_id = f"reward-bench-2/{model_info.id.replace('/', '_')}/{retrieved_timestamp}" eval_log = EvaluationLog( - schema_version="0.1.0", + schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, - source_data=["https://huggingface.co/datasets/allenai/reward-bench-2-results"], source_metadata=make_source_metadata( source_name="RewardBench 2", organization_name="Allen Institute for AI", diff --git a/scripts/rewardbench/migrate_to_v020.py b/scripts/rewardbench/migrate_to_v020.py new file mode 100644 index 000000000..e36880a16 --- /dev/null +++ b/scripts/rewardbench/migrate_to_v020.py @@ -0,0 +1,113 @@ +""" +Migration script to update reward-bench JSON files from schema v0.1.0 to v0.2.0. 
+ +Key changes: +- schema_version: "0.1.0" -> "0.2.0" +- Remove top-level "source_data" field +- Add "source_data" to each evaluation result item +- Remove "inference_platform": "unknown" from model_info (now optional) + +For RewardBench v1 results (evaluation_id starts with "reward-bench/"): + source_data = {"dataset_name": "RewardBench", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench"} + +For RewardBench v2 results (evaluation_id starts with "reward-bench-2/"): + source_data = {"dataset_name": "RewardBench 2", "source_type": "hf_dataset", "hf_repo": "allenai/reward-bench-2-results"} + +Usage: + python -m scripts.rewardbench.migrate_to_v020 +""" + +import json +from pathlib import Path + + +DATA_DIR = Path("data/reward-bench") + +V1_SOURCE_DATA = { + "dataset_name": "RewardBench", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench", +} + +V2_SOURCE_DATA = { + "dataset_name": "RewardBench 2", + "source_type": "hf_dataset", + "hf_repo": "allenai/reward-bench-2-results", +} + + +def migrate_file(filepath: Path) -> bool: + """ + Migrate a single JSON file from v0.1.0 to v0.2.0. + + Returns True if the file was modified, False if it was already up to date. + """ + with open(filepath, "r") as f: + data = json.load(f) + + # Skip files that are already v0.2.0 + if data.get("schema_version") == "0.2.0": + return False + + # Determine source_data based on evaluation_id + evaluation_id = data.get("evaluation_id", "") + if evaluation_id.startswith("reward-bench-2/"): + source_data = V2_SOURCE_DATA + else: + source_data = V1_SOURCE_DATA + + # 1. Update schema_version + data["schema_version"] = "0.2.0" + + # 2. Remove top-level source_data + data.pop("source_data", None) + + # 3. Add source_data to each evaluation result + for result in data.get("evaluation_results", []): + if "source_data" not in result: + result["source_data"] = source_data + + # 4. 
Clean up model_info: remove inference_platform if "unknown" + model_info = data.get("model_info", {}) + if model_info.get("inference_platform") == "unknown": + del model_info["inference_platform"] + + # Write back + with open(filepath, "w") as f: + json.dump(data, f, indent=2) + f.write("\n") + + return True + + +def main(): + """Migrate all reward-bench JSON files to v0.2.0.""" + if not DATA_DIR.exists(): + print(f"Error: {DATA_DIR} does not exist") + return + + json_files = sorted(DATA_DIR.rglob("*.json")) + print(f"Found {len(json_files)} JSON files in {DATA_DIR}") + + migrated = 0 + skipped = 0 + errors = 0 + + for filepath in json_files: + try: + if migrate_file(filepath): + migrated += 1 + else: + skipped += 1 + except Exception as e: + print(f" Error migrating {filepath}: {e}") + errors += 1 + + print(f"\nMigration complete:") + print(f" Migrated: {migrated}") + print(f" Skipped (already v0.2.0): {skipped}") + print(f" Errors: {errors}") + + +if __name__ == "__main__": + main() diff --git a/scripts/utils/__init__.py b/scripts/utils/__init__.py index 0ce4c3702..08b5f76fb 100644 --- a/scripts/utils/__init__.py +++ b/scripts/utils/__init__.py @@ -4,11 +4,15 @@ from .fetch import fetch_json, fetch_csv, FetchError from .io import save_evaluation_log, generate_output_path, sanitize_filename from .schema import ( + SCHEMA_VERSION, make_metric_config, make_evaluation_result, make_source_metadata, make_model_info, make_evaluation_log, + make_source_data_url, + make_source_data_hf, + make_source_data_private, ) __all__ = [ @@ -24,9 +28,13 @@ "generate_output_path", "sanitize_filename", # schema.py + "SCHEMA_VERSION", "make_metric_config", "make_evaluation_result", "make_source_metadata", "make_model_info", "make_evaluation_log", + "make_source_data_url", + "make_source_data_hf", + "make_source_data_private", ] diff --git a/scripts/utils/schema.py b/scripts/utils/schema.py index 08d66e4ac..870ba208a 100644 --- a/scripts/utils/schema.py +++ b/scripts/utils/schema.py @@ -1,7 +1,7 @@ """Schema construction helpers for building evaluation logs.""" import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Union from eval_types import ( EvaluationLog, @@ -11,11 +11,88 @@ ModelInfo, ScoreDetails, ScoreType, + SourceDataHf, + SourceDataPrivate, + SourceDataUrl, SourceMetadata, ) from .developer import get_developer, get_model_id +# Schema version constant +SCHEMA_VERSION = "0.2.0" + +# Type alias for source data variants +SourceData = Union[SourceDataUrl, SourceDataHf, SourceDataPrivate] + + +def make_source_data_url( + dataset_name: str, + url: List[str], + **additional_details: Any, +) -> SourceDataUrl: + """ + Create a SourceDataUrl for URL-based evaluation data sources. + + Args: + dataset_name: Name of the source dataset + url: List of URL(s) for the source of the evaluation data + **additional_details: Additional metadata key-value pairs + + Returns: + Configured SourceDataUrl instance + """ + return SourceDataUrl( + dataset_name=dataset_name, + source_type="url", + url=url, + ) + + +def make_source_data_hf( + dataset_name: str, + hf_repo: Optional[str] = None, + hf_split: Optional[str] = None, + samples_number: Optional[int] = None, +) -> SourceDataHf: + """ + Create a SourceDataHf for HuggingFace dataset sources. 
+ + Args: + dataset_name: Name of the source dataset + hf_repo: HuggingFace repository identifier + hf_split: Dataset split (train, val, or test) + samples_number: Number of samples in the dataset + + Returns: + Configured SourceDataHf instance + """ + return SourceDataHf( + dataset_name=dataset_name, + source_type="hf_dataset", + hf_repo=hf_repo, + hf_split=hf_split, + samples_number=samples_number, + ) + + +def make_source_data_private( + dataset_name: str, +) -> SourceDataPrivate: + """ + Create a SourceDataPrivate for private/custom dataset sources. + + Args: + dataset_name: Name of the source dataset + + Returns: + Configured SourceDataPrivate instance + """ + return SourceDataPrivate( + dataset_name=dataset_name, + source_type="other", + ) + def make_metric_config( description: str, @@ -64,6 +141,7 @@ def make_evaluation_result( name: str, score: float, description: str, + source_data: SourceData, lower_is_better: bool = False, score_type: ScoreType = ScoreType.continuous, min_score: float = 0.0, @@ -80,6 +158,7 @@ def make_evaluation_result( name: Name of the evaluation (e.g., "MMLU", "GSM8K") score: The score value description: Human-readable description of what this measures + source_data: Source dataset information (URL, HuggingFace, or private) lower_is_better: Whether lower scores are better score_type: Type of score min_score: Minimum possible score @@ -92,6 +171,7 @@ def make_evaluation_result( """ return EvaluationResult( evaluation_name=name, + source_data=source_data, metric_config=make_metric_config( description=description, lower_is_better=lower_is_better, @@ -139,7 +219,7 @@ def make_source_metadata( def make_model_info( model_name: str, developer: Optional[str] = None, - inference_platform: str = "unknown", + inference_platform: Optional[str] = None, additional_details: Optional[Dict[str, Any]] = None, ) -> ModelInfo: """ @@ -150,7 +230,7 @@ def make_model_info( Args: model_name: Name of the model developer: Optional developer override - inference_platform: Platform used for inference + inference_platform: Optional platform used for inference additional_details: Extra model metadata Returns: @@ -172,13 +252,12 @@ def make_evaluation_log( source_name: str, model_name: str, evaluation_results: List[EvaluationResult], - source_data: List[str], organization_name: str, source_type: str = "documentation", evaluator_relationship: EvaluatorRelationship = EvaluatorRelationship.third_party, organization_url: Optional[str] = None, developer: Optional[str] = None, - inference_platform: str = "unknown", + inference_platform: Optional[str] = None, model_additional_details: Optional[Dict[str, Any]] = None, retrieved_timestamp: Optional[str] = None, ) -> EvaluationLog: @@ -190,14 +269,13 @@ def make_evaluation_log( Args: source_name: Name of the evaluation source model_name: Name of the model being evaluated - evaluation_results: List of evaluation results - source_data: URLs or dataset info for the source data + evaluation_results: List of evaluation results (each must include source_data) organization_name: Organization providing the evaluation source_type: Either "documentation" or "evaluation_run" evaluator_relationship: Relationship to model developer organization_url: Optional URL for the organization developer: Optional developer override - inference_platform: Platform used for inference + inference_platform: Optional platform used for inference model_additional_details: Extra model metadata retrieved_timestamp: Optional timestamp override @@ -213,10 +291,9 @@ def 
make_evaluation_log(
     evaluation_id = f"{source_name}/{sanitized_model_id}/{timestamp}"
 
     return EvaluationLog(
-        schema_version="0.1.0",
+        schema_version=SCHEMA_VERSION,
         evaluation_id=evaluation_id,
         retrieved_timestamp=timestamp,
-        source_data=source_data,
         source_metadata=make_source_metadata(
             source_name=source_name,
             organization_name=organization_name,

From 83a3a4fd70d95421662af96a0a92a40725074dfd Mon Sep 17 00:00:00 2001
From: Asaf Yehudai
Date: Wed, 11 Feb 2026 10:47:42 +0200
Subject: [PATCH 2/2] Update RewardBench to v0.2

Update RewardBench to v0.2, make it self-contained, and check that it passes
the schema tests.
---
 scripts/rewardbench/adapter.py | 130 ++++++++++++++++++++++++---------
 scripts/utils/__init__.py      |   8 --
 scripts/utils/schema.py        |  97 +++---------------------
 3 files changed, 107 insertions(+), 128 deletions(-)

diff --git a/scripts/rewardbench/adapter.py b/scripts/rewardbench/adapter.py
index e450b3835..09ceef8d1 100644
--- a/scripts/rewardbench/adapter.py
+++ b/scripts/rewardbench/adapter.py
@@ -12,47 +12,74 @@
 import re
 import time
-from typing import List, Optional
-
-from eval_types import EvaluationLog, EvaluationResult, EvaluatorRelationship
+import uuid
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from eval_types import (
+    EvaluationLog,
+    EvaluationResult,
+    EvaluatorRelationship,
+    MetricConfig,
+    ModelInfo,
+    ScoreDetails,
+    ScoreType,
+    SourceDataHf,
+    SourceMetadata,
+)
 
 import sys
-from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 
 from utils import (
-    SCHEMA_VERSION,
     fetch_csv,
     fetch_json,
     get_developer,
-    make_evaluation_result,
-    make_source_metadata,
-    make_model_info,
-    make_source_data_hf,
-    make_source_data_url,
-    save_evaluation_log,
+    get_model_id,
+    sanitize_filename,
 )
 
+# Schema version
+SCHEMA_VERSION = "0.2.0"
 
 # Data source URLs
 REWARDBENCH_V1_CSV = "https://huggingface.co/spaces/allenai/reward-bench/resolve/main/leaderboard/final-rbv1-data.csv"
 REWARDBENCH_V2_TREE_API = "https://huggingface.co/api/datasets/allenai/reward-bench-2-results/tree/main/eval-set"
 REWARDBENCH_V2_FILE_BASE = "https://huggingface.co/datasets/allenai/reward-bench-2-results/resolve/main/eval-set"
 
-OUTPUT_DIR = "data/reward-bench"
+OUTPUT_DIR = Path("data/reward-bench")
 
 # RewardBench v1 source data (shared across all v1 evaluation results)
-V1_SOURCE_DATA = make_source_data_hf(
+V1_SOURCE_DATA = SourceDataHf(
     dataset_name="RewardBench",
+    source_type="hf_dataset",
     hf_repo="allenai/reward-bench",
 )
 
 # RewardBench v2 source data (shared across all v2 evaluation results)
-V2_SOURCE_DATA = make_source_data_hf(
+V2_SOURCE_DATA = SourceDataHf(
     dataset_name="RewardBench 2",
+    source_type="hf_dataset",
     hf_repo="allenai/reward-bench-2-results",
 )
 
+# Source metadata (shared)
+V1_SOURCE_METADATA = SourceMetadata(
+    source_name="RewardBench",
+    source_type="documentation",
+    source_organization_name="Allen Institute for AI",
+    source_organization_url="https://allenai.org",
+    evaluator_relationship=EvaluatorRelationship.third_party,
+)
+
+V2_SOURCE_METADATA = SourceMetadata(
+    source_name="RewardBench 2",
+    source_type="documentation",
+    source_organization_name="Allen Institute for AI",
+    source_organization_url="https://allenai.org",
+    evaluator_relationship=EvaluatorRelationship.third_party,
+)
+
 # RewardBench v1 metrics with descriptions
 V1_METRICS = {
     "Score": "Overall RewardBench Score",
@@ -74,6 +101,53 @@
 ]
 
 
+def _make_eval_result(
+    name: str,
+    score: float,
+    description: str,
+    source_data: SourceDataHf,
+) ->
EvaluationResult: + """Create an EvaluationResult for a continuous 0-1 metric.""" + return EvaluationResult( + evaluation_name=name, + source_data=source_data, + metric_config=MetricConfig( + evaluation_description=description, + lower_is_better=False, + score_type=ScoreType.continuous, + min_score=0.0, + max_score=1.0, + ), + score_details=ScoreDetails(score=round(score, 4)), + ) + + +def _make_model_info( + model_name: str, + developer: str, + additional_details: Optional[Dict[str, Any]] = None, +) -> ModelInfo: + """Create ModelInfo without setting inference_platform.""" + model_id = get_model_id(model_name, developer) + return ModelInfo( + name=model_name, + id=model_id, + developer=developer, + additional_details=additional_details, + ) + + +def _save_eval_log(eval_log: EvaluationLog, developer: str, model: str) -> Path: + """Save an evaluation log to the standard directory structure.""" + dir_path = OUTPUT_DIR / sanitize_filename(developer) / sanitize_filename(model) + dir_path.mkdir(parents=True, exist_ok=True) + + filepath = dir_path / f"{uuid.uuid4()}.json" + json_str = eval_log.model_dump_json(indent=2, exclude_none=True) + filepath.write_text(json_str) + return filepath + + def extract_model_name_from_html(html_string: str) -> str: """Extract the model name from an HTML anchor tag.""" pattern = r">([^<]+)<" @@ -122,7 +196,7 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: score = parse_score(row.get(metric_name, "")) if score is not None: eval_results.append( - make_evaluation_result( + _make_eval_result( name=metric_name, score=score, description=description, @@ -134,7 +208,7 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: continue # Build model info - model_info = make_model_info( + model_info = _make_model_info( model_name=model_name, developer=developer, additional_details={"model_type": model_type} if model_type else None, @@ -146,12 +220,7 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, - source_metadata=make_source_metadata( - source_name="RewardBench", - organization_name="Allen Institute for AI", - organization_url="https://allenai.org", - evaluator_relationship=EvaluatorRelationship.third_party, - ), + source_metadata=V1_SOURCE_METADATA, model_info=model_info, evaluation_results=eval_results, ) @@ -162,7 +231,7 @@ def fetch_rewardbench_v1(retrieved_timestamp: str) -> int: else: dev, model = "unknown", model_info.id - filepath = save_evaluation_log(eval_log, OUTPUT_DIR, dev, model) + filepath = _save_eval_log(eval_log, dev, model) print(f"Saved: {filepath}") count += 1 @@ -219,7 +288,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: score = float(model_data[metric_name]) scores_for_average.append(score) eval_results.append( - make_evaluation_result( + _make_eval_result( name=metric_name, score=score, description=description, @@ -237,7 +306,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: mean_score = sum(scores_for_average) / len(scores_for_average) eval_results.insert( 0, - make_evaluation_result( + _make_eval_result( name="Score", score=mean_score, description="Overall RewardBench 2 Score (mean of all metrics)", @@ -246,7 +315,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: ) # Build model info - model_info = make_model_info( + model_info = _make_model_info( model_name=model_name, developer=developer, additional_details={"model_type": model_type} if model_type else None, @@ -258,12 
+327,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: schema_version=SCHEMA_VERSION, evaluation_id=evaluation_id, retrieved_timestamp=retrieved_timestamp, - source_metadata=make_source_metadata( - source_name="RewardBench 2", - organization_name="Allen Institute for AI", - organization_url="https://allenai.org", - evaluator_relationship=EvaluatorRelationship.third_party, - ), + source_metadata=V2_SOURCE_METADATA, model_info=model_info, evaluation_results=eval_results, ) @@ -274,7 +338,7 @@ def fetch_rewardbench_v2(retrieved_timestamp: str) -> int: else: dev, model = "unknown", model_info.id - filepath = save_evaluation_log(eval_log, OUTPUT_DIR, dev, model) + filepath = _save_eval_log(eval_log, dev, model) print(f" Saved: {filepath}") count += 1 diff --git a/scripts/utils/__init__.py b/scripts/utils/__init__.py index 08b5f76fb..0ce4c3702 100644 --- a/scripts/utils/__init__.py +++ b/scripts/utils/__init__.py @@ -4,15 +4,11 @@ from .fetch import fetch_json, fetch_csv, FetchError from .io import save_evaluation_log, generate_output_path, sanitize_filename from .schema import ( - SCHEMA_VERSION, make_metric_config, make_evaluation_result, make_source_metadata, make_model_info, make_evaluation_log, - make_source_data_url, - make_source_data_hf, - make_source_data_private, ) __all__ = [ @@ -28,13 +24,9 @@ "generate_output_path", "sanitize_filename", # schema.py - "SCHEMA_VERSION", "make_metric_config", "make_evaluation_result", "make_source_metadata", "make_model_info", "make_evaluation_log", - "make_source_data_url", - "make_source_data_hf", - "make_source_data_private", ] diff --git a/scripts/utils/schema.py b/scripts/utils/schema.py index 870ba208a..08d66e4ac 100644 --- a/scripts/utils/schema.py +++ b/scripts/utils/schema.py @@ -1,7 +1,7 @@ """Schema construction helpers for building evaluation logs.""" import time -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional from eval_types import ( EvaluationLog, @@ -11,88 +11,11 @@ ModelInfo, ScoreDetails, ScoreType, - SourceDataHf, - SourceDataPrivate, - SourceDataUrl, SourceMetadata, ) from .developer import get_developer, get_model_id -# Schema version constant -SCHEMA_VERSION = "0.2.0" - -# Type alias for source data variants -SourceData = Union[SourceDataUrl, SourceDataHf, SourceDataPrivate] - - -def make_source_data_url( - dataset_name: str, - url: List[str], - **additional_details: Any, -) -> SourceDataUrl: - """ - Create a SourceDataUrl for URL-based evaluation data sources. - - Args: - dataset_name: Name of the source dataset - url: List of URL(s) for the source of the evaluation data - **additional_details: Additional metadata key-value pairs - - Returns: - Configured SourceDataUrl instance - """ - return SourceDataUrl( - dataset_name=dataset_name, - source_type="url", - url=url, - ) - - -def make_source_data_hf( - dataset_name: str, - hf_repo: Optional[str] = None, - hf_split: Optional[str] = None, - samples_number: Optional[int] = None, -) -> SourceDataHf: - """ - Create a SourceDataHf for HuggingFace dataset sources. 
- - Args: - dataset_name: Name of the source dataset - hf_repo: HuggingFace repository identifier - hf_split: Dataset split (train, val, or test) - samples_number: Number of samples in the dataset - - Returns: - Configured SourceDataHf instance - """ - return SourceDataHf( - dataset_name=dataset_name, - source_type="hf_dataset", - hf_repo=hf_repo, - hf_split=hf_split, - samples_number=samples_number, - ) - - -def make_source_data_private( - dataset_name: str, -) -> SourceDataPrivate: - """ - Create a SourceDataPrivate for private/custom dataset sources. - - Args: - dataset_name: Name of the source dataset - - Returns: - Configured SourceDataPrivate instance - """ - return SourceDataPrivate( - dataset_name=dataset_name, - source_type="other", - ) - def make_metric_config( description: str, @@ -141,7 +64,6 @@ def make_evaluation_result( name: str, score: float, description: str, - source_data: SourceData, lower_is_better: bool = False, score_type: ScoreType = ScoreType.continuous, min_score: float = 0.0, @@ -158,7 +80,6 @@ def make_evaluation_result( name: Name of the evaluation (e.g., "MMLU", "GSM8K") score: The score value description: Human-readable description of what this measures - source_data: Source dataset information (URL, HuggingFace, or private) lower_is_better: Whether lower scores are better score_type: Type of score min_score: Minimum possible score @@ -171,7 +92,6 @@ def make_evaluation_result( """ return EvaluationResult( evaluation_name=name, - source_data=source_data, metric_config=make_metric_config( description=description, lower_is_better=lower_is_better, @@ -219,7 +139,7 @@ def make_source_metadata( def make_model_info( model_name: str, developer: Optional[str] = None, - inference_platform: Optional[str] = None, + inference_platform: str = "unknown", additional_details: Optional[Dict[str, Any]] = None, ) -> ModelInfo: """ @@ -230,7 +150,7 @@ def make_model_info( Args: model_name: Name of the model developer: Optional developer override - inference_platform: Optional platform used for inference + inference_platform: Platform used for inference additional_details: Extra model metadata Returns: @@ -252,12 +172,13 @@ def make_evaluation_log( source_name: str, model_name: str, evaluation_results: List[EvaluationResult], + source_data: List[str], organization_name: str, source_type: str = "documentation", evaluator_relationship: EvaluatorRelationship = EvaluatorRelationship.third_party, organization_url: Optional[str] = None, developer: Optional[str] = None, - inference_platform: Optional[str] = None, + inference_platform: str = "unknown", model_additional_details: Optional[Dict[str, Any]] = None, retrieved_timestamp: Optional[str] = None, ) -> EvaluationLog: @@ -269,13 +190,14 @@ def make_evaluation_log( Args: source_name: Name of the evaluation source model_name: Name of the model being evaluated - evaluation_results: List of evaluation results (each must include source_data) + evaluation_results: List of evaluation results + source_data: URLs or dataset info for the source data organization_name: Organization providing the evaluation source_type: Either "documentation" or "evaluation_run" evaluator_relationship: Relationship to model developer organization_url: Optional URL for the organization developer: Optional developer override - inference_platform: Optional platform used for inference + inference_platform: Platform used for inference model_additional_details: Extra model metadata retrieved_timestamp: Optional timestamp override @@ -291,9 +213,10 @@ def 
make_evaluation_log( evaluation_id = f"{source_name}/{sanitized_model_id}/{timestamp}" return EvaluationLog( - schema_version=SCHEMA_VERSION, + schema_version="0.1.0", evaluation_id=evaluation_id, retrieved_timestamp=timestamp, + source_data=source_data, source_metadata=make_source_metadata( source_name=source_name, organization_name=organization_name,
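
Reviewer note (appendix; not part of the patch): migrate_to_v020.py establishes four invariants per file — schema_version "0.2.0", no top-level "source_data", a "source_data" object on every entry in "evaluation_results", and no "unknown" placeholder for inference_platform. The stdlib-only sketch below re-checks them independently of the adapter; it assumes only the data/reward-bench layout used throughout the patch, and check_file is an illustrative helper, not an API from this repo.

    import json
    from pathlib import Path
    from typing import List

    DATA_DIR = Path("data/reward-bench")


    def check_file(filepath: Path) -> List[str]:
        """Return human-readable violations of the v0.2.0 shape in one log."""
        data = json.loads(filepath.read_text())
        problems = []
        # Invariant 1: schema_version was bumped.
        if data.get("schema_version") != "0.2.0":
            problems.append("schema_version is not 0.2.0")
        # Invariant 2: the top-level source_data field was removed.
        if "source_data" in data:
            problems.append("top-level source_data still present")
        # Invariant 3: every evaluation result carries its own source_data.
        for result in data.get("evaluation_results", []):
            if "source_data" not in result:
                name = result.get("evaluation_name", "<unnamed>")
                problems.append(f"result {name!r} lacks source_data")
        # Invariant 4: the "unknown" inference_platform placeholder is gone.
        if data.get("model_info", {}).get("inference_platform") == "unknown":
            problems.append('model_info still carries inference_platform="unknown"')
        return problems


    for path in sorted(DATA_DIR.rglob("*.json")):
        for problem in check_file(path):
            print(f"{path}: {problem}")

A clean run prints nothing; any line of output names a file that the migration (or a later hand edit) left on the v0.1.0 shape.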