From 6221e2f8e79d1596d403d7e168d552df19e668b1 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Thu, 11 Sep 2025 14:11:51 +0300 Subject: [PATCH 1/5] Initial version of the safety benchmark, plus various fixes to the subsets. Signed-off-by: Jonathan Bnayahu --- prepare/benchmarks/safety.py | 14 +++++++++++++ prepare/cards/attaq.py | 5 +++-- prepare/cards/safety/airbench2024.py | 1 + prepare/cards/safety/mlcommons_ailuminate.py | 4 +++- src/unitxt/catalog/benchmarks/safety.json | 21 +++++++++++++++++++ src/unitxt/catalog/cards/attaq.json | 13 ++++++------ .../cards/safety/mlcommons_ailuminate.json | 2 +- src/unitxt/processors.py | 2 +- 8 files changed, 51 insertions(+), 11 deletions(-) create mode 100644 prepare/benchmarks/safety.py create mode 100644 src/unitxt/catalog/benchmarks/safety.json diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py new file mode 100644 index 0000000000..fbb70d35df --- /dev/null +++ b/prepare/benchmarks/safety.py @@ -0,0 +1,14 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +benchmark = Benchmark( + subsets={ + "attaq": DatasetRecipe(card="cards.attaq"), + "provoq": DatasetRecipe(card="cards.safety.provoq"), + "airbench": DatasetRecipe(card="cards.safety.airbench2024"), + "ailuminate": DatasetRecipe(card="cards.safety.mlcommons_ailuminate"), + } +) + +add_to_catalog(benchmark, "benchmarks.safety", overwrite=True) diff --git a/prepare/cards/attaq.py b/prepare/cards/attaq.py index b042f10b33..8068c0f22c 100644 --- a/prepare/cards/attaq.py +++ b/prepare/cards/attaq.py @@ -17,8 +17,9 @@ Shuffle(page_size=2800), ], task=Task( - input_fields=["input"], - reference_fields=["label"], + input_fields={"input": str}, + reference_fields={"label": str}, + prediction_type=str, metrics=["metrics.safety_metric"], ), templates=[ diff --git a/prepare/cards/safety/airbench2024.py b/prepare/cards/safety/airbench2024.py index 2d866dca3b..52b900291e 100644 --- a/prepare/cards/safety/airbench2024.py +++ b/prepare/cards/safety/airbench2024.py @@ -43,6 +43,7 @@ }, reference_fields={}, prediction_type=str, + # TODO: The current implementation uses Granite Guardian as judge. Future versions could use a custom judge and use the judge_prompt field. 
metrics=[ "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt,assistant_message_field=prediction]", ], diff --git a/prepare/cards/safety/mlcommons_ailuminate.py b/prepare/cards/safety/mlcommons_ailuminate.py index 6e3c8a58f0..d644b0f8f5 100644 --- a/prepare/cards/safety/mlcommons_ailuminate.py +++ b/prepare/cards/safety/mlcommons_ailuminate.py @@ -26,7 +26,9 @@ }, reference_fields={}, prediction_type=str, - metrics=["metrics.llm_as_judge.safety.llamaguard"], + metrics=[ + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]" + ], ), templates=TemplatesDict( { diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json new file mode 100644 index 0000000000..0840e22733 --- /dev/null +++ b/src/unitxt/catalog/benchmarks/safety.json @@ -0,0 +1,21 @@ +{ + "__type__": "benchmark", + "subsets": { + "attaq": { + "__type__": "dataset_recipe", + "card": "cards.attaq" + }, + "provoq": { + "__type__": "dataset_recipe", + "card": "cards.safety.provoq" + }, + "airbench": { + "__type__": "dataset_recipe", + "card": "cards.safety.airbench2024" + }, + "ailuminate": { + "__type__": "dataset_recipe", + "card": "cards.safety.mlcommons_ailuminate" + } + } +} diff --git a/src/unitxt/catalog/cards/attaq.json b/src/unitxt/catalog/cards/attaq.json index 2c58f37b9a..6bc49ad878 100644 --- a/src/unitxt/catalog/cards/attaq.json +++ b/src/unitxt/catalog/cards/attaq.json @@ -18,12 +18,13 @@ ], "task": { "__type__": "task", - "input_fields": [ - "input" - ], - "reference_fields": [ - "label" - ], + "input_fields": { + "input": "str" + }, + "reference_fields": { + "label": "str" + }, + "prediction_type": "str", "metrics": [ "metrics.safety_metric" ] diff --git a/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json b/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json index 89542cec61..33aa3262ff 100644 --- a/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json +++ b/src/unitxt/catalog/cards/safety/mlcommons_ailuminate.json @@ -22,7 +22,7 @@ "reference_fields": {}, "prediction_type": "str", "metrics": [ - "metrics.llm_as_judge.safety.llamaguard" + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=prompt_text,assistant_message_field=prediction]" ] }, "templates": { diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py index 6f13e10a33..a351999a75 100644 --- a/src/unitxt/processors.py +++ b/src/unitxt/processors.py @@ -326,7 +326,7 @@ def process_value(self, text: Any) -> Any: try: return float(match.group(1)) * 0.25 - 0.25 except: - return np.NaN + return np.nan class ExtractMtBenchLabelJudgment(FieldOperator): From 0caaba770e3efc194ba8a20f920d4495afde00e7 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Mon, 15 Sep 2025 16:24:53 +0300 Subject: [PATCH 2/5] GraniteGuardian-based metric for attaq Signed-off-by: Jonathan Bnayahu --- prepare/benchmarks/safety.py | 25 +++++++++-- prepare/cards/safety/attaq_gg.py | 27 ++++++++++++ src/unitxt/catalog/benchmarks/safety.json | 19 +++++++-- src/unitxt/catalog/cards/safety/attaq_gg.json | 42 +++++++++++++++++++ 4 files changed, 105 insertions(+), 8 deletions(-) create mode 100644 prepare/cards/safety/attaq_gg.py create mode 100644 src/unitxt/catalog/cards/safety/attaq_gg.json diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py index fbb70d35df..0abd288a0c 100644 --- a/prepare/benchmarks/safety.py +++ b/prepare/benchmarks/safety.py @@ -4,10 
+4,27 @@ benchmark = Benchmark( subsets={ - "attaq": DatasetRecipe(card="cards.attaq"), - "provoq": DatasetRecipe(card="cards.safety.provoq"), - "airbench": DatasetRecipe(card="cards.safety.airbench2024"), - "ailuminate": DatasetRecipe(card="cards.safety.mlcommons_ailuminate"), + "attaq": DatasetRecipe( + card="cards.safety.attaq_gg", + template_card_index="default", + max_test_instances=500, + ), + "provoq": DatasetRecipe( + card="cards.safety.provoq", + template_card_index="default", + group_by=["group"], + max_test_instances=500, + ), + "airbench": DatasetRecipe( + card="cards.safety.airbench2024", + template_card_index="default", + max_test_instances=500, + ), + "ailuminate": DatasetRecipe( + card="cards.safety.mlcommons_ailuminate", + template_card_index="default", + max_test_instances=500, + ), } ) diff --git a/prepare/cards/safety/attaq_gg.py b/prepare/cards/safety/attaq_gg.py new file mode 100644 index 0000000000..fb38f21fb2 --- /dev/null +++ b/prepare/cards/safety/attaq_gg.py @@ -0,0 +1,27 @@ +from unitxt import add_to_catalog +from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict +from unitxt.operators import Shuffle +from unitxt.splitters import RenameSplits +from unitxt.test_utils.card import test_card + +card = TaskCard( + loader=LoadHF(path="ibm/AttaQ"), + preprocess_steps=[ + RenameSplits(mapper={"train": "test"}), + Shuffle(page_size=2800), + ], + task=Task( + input_fields={"input": str}, + reference_fields={"label": str}, + prediction_type=str, + metrics=[ + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]", + ], + ), + templates=TemplatesDict( + {"default": InputOutputTemplate(input_format="{input}", output_format="")} + ), +) + +test_card(card, strict=False, demos_taken_from="test", num_demos=0) +add_to_catalog(card, "cards.safety.attaq_gg", overwrite=True) diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json index 0840e22733..cf76e52cc1 100644 --- a/src/unitxt/catalog/benchmarks/safety.json +++ b/src/unitxt/catalog/benchmarks/safety.json @@ -3,19 +3,30 @@ "subsets": { "attaq": { "__type__": "dataset_recipe", - "card": "cards.attaq" + "card": "cards.safety.attaq_gg", + "template_card_index": "default", + "max_test_instances": 500 }, "provoq": { "__type__": "dataset_recipe", - "card": "cards.safety.provoq" + "card": "cards.safety.provoq", + "template_card_index": "default", + "group_by": [ + "group" + ], + "max_test_instances": 500 }, "airbench": { "__type__": "dataset_recipe", - "card": "cards.safety.airbench2024" + "card": "cards.safety.airbench2024", + "template_card_index": "default", + "max_test_instances": 500 }, "ailuminate": { "__type__": "dataset_recipe", - "card": "cards.safety.mlcommons_ailuminate" + "card": "cards.safety.mlcommons_ailuminate", + "template_card_index": "default", + "max_test_instances": 500 } } } diff --git a/src/unitxt/catalog/cards/safety/attaq_gg.json b/src/unitxt/catalog/cards/safety/attaq_gg.json new file mode 100644 index 0000000000..5e71370f1d --- /dev/null +++ b/src/unitxt/catalog/cards/safety/attaq_gg.json @@ -0,0 +1,42 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "ibm/AttaQ" + }, + "preprocess_steps": [ + { + "__type__": "rename_splits", + "mapper": { + "train": "test" + } + }, + { + "__type__": "shuffle", + "page_size": 2800 + } + ], + "task": { + "__type__": "task", + "input_fields": { + "input": "str" + }, + "reference_fields": { + "label": 
"str" + }, + "prediction_type": "str", + "metrics": [ + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=input,assistant_message_field=prediction]" + ] + }, + "templates": { + "__type__": "templates_dict", + "items": { + "default": { + "__type__": "input_output_template", + "input_format": "{input}", + "output_format": "" + } + } + } +} From 043fee8be885398d5c09290b9537552e75c636ed Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Mon, 15 Sep 2025 19:30:54 +0300 Subject: [PATCH 3/5] Added grouping Signed-off-by: Jonathan Bnayahu --- prepare/benchmarks/safety.py | 2 ++ src/unitxt/catalog/benchmarks/safety.json | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py index 0abd288a0c..0a88778831 100644 --- a/prepare/benchmarks/safety.py +++ b/prepare/benchmarks/safety.py @@ -18,11 +18,13 @@ "airbench": DatasetRecipe( card="cards.safety.airbench2024", template_card_index="default", + group_by=["l2-name"], max_test_instances=500, ), "ailuminate": DatasetRecipe( card="cards.safety.mlcommons_ailuminate", template_card_index="default", + group_by=["hazard"], max_test_instances=500, ), } diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json index cf76e52cc1..509e8e06fe 100644 --- a/src/unitxt/catalog/benchmarks/safety.json +++ b/src/unitxt/catalog/benchmarks/safety.json @@ -20,12 +20,18 @@ "__type__": "dataset_recipe", "card": "cards.safety.airbench2024", "template_card_index": "default", + "group_by": [ + "l2-name" + ], "max_test_instances": 500 }, "ailuminate": { "__type__": "dataset_recipe", "card": "cards.safety.mlcommons_ailuminate", "template_card_index": "default", + "group_by": [ + "hazard" + ], "max_test_instances": 500 } } From 9c84dfe324a422a0a581b5491c92709967be5a51 Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Tue, 16 Sep 2025 08:11:12 +0300 Subject: [PATCH 4/5] . 
Signed-off-by: Jonathan Bnayahu --- prepare/benchmarks/safety.py | 11 +++++++---- src/unitxt/catalog/benchmarks/safety.json | 3 +++ 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py index 0a88778831..a2a8241705 100644 --- a/prepare/benchmarks/safety.py +++ b/prepare/benchmarks/safety.py @@ -2,30 +2,33 @@ from unitxt.catalog import add_to_catalog from unitxt.standard import DatasetRecipe +MAX_TEST_INSTANCES = 500 + benchmark = Benchmark( subsets={ "attaq": DatasetRecipe( card="cards.safety.attaq_gg", template_card_index="default", - max_test_instances=500, + group_by=["label"], + max_test_instances=MAX_TEST_INSTANCES, ), "provoq": DatasetRecipe( card="cards.safety.provoq", template_card_index="default", group_by=["group"], - max_test_instances=500, + max_test_instances=MAX_TEST_INSTANCES, ), "airbench": DatasetRecipe( card="cards.safety.airbench2024", template_card_index="default", group_by=["l2-name"], - max_test_instances=500, + max_test_instances=MAX_TEST_INSTANCES, ), "ailuminate": DatasetRecipe( card="cards.safety.mlcommons_ailuminate", template_card_index="default", group_by=["hazard"], - max_test_instances=500, + max_test_instances=MAX_TEST_INSTANCES, ), } ) diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json index 509e8e06fe..df52e56a5a 100644 --- a/src/unitxt/catalog/benchmarks/safety.json +++ b/src/unitxt/catalog/benchmarks/safety.json @@ -5,6 +5,9 @@ "__type__": "dataset_recipe", "card": "cards.safety.attaq_gg", "template_card_index": "default", + "group_by": [ + "label" + ], "max_test_instances": 500 }, "provoq": { From 8f9806a8ea1bd6ff1577bc4cd262e037aa3608dd Mon Sep 17 00:00:00 2001 From: Jonathan Bnayahu Date: Thu, 18 Sep 2025 09:17:52 +0300 Subject: [PATCH 5/5] Switch provoq to GG metric. 
Signed-off-by: Jonathan Bnayahu --- prepare/benchmarks/safety.py | 4 +- prepare/cards/safety/provoq_gg.py | 31 +++++++++++++++ src/unitxt/catalog/benchmarks/safety.json | 10 ++--- .../catalog/cards/safety/provoq_gg.json | 39 +++++++++++++++++++ 4 files changed, 77 insertions(+), 7 deletions(-) create mode 100644 prepare/cards/safety/provoq_gg.py create mode 100644 src/unitxt/catalog/cards/safety/provoq_gg.json diff --git a/prepare/benchmarks/safety.py b/prepare/benchmarks/safety.py index a2a8241705..4ac87b1ba1 100644 --- a/prepare/benchmarks/safety.py +++ b/prepare/benchmarks/safety.py @@ -2,7 +2,7 @@ from unitxt.catalog import add_to_catalog from unitxt.standard import DatasetRecipe -MAX_TEST_INSTANCES = 500 +MAX_TEST_INSTANCES = 1000 benchmark = Benchmark( subsets={ @@ -13,7 +13,7 @@ max_test_instances=MAX_TEST_INSTANCES, ), "provoq": DatasetRecipe( - card="cards.safety.provoq", + card="cards.safety.provoq_gg", template_card_index="default", group_by=["group"], max_test_instances=MAX_TEST_INSTANCES, diff --git a/prepare/cards/safety/provoq_gg.py b/prepare/cards/safety/provoq_gg.py new file mode 100644 index 0000000000..54cd4d1adb --- /dev/null +++ b/prepare/cards/safety/provoq_gg.py @@ -0,0 +1,31 @@ +from unitxt import add_to_catalog +from unitxt.blocks import InputOutputTemplate, LoadHF, Task, TaskCard, TemplatesDict +from unitxt.test_utils.card import test_card + +card = TaskCard( + loader=LoadHF(path="IBM/ProvoQ", data_classification_policy=["public"]), + task=Task( + input_fields={"question": str, "stigma": str, "group": str}, + reference_fields={}, + prediction_type=str, + metrics=[ + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]", + ], + ), + templates=TemplatesDict( + {"default": InputOutputTemplate(input_format="{question}", output_format="")} + ), + __description__="The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. 
The dataset contains questions in text format, organized by minority-stigma pairs.", + __tags__={ + "languages": ["english"], + }, +) + +test_card( + card, + strict=False, + demos_taken_from="test", + num_demos=0, +) + +add_to_catalog(card, "cards.safety.provoq_gg", overwrite=True) diff --git a/src/unitxt/catalog/benchmarks/safety.json b/src/unitxt/catalog/benchmarks/safety.json index df52e56a5a..9b5cafff2b 100644 --- a/src/unitxt/catalog/benchmarks/safety.json +++ b/src/unitxt/catalog/benchmarks/safety.json @@ -8,16 +8,16 @@ "group_by": [ "label" ], - "max_test_instances": 500 + "max_test_instances": 1000 }, "provoq": { "__type__": "dataset_recipe", - "card": "cards.safety.provoq", + "card": "cards.safety.provoq_gg", "template_card_index": "default", "group_by": [ "group" ], - "max_test_instances": 500 + "max_test_instances": 1000 }, "airbench": { "__type__": "dataset_recipe", @@ -26,7 +26,7 @@ "group_by": [ "l2-name" ], - "max_test_instances": 500 + "max_test_instances": 1000 }, "ailuminate": { "__type__": "dataset_recipe", @@ -35,7 +35,7 @@ "group_by": [ "hazard" ], - "max_test_instances": 500 + "max_test_instances": 1000 } } } diff --git a/src/unitxt/catalog/cards/safety/provoq_gg.json b/src/unitxt/catalog/cards/safety/provoq_gg.json new file mode 100644 index 0000000000..290308b8e4 --- /dev/null +++ b/src/unitxt/catalog/cards/safety/provoq_gg.json @@ -0,0 +1,39 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "IBM/ProvoQ", + "data_classification_policy": [ + "public" + ] + }, + "task": { + "__type__": "task", + "input_fields": { + "question": "str", + "stigma": "str", + "group": "str" + }, + "reference_fields": {}, + "prediction_type": "str", + "metrics": [ + "metrics.granite_guardian.assistant_risk.harm[prediction_type=str,user_message_field=question,assistant_message_field=prediction]" + ] + }, + "templates": { + "__type__": "templates_dict", + "items": { + "default": { + "__type__": "input_output_template", + "input_format": "{question}", + "output_format": "" + } + } + }, + "__description__": "The ProvoQ dataset is designed to evaluate the sensitivity of large language models (LLMs) to stigma-related topics. It contains a comprehensive set of human-curated provocative questions that systematically target minority-stigma pairs in the United States, creating a diverse and nuanced set of questions that reflect these sensitive topics. The dataset aims to support research in understanding and mitigating biases in AI systems, particularly in the context of minority groups. While most questions are toxic, others may seem benign but potentially elicit harmful responses. The dataset contains questions in text format, organized by minority-stigma pairs.", + "__tags__": { + "languages": [ + "english" + ] + } +}
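A minimal usage sketch for the benchmark assembled above, assuming a recent unitxt release whose local catalog contains the "benchmarks.safety" entry from this series, plus credentials for an inference provider. The model id and provider below are placeholders rather than anything mandated by the patches, and the Granite Guardian metrics invoke a judge model of their own, so scoring also needs an inference backend:

    from unitxt import evaluate, load_dataset
    from unitxt.inference import CrossProviderInferenceEngine

    # Load all four subsets (attaq, provoq, airbench, ailuminate) as one test
    # split; the recipes above cap each subset at 1000 instances.
    dataset = load_dataset("benchmarks.safety", split="test")

    # Placeholder engine and model id; any unitxt inference engine works here.
    model = CrossProviderInferenceEngine(
        model="llama-3-3-70b-instruct",
        provider="watsonx",
    )
    predictions = model.infer(dataset)

    # Each subset is scored by its Granite Guardian harm metric, with scores
    # also reported per group_by field (label, group, l2-name, hazard).
    results = evaluate(predictions=predictions, data=dataset)
    print(results.subsets_scores.summary)

The bracketed metric references used throughout the cards, e.g. metrics.granite_guardian.assistant_risk.harm[user_message_field=input,assistant_message_field=prediction], rely on unitxt's artifact override syntax: the key=value pairs in the brackets are assigned onto the fetched metric object, which is how each card points the judge at its own prompt field and at the model prediction.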