microsoft · romanlutz · Mar 17, 2025 · Mar 13, 2025 · Mar 13, 2025 · Mar 13, 2025
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -64,7 +64,7 @@ repos:
       language: python
       files: ^doc.*\.(py|md)$
       additional_dependencies: ['requests']
-      exclude: (release_process.md|git.md|^doc/deployment/|tests|pyrit/prompt_converter/morse_converter.py|.github|pyrit/prompt_converter/emoji_converter.py|pyrit/score/markdown_injection.py|pyrit/datasets/fetch_example_datasets.py|^pyrit/auxiliary_attacks/gcg/)
+      exclude: (release_process.md|git.md|^doc/deployment/|tests|pyrit/prompt_converter/morse_converter.py|.github|pyrit/prompt_converter/emoji_converter.py|pyrit/score/markdown_injection.py|^pyrit/datasets/|^pyrit/auxiliary_attacks/gcg/)
 
   - repo: https://github.com/pycqa/pylint
     rev: v3.3.3

diff --git a/doc/api.rst b/doc/api.rst
@@ -99,19 +99,21 @@ API Reference
     :nosignatures:
     :toctree: _autosummary/
 
+    fetch_adv_bench_dataset
+    fetch_aya_redteaming_dataset
+    fetch_babelscape_alert_dataset
     fetch_decoding_trust_stereotypes_dataset
     fetch_examples
+    fetch_forbidden_questions_dataset
     fetch_harmbench_dataset
+    fetch_librAI_do_not_answer_dataset
+    fetch_llm_latent_adversarial_training_harmful_dataset
     fetch_many_shot_jailbreaking_dataset
-    fetch_seclists_bias_testing_dataset
-    fetch_xstest_dataset
     fetch_pku_safe_rlhf_dataset
-    fetch_adv_bench_dataset
-    fetch_wmdp_dataset
-    fetch_forbidden_questions_dataset
-    fetch_llm_latent_adversarial_training_harmful_dataset
+    fetch_seclists_bias_testing_dataset
     fetch_tdc23_redteaming_dataset
-    fetch_aya_redteaming_dataset
+    fetch_wmdp_dataset
+    fetch_xstest_dataset
 
 :py:mod:`pyrit.embedding`
 =========================

diff --git a/doc/code/datasets/0_dataset.md b/doc/code/datasets/0_dataset.md
@@ -8,7 +8,7 @@ By using `SeedPrompts` through loading from a YAML file or loading them via syst
 
 **Loading Datasets**:
 
-We also show examples of common methods to fetch datasets into PyRIT from different sources. Most datasets will be loaded as a `SeedPromptDataset`. Outside of these examples, the fetch functions which are currently available can be found in `fetch_example_datasets.py` and are organized by similar method type. There is a wide range of datasets which are included and can be used as example to also load in other datasets. As these datasets are the first component of building an attack in PyRIT, the following notebooks also continue to demonstrate how these prompts can be used in the process.
+We also show examples of common methods to fetch datasets into PyRIT from different sources. Most datasets will be loaded as a `SeedPromptDataset`. Outside of these examples, the fetch functions which are currently available can be found in the `pyrit.datasets` module. A wide range of datasets is included, and these can also serve as examples for loading other datasets. As these datasets are the first component of building an attack in PyRIT, the following notebooks also continue to demonstrate how these prompts can be used in the process.
 
 **Datasets Loading Process: Seed Prompt De-duplication**:
 PyRIT checks for existence of duplicate seed prompts using hashes to make sure it is not uploading duplicate seed prompts in the memory. The feature follows following decision-tree:

diff --git a/doc/code/datasets/2_fetch_dataset.ipynb b/doc/code/datasets/2_fetch_dataset.ipynb
@@ -7,7 +7,7 @@
     "# Fetching dataset examples\n",
     "\n",
     "This notebook demonstrates how to load datasets as a `SeedPromptDataset` to perform red teaming on a target.\n",
-    "There are several datasets which can be found in the `fetch_example_datasets.py` file.\n",
+    "There are several datasets which can be found in the `pyrit.datasets` module.\n",
     "Three example datasets are shown in this notebook and can be used with orchestrators such as the Prompt Sending Orchestrator.\n",
     "The example below demonstrates loading a HuggingFace dataset as a `SeedPromptDataset`."
    ]

diff --git a/doc/code/datasets/2_fetch_dataset.py b/doc/code/datasets/2_fetch_dataset.py
@@ -16,7 +16,7 @@
 # # Fetching dataset examples
 #
 # This notebook demonstrates how to load datasets as a `SeedPromptDataset` to perform red teaming on a target.
-# There are several datasets which can be found in the `fetch_example_datasets.py` file.
+# There are several datasets which can be found in the `pyrit.datasets` module.
 # Three example datasets are shown in this notebook and can be used with orchestrators such as the Prompt Sending Orchestrator.
 # The example below demonstrates loading a HuggingFace dataset as a `SeedPromptDataset`.
 

diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py
@@ -1,38 +1,38 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
-from .fetch_example_datasets import (
-    fetch_adv_bench_dataset,
-    fetch_aya_redteaming_dataset,
-    fetch_babelscape_alert_dataset,
-    fetch_decoding_trust_stereotypes_dataset,
-    fetch_examples,
-    fetch_forbidden_questions_dataset,
-    fetch_harmbench_dataset,
-    fetch_librAI_do_not_answer_dataset,
+from pyrit.datasets.adv_bench_dataset import fetch_adv_bench_dataset
+from pyrit.datasets.aya_redteaming_dataset import fetch_aya_redteaming_dataset
+from pyrit.datasets.babelscape_alert_dataset import fetch_babelscape_alert_dataset
+from pyrit.datasets.decoding_trust_stereotypes_dataset import fetch_decoding_trust_stereotypes_dataset
+from pyrit.datasets.dataset_helper import fetch_examples
+from pyrit.datasets.forbidden_questions_dataset import fetch_forbidden_questions_dataset
+from pyrit.datasets.harmbench_dataset import fetch_harmbench_dataset
+from pyrit.datasets.librAI_do_not_answer_dataset import fetch_librAI_do_not_answer_dataset
+from pyrit.datasets.llm_latent_adversarial_training_harmful_dataset import (
     fetch_llm_latent_adversarial_training_harmful_dataset,
-    fetch_many_shot_jailbreaking_dataset,
-    fetch_pku_safe_rlhf_dataset,
-    fetch_seclists_bias_testing_dataset,
-    fetch_tdc23_redteaming_dataset,
-    fetch_wmdp_dataset,
-    fetch_xstest_dataset,
 )
+from pyrit.datasets.many_shot_jailbreaking_dataset import fetch_many_shot_jailbreaking_dataset
+from pyrit.datasets.pku_safe_rlhf_dataset import fetch_pku_safe_rlhf_dataset
+from pyrit.datasets.seclists_bias_testing_dataset import fetch_seclists_bias_testing_dataset
+from pyrit.datasets.tdc23_redteaming_dataset import fetch_tdc23_redteaming_dataset
+from pyrit.datasets.wmdp_dataset import fetch_wmdp_dataset
+from pyrit.datasets.xstest_dataset import fetch_xstest_dataset
 
 __all__ = [
+    "fetch_adv_bench_dataset",
     "fetch_aya_redteaming_dataset",
+    "fetch_babelscape_alert_dataset",
     "fetch_decoding_trust_stereotypes_dataset",
     "fetch_examples",
+    "fetch_forbidden_questions_dataset",
     "fetch_harmbench_dataset",
+    "fetch_librAI_do_not_answer_dataset",
+    "fetch_llm_latent_adversarial_training_harmful_dataset",
     "fetch_many_shot_jailbreaking_dataset",
-    "fetch_seclists_bias_testing_dataset",
-    "fetch_xstest_dataset",
     "fetch_pku_safe_rlhf_dataset",
-    "fetch_adv_bench_dataset",
-    "fetch_wmdp_dataset",
-    "fetch_forbidden_questions_dataset",
-    "fetch_llm_latent_adversarial_training_harmful_dataset",
+    "fetch_seclists_bias_testing_dataset",
     "fetch_tdc23_redteaming_dataset",
-    "fetch_librAI_do_not_answer_dataset",
-    "fetch_babelscape_alert_dataset",
+    "fetch_wmdp_dataset",
+    "fetch_xstest_dataset",
 ]
diff --git a/pyrit/datasets/adv_bench_dataset.py b/pyrit/datasets/adv_bench_dataset.py
@@ -0,0 +1,101 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from pathlib import Path
+from typing import List, Literal, Optional
+
+from pyrit.common.path import DATASETS_PATH
+from pyrit.datasets.dataset_helper import fetch_examples
+from pyrit.models import SeedPromptDataset
+from pyrit.models.seed_prompt import SeedPrompt
+
+
+def fetch_adv_bench_dataset(
+    cache: bool = True,
+    main_categories: Optional[
+        List[
+            Literal[
+                "Autonomy",
+                "Physical",
+                "Psychological",
+                "Reputational",
+                "Financial and Business",
+                "Human Rights and Civil Liberties",
+                "Societal and Cultural",
+                "Political and Economic",
+                "Environmental",
+            ]
+        ]
+    ] = None,
+    sub_categories: Optional[List[str]] = None,
+) -> SeedPromptDataset:
+    """
+    Load the category-annotated AdvBench examples as a SeedPromptDataset.
+
+    The examples are read from the local ``adv_bench_dataset.json`` file under ``DATASETS_PATH``, an extension
+    of the original AdvBench dataset in which every prompt carries harm types. The labels follow the
+    Collaborative, Human-Centered Taxonomy of AI, Algorithmic, and Automation Harms
+    (https://arxiv.org/abs/2407.01294v2); each entry has at least one main category and one subcategory.
+
+    Without filters (both arguments None or empty) every entry is returned. Otherwise an entry is kept when
+    it matches ANY requested main category OR ANY requested subcategory.
+
+    Useful link: https://arxiv.org/html/2407.01294v2/x2.png (Overview of the Harms Taxonomy)
+
+    Args:
+        cache (bool): Whether to cache the fetched examples. Defaults to True.
+
+        main_categories (Optional[List[str]]): Main harm categories to keep. Defaults to None.
+            For descriptions of each category, see the paper: arXiv:2407.01294v2
+
+        sub_categories (Optional[List[str]]): Harm subcategories to keep. Defaults to None.
+            For the complete list of all subcategories, see the paper: arXiv:2407.01294v2.
+
+    Returns:
+        SeedPromptDataset: One text SeedPrompt per kept entry; its harm categories are the entry's
+        main categories followed by its subcategories.
+
+    Note:
+        For more information and access to the original dataset and related materials, visit:
+        https://github.com/llm-attacks/llm-attacks/blob/main/data/advbench. Based on research in paper:
+        https://arxiv.org/abs/2307.15043 written by Andy Zou, Zifan Wang, Nicholas Carlini, Milad Nasr,
+        J. Zico Kolter, Matt Fredrikson.
+
+        The categorization was proposed by @paulinek13 and performed with Anthropic's Claude 3.7 Sonnet model.
+    """
+    json_path = Path(DATASETS_PATH) / "data" / "adv_bench_dataset.json"
+    raw = fetch_examples(source=str(json_path), source_type="file", cache=cache)
+    entries = raw["data"]  # type: ignore
+
+    wanted_main = set(main_categories or [])
+    wanted_sub = set(sub_categories or [])
+
+    # Filtering only happens when at least one filter is non-empty; a match on either filter keeps the entry.
+    if wanted_main or wanted_sub:
+        entries = [
+            entry
+            for entry in entries
+            if (wanted_main and not wanted_main.isdisjoint(entry["main_categories"]))
+            or (wanted_sub and not wanted_sub.isdisjoint(entry["sub_categories"]))
+        ]
+
+    description = """AdvBench is a set of 520 harmful behaviors formulated as instructions. This dataset
+            has been extended to include harm categories for better filtering and analysis. The adversary's goal
+            is instead to find a single attack string that will cause the model to generate any response that
+            attempts to comply with the instruction, and to do so over as many harmful behaviors as possible."""
+
+    prompts = []
+    for entry in entries:
+        prompts.append(
+            SeedPrompt(
+                value=entry["prompt"],
+                data_type="text",
+                name="AdvBench Dataset [Extended]",
+                dataset_name="AdvBench Dataset",
+                harm_categories=entry["main_categories"] + entry["sub_categories"],
+                description=description,
+                source="https://github.com/llm-attacks/llm-attacks/tree/main/data/advbench",
+            )
+        )
+
+    return SeedPromptDataset(prompts=prompts)
diff --git a/pyrit/datasets/aya_redteaming_dataset.py b/pyrit/datasets/aya_redteaming_dataset.py
@@ -0,0 +1,95 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from pathlib import Path
+from typing import List, Literal, Optional
+
+from pyrit.datasets.dataset_helper import fetch_examples
+from pyrit.models import SeedPromptDataset
+from pyrit.models.seed_prompt import SeedPrompt
+
+
+def fetch_aya_redteaming_dataset(
+    cache: bool = True,
+    data_home: Optional[Path] = None,
+    language: Literal["English", "Hindi", "French", "Spanish", "Arabic", "Russian", "Serbian", "Tagalog"] = "English",
+    harm_categories: Optional[
+        List[
+            Literal[
+                "Bullying & Harassment",
+                "Discrimination & Injustice",
+                "Graphic material",
+                "Harms of Representation Allocation and Quality of Service",
+                "Hate Speech",
+                "Non-consensual sexual content",
+                "Profanity",
+                "Self-Harm",
+                "Violence, Threats & Incitement",
+            ]
+        ]
+    ] = None,
+    harm_scope: Optional[Literal["global", "local"]] = None,
+) -> SeedPromptDataset:
+    """
+    Fetch examples from the Aya Red-teaming dataset with optional filtering and create a SeedPromptDataset.
+
+    Args:
+        cache (bool): Whether to cache the fetched examples. Defaults to True.
+        data_home: Directory to store cached data. Defaults to None.
+        language: The language to fetch prompts for. Defaults to "English".
+        harm_categories: List of harm categories to filter the examples; only prompts with at least one
+            matching category are included. Defaults to None, which means all categories are included.
+        harm_scope: Whether to fetch globally or locally harmful prompts. Defaults to None (all examples).
+
+    Returns:
+        SeedPromptDataset: A SeedPromptDataset containing the filtered examples.
+
+    Raises:
+        ValueError, SyntaxError: If an example's ``harm_category`` field is not a valid Python literal.
+
+    Note:
+        Dataset card: https://huggingface.co/datasets/CohereForAI/aya_redteaming/blob/main/README.md \n
+        Related paper: https://arxiv.org/abs/2406.18682 \n
+        The dataset license: Apache 2.0
+
+    Warning:
+        Due to the nature of these prompts, it may be advisable to consult your relevant legal
+        department before testing them with LLMs to ensure compliance and reduce potential risks.
+    """
+    import ast  # function-scope import: only this function needs the literal parser
+
+    _lang = {
+        "English": "eng",
+        "Hindi": "hin",
+        "French": "fra",
+        "Spanish": "spa",
+        "Arabic": "arb",
+        "Russian": "rus",
+        "Serbian": "srp",
+        "Tagalog": "tgl",
+    }
+
+    examples = fetch_examples(
+        source=f"https://huggingface.co/datasets/CohereForAI/aya_redteaming/raw/main/aya_{_lang[language]}.jsonl",
+        source_type="public_url",
+        cache=cache,
+        data_home=data_home,
+    )
+
+    seed_prompts = []
+    for example in examples:
+        categories = ast.literal_eval(example["harm_category"])  # literal parse only; never run downloaded text
+        if harm_categories is None or any(cat in categories for cat in harm_categories):
+            if harm_scope is None or example["global_or_local"] == harm_scope:
+                seed_prompts.append(
+                    SeedPrompt(
+                        value=example["prompt"],
+                        data_type="text",
+                        name="Aya Red-teaming Examples",
+                        dataset_name="Aya Red-teaming Examples",
+                        harm_categories=categories,
+                        source="https://huggingface.co/datasets/CohereForAI/aya_redteaming",
+                    )
+                )
+
+    return SeedPromptDataset(prompts=seed_prompts)
diff --git a/pyrit/datasets/babelscape_alert_dataset.py b/pyrit/datasets/babelscape_alert_dataset.py
@@ -0,0 +1,54 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from typing import Literal
+
+from datasets import load_dataset
+
+from pyrit.models import SeedPromptDataset
+from pyrit.models.seed_prompt import SeedPrompt
+
+
+def fetch_babelscape_alert_dataset(
+    category: Literal["alert", "alert_adversarial"] = "alert_adversarial"
+) -> SeedPromptDataset:
+    """
+    Fetch the Babelscape/ALERT dataset and create a SeedPromptDataset.
+
+    Args:
+        category (str): The dataset category, "alert" or "alert_adversarial". A falsy value loads both.
+
+    Returns:
+        SeedPromptDataset: A SeedPromptDataset with the "test" split prompts of every loaded category.
+
+    Raises:
+        ValueError: If ``category`` is non-empty and is neither "alert" nor "alert_adversarial".
+    """
+    if not category:  # if category is not specified, read both subsets
+        data_categories = ["alert_adversarial", "alert"]
+    elif category not in ["alert_adversarial", "alert"]:
+        raise ValueError(f"Invalid Parameter: {category}. Expected 'alert_adversarial' or 'alert'")
+    else:
+        data_categories = [category]
+
+    # Accumulate prompts across subsets; rebinding the list per subset would keep only the last one.
+    prompts = []
+    for name in data_categories:
+        data = load_dataset("Babelscape/ALERT", name)
+        prompts.extend(item["prompt"] for item in data["test"])
+
+    seed_prompts = [
+        SeedPrompt(
+            value=prompt,
+            data_type="text",
+            name="",
+            dataset_name="Babelscape/ALERT",
+            description="""ALERT by Babelscape is a dataset that consists
+            of two different categories, 'alert' with 15k red teaming prompts,
+            and 'alert_adversarial' with 30k adversarial red teaming prompts.""",
+            source="https://huggingface.co/datasets/Babelscape/ALERT",
+        )
+        for prompt in prompts
+    ]
+
+    return SeedPromptDataset(prompts=seed_prompts)