microsoft · romanlutz · Jun 3, 2026 · Jun 2, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/pyrit/datasets/seed_datasets/remote/cbt_bench_dataset.py b/pyrit/datasets/seed_datasets/remote/cbt_bench_dataset.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 import logging
+import warnings
 from typing import Any
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
@@ -38,19 +39,28 @@ def __init__(
         *,
         source: str = "Psychotherapy-LLM/CBT-Bench",
         config: str = "core_fine_seed",
-        split: str = "train",
+        split: str | None = None,
     ) -> None:
         """
         Initialize the CBT-Bench dataset loader.
 
         Args:
             source: HuggingFace dataset identifier. Defaults to "Psychotherapy-LLM/CBT-Bench".
             config: Dataset configuration/subset to load. Defaults to "core_fine_seed".
-            split: Dataset split to load. Defaults to "train".
+            split: **Deprecated.** Every config of ``Psychotherapy-LLM/CBT-Bench`` publishes
+                only the ``"train"`` split, so this kwarg has no effect. It will be removed
+                in v0.16.0.
         """
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Every config of Psychotherapy-LLM/CBT-Bench publishes only the 'train' "
+                "split, so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         self.source = source
         self.config = config
-        self.split = split
 
     @property
     def dataset_name(self) -> str:
@@ -76,7 +86,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.source,
             config=self.config,
-            split=self.split,
+            split="train",
             cache=cache,
         )
 

diff --git a/pyrit/datasets/seed_datasets/remote/darkbench_dataset.py b/pyrit/datasets/seed_datasets/remote/darkbench_dataset.py
@@ -1,6 +1,8 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.
 
+import warnings
+
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
     _RemoteDatasetLoader,
 )
@@ -33,19 +35,28 @@ def __init__(
         *,
         dataset_name: str = "apart/darkbench",
         config: str = "default",
-        split: str = "train",
+        split: str | None = None,
     ) -> None:
         """
         Initialize the DarkBench dataset loader.
 
         Args:
             dataset_name: HuggingFace dataset identifier. Defaults to "apart/darkbench".
             config: Dataset configuration. Defaults to "default".
-            split: Dataset split to load. Defaults to "train".
+            split: **Deprecated.** Upstream ``apart/darkbench`` publishes only the
+                ``"train"`` split, so this kwarg has no effect. It will be removed in
+                v0.16.0.
         """
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Upstream apart/darkbench publishes only the 'train' split, "
+                "so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         self.hf_dataset_name = dataset_name
         self.config = config
-        self.split = split
 
     @property
     def dataset_name(self) -> str:
@@ -70,7 +81,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.hf_dataset_name,
             config=self.config,
-            split=self.split,
+            split="train",
             cache=cache,
             data_files="darkbench.tsv",
         )

diff --git a/pyrit/datasets/seed_datasets/remote/forbidden_questions_dataset.py b/pyrit/datasets/seed_datasets/remote/forbidden_questions_dataset.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 import logging
+import warnings
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
     _RemoteDatasetLoader,
@@ -34,17 +35,28 @@ def __init__(
         self,
         *,
         source: str = "TrustAIRLab/forbidden_question_set",
-        split: str = "default",
+        split: str | None = None,
     ) -> None:
         """
         Initialize the Forbidden Questions dataset loader.
 
         Args:
             source: HuggingFace dataset identifier. Defaults to "TrustAIRLab/forbidden_question_set".
-            split: Dataset split to load. Defaults to "default".
+            split: **Deprecated.** This kwarg was mistakenly forwarded to HuggingFace as the
+                ``config`` argument. ``TrustAIRLab/forbidden_question_set`` publishes only one
+                config (``"default"``) with one split (``"train"``), so the value is now
+                ignored. It will be removed in v0.16.0.
         """
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "It was misforwarded to HuggingFace as 'config', and "
+                "TrustAIRLab/forbidden_question_set publishes only one config ('default') "
+                "with one split ('train'), so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         self.source = source
-        self.split = split
 
     @property
     def dataset_name(self) -> str:
@@ -66,7 +78,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         # Load from HuggingFace
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.source,
-            config=self.split,
+            config="default",
             split="train",
             cache=cache,
         )

diff --git a/pyrit/datasets/seed_datasets/remote/harmful_qa_dataset.py b/pyrit/datasets/seed_datasets/remote/harmful_qa_dataset.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 import logging
+import warnings
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
     _RemoteDatasetLoader,
@@ -36,15 +37,24 @@ class _HarmfulQADataset(_RemoteDatasetLoader):
     def __init__(
         self,
         *,
-        split: str = "train",
+        split: str | None = None,
     ) -> None:
         """
         Initialize the HarmfulQA dataset loader.
 
         Args:
-            split: Dataset split to load. Defaults to "train".
+            split: **Deprecated.** Upstream ``declare-lab/HarmfulQA`` publishes only the
+                ``"train"`` split, so this kwarg has no effect. It will be removed in
+                v0.16.0.
         """
-        self.split = split
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Upstream declare-lab/HarmfulQA publishes only the 'train' split, "
+                "so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
 
     @property
     def dataset_name(self) -> str:
@@ -65,7 +75,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
 
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.HF_DATASET_NAME,
-            split=self.split,
+            split="train",
             cache=cache,
         )
 

diff --git a/pyrit/datasets/seed_datasets/remote/hixstest_dataset.py b/pyrit/datasets/seed_datasets/remote/hixstest_dataset.py
@@ -3,6 +3,7 @@
 
 import logging
 import os
+import warnings
 from enum import Enum
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
@@ -68,7 +69,7 @@ def __init__(
         self,
         *,
         language: HiXSTestLanguage = HiXSTestLanguage.HINDI,
-        split: str = "train",
+        split: str | None = None,
         token: str | None = None,
     ) -> None:
         """
@@ -78,16 +79,25 @@ def __init__(
             language: Which language to use as the primary ``SeedPrompt.value``.
                 Defaults to ``HiXSTestLanguage.HINDI`` (the dataset's intended language).
                 Pass ``HiXSTestLanguage.ENGLISH`` to use the English translation instead.
-            split: Dataset split to load. Defaults to "train" (the only split).
+            split: **Deprecated.** Upstream ``walledai/HiXSTest`` publishes only the
+                ``"train"`` split, so this kwarg has no effect. It will be removed in
+                v0.16.0.
             token: Hugging Face authentication token. If not provided, reads from the
                 ``HUGGINGFACE_TOKEN`` environment variable.
 
         Raises:
             ValueError: If ``language`` is not a ``HiXSTestLanguage`` instance.
         """
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Upstream walledai/HiXSTest publishes only the 'train' split, "
+                "so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         self._validate_enum(language, HiXSTestLanguage, "language")
         self.language = language
-        self.split = split
         self.token = token if token is not None else os.environ.get("HUGGINGFACE_TOKEN")
 
     @property
@@ -113,7 +123,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
 
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.HF_DATASET_NAME,
-            split=self.split,
+            split="train",
             cache=cache,
             token=self.token,
         )

diff --git a/pyrit/datasets/seed_datasets/remote/or_bench_dataset.py b/pyrit/datasets/seed_datasets/remote/or_bench_dataset.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 import logging
+import warnings
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
     _RemoteDatasetLoader,
@@ -36,14 +37,23 @@ class _ORBenchBaseDataset(_RemoteDatasetLoader):
     modalities: tuple[Modality, ...] = (Modality.TEXT,)
     tags: frozenset[str] = frozenset({"default", "safety", "refusal"})
 
-    def __init__(self, *, split: str = "train") -> None:
+    def __init__(self, *, split: str | None = None) -> None:
         """
         Initialize the OR-Bench dataset loader.
 
         Args:
-            split: Dataset split to load. Defaults to "train".
+            split: **Deprecated.** Every config of ``bench-llm/OR-Bench`` publishes only
+                the ``"train"`` split, so this kwarg has no effect. It will be removed in
+                v0.16.0.
         """
-        self.split = split
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Every config of bench-llm/OR-Bench publishes only the 'train' split, "
+                "so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
 
     async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         """
@@ -60,7 +70,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.HF_DATASET_NAME,
             config=self.CONFIG,
-            split=self.split,
+            split="train",
             cache=cache,
         )
 

diff --git a/pyrit/datasets/seed_datasets/remote/sgxstest_dataset.py b/pyrit/datasets/seed_datasets/remote/sgxstest_dataset.py
@@ -3,6 +3,7 @@
 
 import logging
 import os
+import warnings
 from enum import Enum
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
@@ -74,7 +75,7 @@ def __init__(
         self,
         *,
         label: SGXSTestLabel = SGXSTestLabel.UNSAFE,
-        split: str = "train",
+        split: str | None = None,
         token: str | None = None,
     ) -> None:
         """
@@ -84,18 +85,26 @@ def __init__(
             label: Which subset of prompts to load. Defaults to ``SGXSTestLabel.UNSAFE``
                 (the truly-harmful prompts). Use ``SGXSTestLabel.SAFE`` for the
                 over-refusal targets or ``SGXSTestLabel.ALL`` for the full 200-prompt set.
-            split: Dataset split to load. Defaults to "train" (the only split currently
-                published by the upstream dataset).
+            split: **Deprecated.** Upstream ``walledai/SGXSTest`` publishes only the
+                ``"train"`` split, so this kwarg has no effect. It will be removed in
+                v0.16.0.
             token: Hugging Face authentication token. If not provided, reads from
                 the HUGGINGFACE_TOKEN env var.
 
         Raises:
             ValueError: If ``label`` is not an SGXSTestLabel member.
         """
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Upstream walledai/SGXSTest publishes only the 'train' split, "
+                "so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
         self._validate_enum(value=label, enum_cls=SGXSTestLabel, label="label")
 
         self.label = label
-        self.split = split
         self.token = token if token is not None else os.environ.get("HUGGINGFACE_TOKEN")
 
     @property
@@ -122,7 +131,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
 
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.HF_DATASET_NAME,
-            split=self.split,
+            split="train",
             cache=cache,
             token=self.token,
         )

diff --git a/pyrit/datasets/seed_datasets/remote/simple_safety_tests_dataset.py b/pyrit/datasets/seed_datasets/remote/simple_safety_tests_dataset.py
@@ -2,6 +2,7 @@
 # Licensed under the MIT license.
 
 import logging
+import warnings
 
 from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
     _RemoteDatasetLoader,
@@ -36,15 +37,24 @@ class _SimpleSafetyTestsDataset(_RemoteDatasetLoader):
     def __init__(
         self,
         *,
-        split: str = "test",
+        split: str | None = None,
     ) -> None:
         """
         Initialize the SimpleSafetyTests dataset loader.
 
         Args:
-            split: Dataset split to load. Defaults to "test".
+            split: **Deprecated.** Upstream ``Bertievidgen/SimpleSafetyTests`` publishes
+                only the ``"test"`` split, so this kwarg has no effect. It will be
+                removed in v0.16.0.
         """
-        self.split = split
+        if split is not None:
+            warnings.warn(
+                "'split' is deprecated and will be removed in v0.16.0. "
+                "Upstream Bertievidgen/SimpleSafetyTests publishes only the 'test' "
+                "split, so this kwarg has no effect.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
 
     @property
     def dataset_name(self) -> str:
@@ -65,7 +75,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
 
         data = await self._fetch_from_huggingface_async(
             dataset_name=self.HF_DATASET_NAME,
-            split=self.split,
+            split="test",
             cache=cache,
         )