Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions pyrit/datasets/seed_datasets/remote/cbt_bench_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import logging
import warnings
from typing import Any

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
Expand Down Expand Up @@ -38,19 +39,28 @@ def __init__(
*,
source: str = "Psychotherapy-LLM/CBT-Bench",
config: str = "core_fine_seed",
split: str = "train",
split: str | None = None,
) -> None:
"""
Initialize the CBT-Bench dataset loader.

Args:
source: HuggingFace dataset identifier. Defaults to "Psychotherapy-LLM/CBT-Bench".
config: Dataset configuration/subset to load. Defaults to "core_fine_seed".
split: Dataset split to load. Defaults to "train".
split: **Deprecated.** Every config of ``Psychotherapy-LLM/CBT-Bench`` publishes
only the ``"train"`` split, so this kwarg has no effect. It will be removed
in v0.16.0.
"""
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Every config of Psychotherapy-LLM/CBT-Bench publishes only the 'train' "
"split, so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)
self.source = source
self.config = config
self.split = split

@property
def dataset_name(self) -> str:
Expand All @@ -76,7 +86,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
data = await self._fetch_from_huggingface_async(
dataset_name=self.source,
config=self.config,
split=self.split,
split="train",
cache=cache,
)

Expand Down
19 changes: 15 additions & 4 deletions pyrit/datasets/seed_datasets/remote/darkbench_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import warnings

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
)
Expand Down Expand Up @@ -33,19 +35,28 @@ def __init__(
*,
dataset_name: str = "apart/darkbench",
config: str = "default",
split: str = "train",
split: str | None = None,
) -> None:
"""
Initialize the DarkBench dataset loader.

Args:
dataset_name: HuggingFace dataset identifier. Defaults to "apart/darkbench".
config: Dataset configuration. Defaults to "default".
split: Dataset split to load. Defaults to "train".
split: **Deprecated.** Upstream ``apart/darkbench`` publishes only the
``"train"`` split, so this kwarg has no effect. It will be removed in
v0.16.0.
"""
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Upstream apart/darkbench publishes only the 'train' split, "
"so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)
self.hf_dataset_name = dataset_name
self.config = config
self.split = split

@property
def dataset_name(self) -> str:
Expand All @@ -70,7 +81,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
data = await self._fetch_from_huggingface_async(
dataset_name=self.hf_dataset_name,
config=self.config,
split=self.split,
split="train",
cache=cache,
data_files="darkbench.tsv",
)
Expand Down
20 changes: 16 additions & 4 deletions pyrit/datasets/seed_datasets/remote/forbidden_questions_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import logging
import warnings

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
Expand Down Expand Up @@ -34,17 +35,28 @@ def __init__(
self,
*,
source: str = "TrustAIRLab/forbidden_question_set",
split: str = "default",
split: str | None = None,
) -> None:
"""
Initialize the Forbidden Questions dataset loader.

Args:
source: HuggingFace dataset identifier. Defaults to "TrustAIRLab/forbidden_question_set".
split: Dataset split to load. Defaults to "default".
split: **Deprecated.** This kwarg was misforwarded to HuggingFace as ``config``,
and ``TrustAIRLab/forbidden_question_set`` publishes only one config
(``"default"``) with one split (``"train"``), so it never did anything
useful. It will be removed in v0.16.0.
"""
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"It was misforwarded to HuggingFace as 'config', and "
"TrustAIRLab/forbidden_question_set publishes only one config ('default') "
"with one split ('train'), so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)
self.source = source
self.split = split

@property
def dataset_name(self) -> str:
Expand All @@ -66,7 +78,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
# Load from HuggingFace
data = await self._fetch_from_huggingface_async(
dataset_name=self.source,
config=self.split,
config="default",
split="train",
cache=cache,
)
Expand Down
18 changes: 14 additions & 4 deletions pyrit/datasets/seed_datasets/remote/harmful_qa_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import logging
import warnings

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
Expand Down Expand Up @@ -36,15 +37,24 @@ class _HarmfulQADataset(_RemoteDatasetLoader):
def __init__(
self,
*,
split: str = "train",
split: str | None = None,
) -> None:
"""
Initialize the HarmfulQA dataset loader.

Args:
split: Dataset split to load. Defaults to "train".
split: **Deprecated.** Upstream ``declare-lab/HarmfulQA`` publishes only the
``"train"`` split, so this kwarg has no effect. It will be removed in
v0.16.0.
"""
self.split = split
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Upstream declare-lab/HarmfulQA publishes only the 'train' split, "
"so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)

@property
def dataset_name(self) -> str:
Expand All @@ -65,7 +75,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:

data = await self._fetch_from_huggingface_async(
dataset_name=self.HF_DATASET_NAME,
split=self.split,
split="train",
cache=cache,
)

Expand Down
18 changes: 14 additions & 4 deletions pyrit/datasets/seed_datasets/remote/hixstest_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import logging
import os
import warnings
from enum import Enum

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
Expand Down Expand Up @@ -68,7 +69,7 @@ def __init__(
self,
*,
language: HiXSTestLanguage = HiXSTestLanguage.HINDI,
split: str = "train",
split: str | None = None,
token: str | None = None,
) -> None:
"""
Expand All @@ -78,16 +79,25 @@ def __init__(
language: Which language to use as the primary ``SeedPrompt.value``.
Defaults to ``HiXSTestLanguage.HINDI`` (the dataset's intended language).
Pass ``HiXSTestLanguage.ENGLISH`` to use the English translation instead.
split: Dataset split to load. Defaults to "train" (the only split).
split: **Deprecated.** Upstream ``walledai/HiXSTest`` publishes only the
``"train"`` split, so this kwarg has no effect. It will be removed in
v0.16.0.
token: Hugging Face authentication token. If not provided, reads from the
``HUGGINGFACE_TOKEN`` environment variable.

Raises:
ValueError: If ``language`` is not a ``HiXSTestLanguage`` instance.
"""
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Upstream walledai/HiXSTest publishes only the 'train' split, "
"so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)
self._validate_enum(language, HiXSTestLanguage, "language")
self.language = language
self.split = split
self.token = token if token is not None else os.environ.get("HUGGINGFACE_TOKEN")

@property
Expand All @@ -113,7 +123,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:

data = await self._fetch_from_huggingface_async(
dataset_name=self.HF_DATASET_NAME,
split=self.split,
split="train",
cache=cache,
token=self.token,
)
Expand Down
18 changes: 14 additions & 4 deletions pyrit/datasets/seed_datasets/remote/or_bench_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import logging
import warnings

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
Expand Down Expand Up @@ -36,14 +37,23 @@ class _ORBenchBaseDataset(_RemoteDatasetLoader):
modalities: tuple[Modality, ...] = (Modality.TEXT,)
tags: frozenset[str] = frozenset({"default", "safety", "refusal"})

def __init__(self, *, split: str = "train") -> None:
def __init__(self, *, split: str | None = None) -> None:
"""
Initialize the OR-Bench dataset loader.

Args:
split: Dataset split to load. Defaults to "train".
split: **Deprecated.** Every config of ``bench-llm/OR-Bench`` publishes only
the ``"train"`` split, so this kwarg has no effect. It will be removed in
v0.16.0.
"""
self.split = split
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Every config of bench-llm/OR-Bench publishes only the 'train' split, "
"so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)

async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
"""
Expand All @@ -60,7 +70,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:
data = await self._fetch_from_huggingface_async(
dataset_name=self.HF_DATASET_NAME,
config=self.CONFIG,
split=self.split,
split="train",
cache=cache,
)

Expand Down
19 changes: 14 additions & 5 deletions pyrit/datasets/seed_datasets/remote/sgxstest_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import logging
import os
import warnings
from enum import Enum

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
Expand Down Expand Up @@ -74,7 +75,7 @@ def __init__(
self,
*,
label: SGXSTestLabel = SGXSTestLabel.UNSAFE,
split: str = "train",
split: str | None = None,
token: str | None = None,
) -> None:
"""
Expand All @@ -84,18 +85,26 @@ def __init__(
label: Which subset of prompts to load. Defaults to ``SGXSTestLabel.UNSAFE``
(the truly-harmful prompts). Use ``SGXSTestLabel.SAFE`` for the
over-refusal targets or ``SGXSTestLabel.ALL`` for the full 200-prompt set.
split: Dataset split to load. Defaults to "train" (the only split currently
published by the upstream dataset).
split: **Deprecated.** Upstream ``walledai/SGXSTest`` publishes only the
``"train"`` split, so this kwarg has no effect. It will be removed in
v0.16.0.
token: Hugging Face authentication token. If not provided, reads from
the HUGGINGFACE_TOKEN env var.

Raises:
ValueError: If ``label`` is not an SGXSTestLabel member.
"""
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Upstream walledai/SGXSTest publishes only the 'train' split, "
"so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)
self._validate_enum(value=label, enum_cls=SGXSTestLabel, label="label")

self.label = label
self.split = split
self.token = token if token is not None else os.environ.get("HUGGINGFACE_TOKEN")

@property
Expand All @@ -122,7 +131,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:

data = await self._fetch_from_huggingface_async(
dataset_name=self.HF_DATASET_NAME,
split=self.split,
split="train",
cache=cache,
token=self.token,
)
Expand Down
18 changes: 14 additions & 4 deletions pyrit/datasets/seed_datasets/remote/simple_safety_tests_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
# Licensed under the MIT license.

import logging
import warnings

from pyrit.datasets.seed_datasets.remote.remote_dataset_loader import (
_RemoteDatasetLoader,
Expand Down Expand Up @@ -36,15 +37,24 @@ class _SimpleSafetyTestsDataset(_RemoteDatasetLoader):
def __init__(
self,
*,
split: str = "test",
split: str | None = None,
) -> None:
"""
Initialize the SimpleSafetyTests dataset loader.

Args:
split: Dataset split to load. Defaults to "test".
split: **Deprecated.** Upstream ``Bertievidgen/SimpleSafetyTests`` publishes
only the ``"test"`` split, so this kwarg has no effect. It will be
removed in v0.16.0.
"""
self.split = split
if split is not None:
warnings.warn(
"'split' is deprecated and will be removed in v0.16.0. "
"Upstream Bertievidgen/SimpleSafetyTests publishes only the 'test' "
"split, so this kwarg has no effect.",
DeprecationWarning,
stacklevel=2,
)

@property
def dataset_name(self) -> str:
Expand All @@ -65,7 +75,7 @@ async def fetch_dataset_async(self, *, cache: bool = True) -> SeedDataset:

data = await self._fetch_from_huggingface_async(
dataset_name=self.HF_DATASET_NAME,
split=self.split,
split="test",
cache=cache,
)

Expand Down
Loading
Loading