microsoft · romanlutz · Jul 7, 2025 · Jun 25, 2025 · Jun 27, 2025 · Jun 27, 2025
diff --git a/pyrit/datasets/__init__.py b/pyrit/datasets/__init__.py
@@ -25,6 +25,7 @@
 from pyrit.datasets.xstest_dataset import fetch_xstest_dataset
 from pyrit.datasets.equitymedqa_dataset import fetch_equitymedqa_dataset_unique_values
 from pyrit.datasets.text_jailbreak import TextJailBreak
+from pyrit.datasets.transphobia_awareness_dataset import fetch_transphobia_awareness_dataset
 from pyrit.datasets.ccp_sensitive_prompts_dataset import fetch_ccp_sensitive_prompts_dataset
 
 __all__ = [
@@ -48,6 +49,7 @@
     "fetch_seclists_bias_testing_dataset",
     "fetch_sosbench_dataset",
     "fetch_tdc23_redteaming_dataset",
+    "fetch_transphobia_awareness_dataset",
     "fetch_wmdp_dataset",
     "fetch_xstest_dataset",
     "TextJailBreak",

diff --git a/pyrit/datasets/transphobia_awareness_dataset.py b/pyrit/datasets/transphobia_awareness_dataset.py
@@ -0,0 +1,93 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from typing import List
+
+import pandas as pd
+
+from pyrit.models import SeedPromptDataset
+from pyrit.models.seed_prompt import SeedPrompt
+
+# Zenodo (record 15482694) download URLs for the dataset workbooks; only RATINGS_URL is read below.
+RATINGS_URL = "https://zenodo.org/records/15482694/files/Ratings.xlsx?download=1"
+QUESTION_RESPONSE_PAIRS_URL = "https://zenodo.org/records/15482694/files/Question_Response_Pairs.xlsx?download=1"
+QUORA_QUESTION_LIST_URL = "https://zenodo.org/records/15482694/files/Quora%20Question%20List.xlsx?download=1"
+
+
+def fetch_transphobia_awareness_dataset() -> SeedPromptDataset:
+    """
+    Fetch the Transphobia-Awareness ratings workbook and return it as a SeedPromptDataset.
+
+    Every row of Ratings.xlsx becomes one text SeedPrompt whose value is the Quora question;
+    the normalized keyword and the human question-sentiment rating are attached as metadata.
+
+    Transphobia dataset is loaded from the source cited below.
+    Scheuerman, M., Weathington, K., Petterson, A., Doyle, D. T., Das, D., DeVito, M. A., & Brubaker, J.
+    (2025). Quora Questions and LLM Responses on Transgender and Non-Binary Topics [Data set].
+    Zenodo. https://doi.org/10.5281/zenodo.15482694
+
+    This dataset is detailed in the paper cited below.
+    Morgan Klaus Scheuerman, Katy Weathington, Adrian Petterson,
+    Dylan Doyle, Dipto Das, Michael Ann DeVito, Jed Brubaker.
+    "Transphobia is in the Eye of the Prompter:
+    Trans-Centered Perspectives on Large Language Models."
+    ACM Transactions on Computer-Human Interaction (2025).
+
+    Returns:
+        SeedPromptDataset: A dataset of prompts with transphobia awareness metadata.
+
+    Notes:
+        - Only Ratings.xlsx is downloaded; the other two workbooks are not read here.
+        - Dataset-level harm categories are "transphobia" plus the unique normalized keywords.
+        - "question_sentiment" is None when the rating column cannot be found in the sheet.
+    """
+    # header=1: the column labels sit on the sheet's second row (pandas counts rows from 0).
+    ratings_df = pd.read_excel(RATINGS_URL, header=1)
+
+    # Map keywords for consistency
+    ratings_df["keyword"] = ratings_df["keyword"].replace(
+        {"Trans": "transgender", "Transgender": "transgender", "nonbinary": "non-binary"}
+    )
+    harm_categories = ["transphobia"] + ratings_df["keyword"].unique().tolist()
+
+    # Find the rating column with whitespace collapsed: an exact comparison against a multi-line
+    # header breaks as soon as the newlines/indentation in the literal differ from the sheet.
+    sentiment_label = (
+        "Question sentiment: How transphobic is the question? "
+        "Rate on a scale of 1-6, where 6 is the most transphobic"
+    )
+    sentiment_column = next(
+        (col for col in ratings_df.columns if " ".join(str(col).split()) == sentiment_label), None
+    )
+
+    seed_prompts: List[SeedPrompt] = []
+    for _, row in ratings_df.iterrows():
+        sentiment = None if sentiment_column is None else row[sentiment_column]
+        prompt = SeedPrompt(
+            value=row["Quora Question"],
+            data_type="text",
+            dataset_name="Transphobia-Awareness",
+            harm_categories=["transphobia"],
+            description="Quora-style question for transphobia awareness and inclusivity evaluation.",
+            metadata={"keyword": row["keyword"], "question_sentiment": sentiment},
+            source=RATINGS_URL,
+            authors=[
+                "Morgan Klaus Scheuerman",
+                "Katy Weathington",
+                "Adrian Petterson",
+                "Dylan Thomas Doyle",
+                "Dipto Das",
+                "Michael Ann DeVito",
+                "Jed R. Brubaker",
+            ],
+        )
+        seed_prompts.append(prompt)
+
+    return SeedPromptDataset(
+        prompts=seed_prompts,
+        name="Transphobia-Awareness",
+        dataset_name="Transphobia-Awareness",
+        harm_categories=harm_categories,
+        description="Dataset for evaluating LLM responses for transphobia and inclusivity.",
+        source=RATINGS_URL,
+    )
diff --git a/tests/integration/datasets/test_fetch_datasets.py b/tests/integration/datasets/test_fetch_datasets.py
@@ -23,6 +23,7 @@
     fetch_seclists_bias_testing_dataset,
     fetch_sosbench_dataset,
     fetch_tdc23_redteaming_dataset,
+    fetch_transphobia_awareness_dataset,
     fetch_wmdp_dataset,
     fetch_xstest_dataset,
 )
@@ -51,6 +52,7 @@
         (fetch_seclists_bias_testing_dataset, True),
         (fetch_sosbench_dataset, True),
         (fetch_tdc23_redteaming_dataset, True),
+        (fetch_transphobia_awareness_dataset, True),
         (fetch_wmdp_dataset, False),
         (fetch_xstest_dataset, True),
     ],