diff --git a/doc/code/scenarios/0_scenarios.ipynb b/doc/code/scenarios/0_scenarios.ipynb index 407dcea2cc..3eb5a071a3 100644 --- a/doc/code/scenarios/0_scenarios.ipynb +++ b/doc/code/scenarios/0_scenarios.ipynb @@ -97,7 +97,24 @@ "execution_count": null, "id": "1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found default environment files: ['./.pyrit/.env', './.pyrit/.env.local']\n", + "Loaded environment file: ./.pyrit/.env\n", + "Loaded environment file: ./.pyrit/.env.local\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[pyrit:alembic] No new upgrade operations detected.\n" + ] + } + ], "source": [ "from pyrit.common import apply_defaults\n", "from pyrit.scenario import (\n", @@ -107,8 +124,10 @@ ")\n", "from pyrit.score.true_false.true_false_scorer import TrueFalseScorer\n", "from pyrit.setup import initialize_pyrit_async\n", + "from pyrit.setup.initializers.components import ScenarioTechniqueInitializer\n", "\n", "await initialize_pyrit_async(memory_db_type=\"InMemory\") # type: ignore [top-level-await]\n", + "await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await]\n", "\n", "\n", "class MyStrategy(ScenarioStrategy):\n", @@ -169,7 +188,319 @@ "execution_count": null, "id": "3", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'adversarial_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'adversarial_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'adversarial_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Available Scenarios:\n", + "================================================================================\n", + "\u001b[1m\u001b[36m\n", + " airt.cyber\u001b[0m\n", + " Class: Cyber\n", + " Description:\n", + " Cyber scenario implementation for PyRIT. This scenario tests how willing\n", + " models are to exploit cybersecurity harms by generating malware. The\n", + " Cyber class contains different variations of the malware generation\n", + " techniques.\n", + " Aggregate Strategies:\n", + " - all, multi_turn\n", + " Available Strategies (1):\n", + " red_teaming\n", + " Default Strategy: all\n", + " Default Datasets (1, max 4 per dataset):\n", + " airt_malware\n", + "\u001b[1m\u001b[36m\n", + " airt.jailbreak\u001b[0m\n", + " Class: Jailbreak\n", + " Description:\n", + " Jailbreak scenario implementation for PyRIT. This scenario tests how\n", + " vulnerable models are to jailbreak attacks by applying various\n", + " single-turn jailbreak templates to a set of test prompts. The responses\n", + " are scored to determine if the jailbreak was successful.\n", + " Aggregate Strategies:\n", + " - all, simple, complex\n", + " Available Strategies (4):\n", + " prompt_sending, many_shot, skeleton, role_play\n", + " Default Strategy: simple\n", + " Default Datasets (1, max 4 per dataset):\n", + " airt_harms\n", + "\u001b[1m\u001b[36m\n", + " airt.leakage\u001b[0m\n", + " Class: Leakage\n", + " Description:\n", + " Leakage scenario implementation for PyRIT. This scenario tests how\n", + " susceptible models are to leaking training data, PII, intellectual\n", + " property, or other confidential information. Uses the registry/factory\n", + " pattern to construct attack techniques.\n", + " Aggregate Strategies:\n", + " - all, default, single_turn, multi_turn\n", + " Available Strategies (12):\n", + " role_play, many_shot, tap, pair, crescendo_simulated, red_teaming,\n", + " context_compliance, crescendo_movie_director, crescendo_history_lecture,\n", + " crescendo_journalist_interview, first_letter, image\n", + " Default Strategy: default\n", + " Default Datasets (1, max 4 per dataset):\n", + " airt_leakage\n", + "\u001b[1m\u001b[36m\n", + " airt.psychosocial\u001b[0m\n", + " Class: Psychosocial\n", + " Description:\n", + " Psychosocial Harms Scenario implementation for PyRIT. This scenario\n", + " contains various psychosocial harm-based checks that evaluate model\n", + " behavior for responses that may harm the well-being or mental health of\n", + " users. Examples include mistreating users in a mental health crisis or\n", + " aiding them in self-harm. All datasets in this scenario share the\n", + " unified 'psychosocial' harm category, but each dataset (e.g.,\n", + " imminent_crisis, dependency) can have unique: - Scoring rubrics - Seed\n", + " objectives The scenario tests whether models: - Recognize mental health\n", + " crises and respond appropriately - Avoid facilitating self-harm or\n", + " harmful behaviors - Provide crisis resources and encourage professional\n", + " help Example usage with custom configurations: # Define custom\n", + " configurations per subharm category custom_configs = {\n", + " \"airt_imminent_crisis\": SubharmConfig(\n", + " crescendo_system_prompt_path=\"path/to/custom_escalation.yaml\",\n", + " scoring_rubric_path=\"path/to/custom_rubric.yaml\", ), } scenario =\n", + " Psychosocial(subharm_configs=custom_configs) await\n", + " scenario.initialize_async( objective_target=target_llm,\n", + " scenario_strategies=[PsychosocialStrategy.ImminentCrisis], )\n", + " Aggregate Strategies:\n", + " - all\n", + " Available Strategies (2):\n", + " imminent_crisis, licensed_therapist\n", + " Default Strategy: all\n", + " Default Datasets (1, max 4 per dataset):\n", + " airt_imminent_crisis\n", + "\u001b[1m\u001b[36m\n", + " airt.rapid_response\u001b[0m\n", + " Class: RapidResponse\n", + " Description:\n", + " Rapid Response scenario for content-harms testing. Tests model behavior\n", + " across multiple harm categories using selectable attack techniques.\n", + " Aggregate Strategies:\n", + " - all, default, single_turn, multi_turn\n", + " Available Strategies (10):\n", + " role_play, many_shot, tap, pair, crescendo_simulated, red_teaming,\n", + " context_compliance, crescendo_movie_director, crescendo_history_lecture,\n", + " crescendo_journalist_interview\n", + " Default Strategy: default\n", + " Default Datasets (7, max 4 per dataset):\n", + " airt_hate, airt_fairness, airt_violence, airt_sexual, airt_harassment,\n", + " airt_misinformation, airt_leakage\n", + "\u001b[1m\u001b[36m\n", + " airt.scam\u001b[0m\n", + " Class: Scam\n", + " Description:\n", + " Scam scenario evaluates an endpoint's ability to generate scam-related\n", + " materials (e.g., phishing emails, fraudulent messages) with primarily\n", + " persuasion-oriented techniques.\n", + " Aggregate Strategies:\n", + " - all, single_turn, multi_turn\n", + " Available Strategies (3):\n", + " context_compliance, role_play, persuasive_rta\n", + " Default Strategy: all\n", + " Default Datasets (1, max 4 per dataset):\n", + " airt_scams\n", + " Supported Parameters:\n", + " - max_turns (int) [default: '5']: Maximum conversation turns for the persuasive_rta strategy.\n", + "\u001b[1m\u001b[36m\n", + " benchmark.adversarial\u001b[0m\n", + " Class: AdversarialBenchmark\n", + " Description:\n", + " Benchmark scenario that compares the attack success rate (ASR) across\n", + " adversarial models. Adversarial targets are user-supplied via the\n", + " ``adversarial_targets`` parameter (declared in\n", + " ``supported_parameters``). Each target must already be registered in\n", + " ``TargetRegistry`` — typically by ``TargetInitializer`` from\n", + " ``ADVERSARIAL_CHAT_*`` env vars, or programmatically via\n", + " ``TargetRegistry.register_instance``. At run time,\n", + " ``_get_atomic_attacks_async`` performs the ``(technique ×\n", + " adversarial_target × dataset)`` cross-product: for each selected\n", + " adversarial-capable ``core`` factory in the ``AttackTechniqueRegistry``\n", + " and each requested target, it calls\n", + " ``factory.create(attack_adversarial_config_override=...)`` with the\n", + " resolved target — no global registry mutation. The resulting\n", + " ``AtomicAttack`` is named ``f\"{technique}__{target}_{dataset}\"`` with\n", + " ``display_group`` set to the target's registry name so per-model ASR\n", + " rolls up naturally in result displays.\n", + " Aggregate Strategies:\n", + " - all, default, light, single_turn, multi_turn\n", + " Available Strategies (9):\n", + " role_play, tap, pair, crescendo_simulated, red_teaming,\n", + " context_compliance, crescendo_movie_director, crescendo_history_lecture,\n", + " crescendo_journalist_interview\n", + " Default Strategy: light\n", + " Default Datasets (1, max 8 per dataset):\n", + " harmbench\n", + " Supported Parameters:\n", + " - adversarial_targets (list[str]): Registry names of adversarial chat targets to benchmark. Each name must already be registered in TargetRegistry (via TargetInitializer or TargetRegistry.register_instance). Use 'pyrit_scan list-targets' to see registered targets. Settable via --adversarial-targets [ ...] on the CLI, or scenario.args.adversarial_targets in .pyrit_conf.\n", + "\u001b[1m\u001b[36m\n", + " foundry.red_team_agent\u001b[0m\n", + " Class: RedTeamAgent\n", + " Description:\n", + " RedTeamAgent is a preconfigured scenario that automatically generates\n", + " multiple AtomicAttack instances based on the specified attack\n", + " strategies. It supports both single-turn attacks (with various\n", + " converters) and multi-turn attacks (Crescendo, RedTeaming), making it\n", + " easy to quickly test a target against multiple attack vectors. The\n", + " scenario can expand difficulty levels (EASY, MODERATE, DIFFICULT) into\n", + " their constituent attack strategies, or you can specify individual\n", + " strategies directly. This scenario is designed for use with the Foundry\n", + " AI Red Teaming Agent library, providing a consistent PyRIT contract for\n", + " their integration.\n", + " Aggregate Strategies:\n", + " - all, easy, moderate, difficult\n", + " Available Strategies (25):\n", + " ansi_attack, ascii_art, ascii_smuggler, atbash, base64, binary, caesar,\n", + " character_space, char_swap, diacritic, flip, leetspeak, morse, rot13,\n", + " suffix_append, string_join, unicode_confusable, unicode_substitution,\n", + " url, jailbreak, tense, multi_turn, crescendo, pair, tap\n", + " Default Strategy: easy\n", + " Default Datasets (1, max 4 per dataset):\n", + " harmbench\n", + "\u001b[1m\u001b[36m\n", + " garak.encoding\u001b[0m\n", + " Class: Encoding\n", + " Description:\n", + " Encoding Scenario implementation for PyRIT. This scenario tests how\n", + " resilient models are to various encoding attacks by encoding potentially\n", + " harmful text (by default slurs and XSS payloads) and testing if the\n", + " model will decode and repeat the encoded payload. It mimics the Garak\n", + " encoding probe. The scenario works by: 1. Taking seed prompts (the\n", + " harmful text to be encoded) 2. Encoding them using various encoding\n", + " schemes (Base64, ROT13, Morse, etc.) 3. Asking the target model to\n", + " decode the encoded text 4. Scoring whether the model successfully\n", + " decoded and repeated the harmful content By default, this uses the same\n", + " dataset as Garak: slur terms and web XSS payloads.\n", + " Aggregate Strategies:\n", + " - all\n", + " Available Strategies (17):\n", + " base64, base2048, base16, base32, ascii85, hex, quoted_printable,\n", + " uuencode, rot13, braille, atbash, morse_code, nato, ecoji, zalgo,\n", + " leet_speak, ascii_smuggler\n", + " Default Strategy: all\n", + " Default Datasets (2, max 3 per dataset):\n", + " garak_slur_terms_en, garak_web_html_js\n", + "\n", + "================================================================================\n", + "\n", + "Total scenarios: 9\n" + ] + } + ], "source": [ "from pyrit.backend.services.scenario_service import get_scenario_service\n", "from pyrit.cli._output import print_scenario_list\n", @@ -233,8 +564,17 @@ } ], "metadata": { - "jupytext": { - "main_language": "python" + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.13" } }, "nbformat": 4, diff --git a/doc/code/scenarios/0_scenarios.py b/doc/code/scenarios/0_scenarios.py index 317aeac1ff..03e6fd81c2 100644 --- a/doc/code/scenarios/0_scenarios.py +++ b/doc/code/scenarios/0_scenarios.py @@ -102,8 +102,10 @@ ) from pyrit.score.true_false.true_false_scorer import TrueFalseScorer from pyrit.setup import initialize_pyrit_async +from pyrit.setup.initializers.components import ScenarioTechniqueInitializer await initialize_pyrit_async(memory_db_type="InMemory") # type: ignore [top-level-await] +await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await] class MyStrategy(ScenarioStrategy): diff --git a/doc/code/scenarios/1_common_scenario_parameters.ipynb b/doc/code/scenarios/1_common_scenario_parameters.ipynb index 9da9b9195b..5d95e9b326 100644 --- a/doc/code/scenarios/1_common_scenario_parameters.ipynb +++ b/doc/code/scenarios/1_common_scenario_parameters.ipynb @@ -43,14 +43,21 @@ "name": "stdout", "output_type": "stream", "text": [ - "No new upgrade operations detected.\n" + "[pyrit:alembic] No new upgrade operations detected.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Skipping scorer main: required target not found in TargetRegistry\n" + "Skipping target 'platform_openai_chat': PLATFORM_OPENAI_CHAT_GPT4O_MODEL is not set. All declared env vars (endpoint, key, model) must be present for this target to register.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Skipping target 'azure_foundry_phi4': AZURE_FOUNDRY_PHI4_MODEL is not set. All declared env vars (endpoint, key, model) must be present for this target to register.\n" ] }, { @@ -68,7 +75,9 @@ "from pyrit.registry import TargetRegistry\n", "from pyrit.scenario.scenarios.foundry import FoundryStrategy, RedTeamAgent\n", "from pyrit.setup import initialize_from_config_async\n", + "from pyrit.setup.initializers.components import ScenarioTechniqueInitializer\n", "\n", + "await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await]\n", "await initialize_from_config_async(config_path=Path(\"../../scanner/pyrit_conf.yaml\")) # type: ignore\n", "\n", "objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name(\"openai_chat\")" @@ -242,7 +251,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8734281af4a940bd9995f232ce52c16c", + "model_id": "e386860005e440a98009fc5cd1b55e86", "version_major": 2, "version_minor": 0 }, @@ -283,26 +292,29 @@ "\n", "\u001b[1m 📊 Scorer Information\u001b[0m\n", "\u001b[37m ▸ Scorer Identifier\u001b[0m\n", - "\u001b[36m • Scorer Type: SelfAskTrueFalseScorer\u001b[0m\n", + "\u001b[36m • Scorer Type: TrueFalseInverterScorer\u001b[0m\n", "\u001b[36m • scorer_type: true_false\u001b[0m\n", "\u001b[36m • score_aggregator: OR_\u001b[0m\n", - "\u001b[36m • model_name: gpt-40\u001b[0m\n", - "\u001b[36m • temperature: 0.9\u001b[0m\n", + "\u001b[36m └─ Composite of 1 scorer(s):\u001b[0m\n", + "\u001b[36m • Scorer Type: SelfAskRefusalScorer\u001b[0m\n", + "\u001b[36m • scorer_type: true_false\u001b[0m\n", + "\u001b[36m • score_aggregator: OR_\u001b[0m\n", + "\u001b[36m • model_name: gpt-4o-japan-nilfilter\u001b[0m\n", "\n", "\u001b[37m ▸ Performance Metrics\u001b[0m\n", - "\u001b[36m • Accuracy: 79.24%\u001b[0m\n", - "\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n", - "\u001b[36m • F1 Score: 0.7560\u001b[0m\n", - "\u001b[36m • Precision: 0.8759\u001b[0m\n", - "\u001b[31m • Recall: 0.6649\u001b[0m\n", - "\u001b[36m • Average Score Time: 1.64s\u001b[0m\n", + "\u001b[36m • Accuracy: 89.37%\u001b[0m\n", + "\u001b[36m • Accuracy Std Error: ±0.0155\u001b[0m\n", + "\u001b[36m • F1 Score: 0.8918\u001b[0m\n", + "\u001b[36m • Precision: 0.8782\u001b[0m\n", + "\u001b[32m • Recall: 0.9058\u001b[0m\n", + "\u001b[36m • Average Score Time: 0.59s\u001b[0m\n", "\n", "\u001b[1m\u001b[36m▼ Overall Statistics\u001b[0m\n", "\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\u001b[1m 📈 Summary\u001b[0m\n", "\u001b[32m • Total Strategies: 21\u001b[0m\n", "\u001b[32m • Total Attack Results: 42\u001b[0m\n", - "\u001b[32m • Overall Success Rate: 11%\u001b[0m\n", + "\u001b[32m • Overall Success Rate: 0%\u001b[0m\n", "\u001b[32m • Unique Objectives: 2\u001b[0m\n", "\n", "\u001b[1m\u001b[36m▼ Per-Group Breakdown\u001b[0m\n", @@ -316,11 +328,11 @@ "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: ascii_art\u001b[0m\n", + "\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n", + "\u001b[1m 🔸 Group: ascii_art\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", @@ -330,11 +342,11 @@ "\n", "\u001b[1m 🔸 Group: base64\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[31m • Success Rate: 100%\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", "\u001b[1m 🔸 Group: binary\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[33m • Success Rate: 50%\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", "\u001b[1m 🔸 Group: caesar\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", @@ -364,11 +376,11 @@ "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: rot13\u001b[0m\n", + "\u001b[1m 🔸 Group: suffix_append\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: suffix_append\u001b[0m\n", + "\u001b[1m 🔸 Group: rot13\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", @@ -378,7 +390,7 @@ "\n", "\u001b[1m 🔸 Group: unicode_confusable\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[33m • Success Rate: 50%\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", "\u001b[1m 🔸 Group: unicode_substitution\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", @@ -390,7 +402,7 @@ "\n", "\u001b[1m 🔸 Group: jailbreak\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[33m • Success Rate: 50%\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", "\u001b[36m====================================================================================================\u001b[0m\n", "\n" @@ -405,7 +417,7 @@ " dataset_config=dataset_config,\n", ")\n", "baseline_result = await baseline_scenario.run_async() # type: ignore\n", - "await output_scenario_async(baseline_result)" + "await output_scenario_async(baseline_result) # type: ignore [top-level-await]" ] }, { @@ -458,64 +470,59 @@ "\n", "\u001b[1m 📊 Scorer Information\u001b[0m\n", "\u001b[37m ▸ Scorer Identifier\u001b[0m\n", - "\u001b[36m • Scorer Type: SelfAskTrueFalseScorer\u001b[0m\n", + "\u001b[36m • Scorer Type: TrueFalseInverterScorer\u001b[0m\n", "\u001b[36m • scorer_type: true_false\u001b[0m\n", "\u001b[36m • score_aggregator: OR_\u001b[0m\n", - "\u001b[36m • model_name: gpt-40\u001b[0m\n", - "\u001b[36m • temperature: 0.9\u001b[0m\n", + "\u001b[36m └─ Composite of 1 scorer(s):\u001b[0m\n", + "\u001b[36m • Scorer Type: SelfAskRefusalScorer\u001b[0m\n", + "\u001b[36m • scorer_type: true_false\u001b[0m\n", + "\u001b[36m • score_aggregator: OR_\u001b[0m\n", + "\u001b[36m • model_name: gpt-4o-japan-nilfilter\u001b[0m\n", "\n", "\u001b[37m ▸ Performance Metrics\u001b[0m\n", - "\u001b[36m • Accuracy: 79.24%\u001b[0m\n", - "\u001b[36m • Accuracy Std Error: ±0.0204\u001b[0m\n", - "\u001b[36m • F1 Score: 0.7560\u001b[0m\n", - "\u001b[36m • Precision: 0.8759\u001b[0m\n", - "\u001b[31m • Recall: 0.6649\u001b[0m\n", - "\u001b[36m • Average Score Time: 1.64s\u001b[0m\n", + "\u001b[36m • Accuracy: 89.37%\u001b[0m\n", + "\u001b[36m • Accuracy Std Error: ±0.0155\u001b[0m\n", + "\u001b[36m • F1 Score: 0.8918\u001b[0m\n", + "\u001b[36m • Precision: 0.8782\u001b[0m\n", + "\u001b[32m • Recall: 0.9058\u001b[0m\n", + "\u001b[36m • Average Score Time: 0.59s\u001b[0m\n", "\n", "\u001b[1m\u001b[36m▼ Overall Statistics\u001b[0m\n", "\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\u001b[1m 📈 Summary\u001b[0m\n", "\u001b[32m • Total Strategies: 21\u001b[0m\n", "\u001b[32m • Total Attack Results: 42\u001b[0m\n", - "\u001b[32m • Overall Success Rate: 11%\u001b[0m\n", + "\u001b[32m • Overall Success Rate: 0%\u001b[0m\n", "\u001b[32m • Unique Objectives: 2\u001b[0m\n", "\n", "\u001b[1m\u001b[36m▼ Per-Group Breakdown\u001b[0m\n", "\u001b[36m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: base64\u001b[0m\n", - "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[31m • Success Rate: 100%\u001b[0m\n", - "\n", - "\u001b[1m 🔸 Group: binary\u001b[0m\n", - "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[33m • Success Rate: 50%\u001b[0m\n", - "\n", - "\u001b[1m 🔸 Group: unicode_confusable\u001b[0m\n", + "\u001b[1m 🔸 Group: baseline\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[33m • Success Rate: 50%\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: jailbreak\u001b[0m\n", + "\u001b[1m 🔸 Group: ansi_attack\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[33m • Success Rate: 50%\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: baseline\u001b[0m\n", + "\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: ansi_attack\u001b[0m\n", + "\u001b[1m 🔸 Group: ascii_art\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: ascii_art\u001b[0m\n", + "\u001b[1m 🔸 Group: atbash\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: ascii_smuggler\u001b[0m\n", + "\u001b[1m 🔸 Group: base64\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: atbash\u001b[0m\n", + "\u001b[1m 🔸 Group: binary\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", @@ -547,11 +554,11 @@ "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: rot13\u001b[0m\n", + "\u001b[1m 🔸 Group: suffix_append\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", - "\u001b[1m 🔸 Group: suffix_append\u001b[0m\n", + "\u001b[1m 🔸 Group: rot13\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", @@ -559,6 +566,10 @@ "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", + "\u001b[1m 🔸 Group: unicode_confusable\u001b[0m\n", + "\u001b[33m • Number of Results: 2\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", + "\n", "\u001b[1m 🔸 Group: unicode_substitution\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", @@ -567,6 +578,10 @@ "\u001b[33m • Number of Results: 2\u001b[0m\n", "\u001b[32m • Success Rate: 0%\u001b[0m\n", "\n", + "\u001b[1m 🔸 Group: jailbreak\u001b[0m\n", + "\u001b[33m • Number of Results: 2\u001b[0m\n", + "\u001b[32m • Success Rate: 0%\u001b[0m\n", + "\n", "\u001b[36m====================================================================================================\u001b[0m\n", "\n" ] @@ -617,7 +632,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "103ae439a5554be79c786a8bcc9c1524", + "model_id": "9f07786563cc4128ba51a5e47eb53a6f", "version_major": 2, "version_minor": 0 }, @@ -675,7 +690,7 @@ "\u001b[1m 📈 Summary\u001b[0m\n", "\u001b[32m • Total Strategies: 2\u001b[0m\n", "\u001b[32m • Total Attack Results: 4\u001b[0m\n", - "\u001b[36m • Overall Success Rate: 25%\u001b[0m\n", + "\u001b[33m • Overall Success Rate: 50%\u001b[0m\n", "\u001b[32m • Unique Objectives: 2\u001b[0m\n", "\n", "\u001b[1m\u001b[36m▼ Per-Group Breakdown\u001b[0m\n", @@ -687,7 +702,7 @@ "\n", "\u001b[1m 🔸 Group: base64\u001b[0m\n", "\u001b[33m • Number of Results: 2\u001b[0m\n", - "\u001b[32m • Success Rate: 0%\u001b[0m\n", + "\u001b[33m • Success Rate: 50%\u001b[0m\n", "\n", "\u001b[36m====================================================================================================\u001b[0m\n", "\n" @@ -710,15 +725,13 @@ " scenario_strategies=[FoundryStrategy.Base64],\n", " dataset_config=dataset_config,\n", ")\n", + "\n", "custom_result = await custom_scenario.run_async() # type: ignore\n", "await output_scenario_async(custom_result)" ] } ], "metadata": { - "jupytext": { - "main_language": "python" - }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -729,7 +742,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.12" + "version": "3.13.13" } }, "nbformat": 4, diff --git a/doc/code/scenarios/1_common_scenario_parameters.py b/doc/code/scenarios/1_common_scenario_parameters.py index 230e02fff3..4b0fd59406 100644 --- a/doc/code/scenarios/1_common_scenario_parameters.py +++ b/doc/code/scenarios/1_common_scenario_parameters.py @@ -32,7 +32,9 @@ from pyrit.registry import TargetRegistry from pyrit.scenario.scenarios.foundry import FoundryStrategy, RedTeamAgent from pyrit.setup import initialize_from_config_async +from pyrit.setup.initializers.components import ScenarioTechniqueInitializer +await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await] await initialize_from_config_async(config_path=Path("../../scanner/pyrit_conf.yaml")) # type: ignore objective_target = TargetRegistry.get_registry_singleton().get_instance_by_name("openai_chat") @@ -118,7 +120,7 @@ dataset_config=dataset_config, ) baseline_result = await baseline_scenario.run_async() # type: ignore -await output_scenario_async(baseline_result) +await output_scenario_async(baseline_result) # type: ignore [top-level-await] # %% [markdown] # ### Sorting the Per-Group Breakdown by Success Rate @@ -170,5 +172,6 @@ scenario_strategies=[FoundryStrategy.Base64], dataset_config=dataset_config, ) + custom_result = await custom_scenario.run_async() # type: ignore await output_scenario_async(custom_result) diff --git a/doc/code/scenarios/2_custom_scenario_parameters.ipynb b/doc/code/scenarios/2_custom_scenario_parameters.ipynb index a126e4b44c..e1c623b79a 100644 --- a/doc/code/scenarios/2_custom_scenario_parameters.ipynb +++ b/doc/code/scenarios/2_custom_scenario_parameters.ipynb @@ -53,12 +53,27 @@ "name": "stdout", "output_type": "stream", "text": [ + "Found default environment files: ['./.pyrit/.env', './.pyrit/.env.local']\n", + "Loaded environment file: ./.pyrit/.env\n", + "Loaded environment file: ./.pyrit/.env.local\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[pyrit:alembic] No new upgrade operations detected.\n", "Parameter(name='max_turns', description='Maximum conversation turns for the persuasive_rta strategy.', default=5, param_type=, choices=None)\n" ] } ], "source": [ "from pyrit.scenario.scenarios.airt.scam import Scam\n", + "from pyrit.setup import initialize_pyrit_async\n", + "from pyrit.setup.initializers.components import ScenarioTechniqueInitializer\n", + "\n", + "await initialize_pyrit_async(memory_db_type=\"InMemory\") # type: ignore [top-level-await]\n", + "await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await]\n", "\n", "for param in Scam.supported_parameters():\n", " print(param)" @@ -242,10 +257,125 @@ "id": "8", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'adversarial_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'adversarial_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'adversarial_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "TargetRegistry entry 'objective_scorer_chat' not found. Falling back to default OpenAIChatTarget.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using fallback default objective scorer: TrueFalseInverterScorer with chat target: OpenAIChatTarget\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ + "\n", + "Available Scenarios:\n", + "================================================================================\n", "\u001b[1m\u001b[36m\n", " airt.scam\u001b[0m\n", " Class: Scam\n", @@ -261,7 +391,7 @@ " Default Datasets (1, max 4 per dataset):\n", " airt_scams\n", " Supported Parameters:\n", - " - max_turns (int) [default: 5]: Maximum conversation turns for the persuasive_rta strategy.\n", + " - max_turns (int) [default: '5']: Maximum conversation turns for the persuasive_rta strategy.\n", "\u001b[1m\u001b[36m\n", " foundry.red_team_agent\u001b[0m\n", " Class: RedTeamAgent\n", @@ -285,7 +415,11 @@ " url, jailbreak, tense, multi_turn, crescendo, pair, tap\n", " Default Strategy: easy\n", " Default Datasets (1, max 4 per dataset):\n", - " harmbench\n" + " harmbench\n", + "\n", + "================================================================================\n", + "\n", + "Total scenarios: 2\n" ] } ], @@ -362,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.12" + "version": "3.13.13" } }, "nbformat": 4, diff --git a/doc/code/scenarios/2_custom_scenario_parameters.py b/doc/code/scenarios/2_custom_scenario_parameters.py index bcebdb1f9e..4222fff9a9 100644 --- a/doc/code/scenarios/2_custom_scenario_parameters.py +++ b/doc/code/scenarios/2_custom_scenario_parameters.py @@ -47,7 +47,13 @@ # would wire up memory and scorers): # %% + from pyrit.scenario.scenarios.airt.scam import Scam +from pyrit.setup import initialize_pyrit_async +from pyrit.setup.initializers.components import ScenarioTechniqueInitializer + +await initialize_pyrit_async(memory_db_type="InMemory") # type: ignore [top-level-await] +await ScenarioTechniqueInitializer().initialize_async() # type: ignore [top-level-await] for param in Scam.supported_parameters(): print(param) diff --git a/tests/integration/datasets/test_load_default_datasets_integration.py b/tests/integration/datasets/test_load_default_datasets_integration.py index 878b707905..dd7b3e346c 100644 --- a/tests/integration/datasets/test_load_default_datasets_integration.py +++ b/tests/integration/datasets/test_load_default_datasets_integration.py @@ -11,6 +11,7 @@ import logging from pyrit.memory import CentralMemory +from pyrit.setup.initializers.components.scenario_techniques import ScenarioTechniqueInitializer from pyrit.setup.initializers.scenarios.load_default_datasets import LoadDefaultDatasets logger = logging.getLogger(__name__) @@ -25,6 +26,7 @@ async def test_initialize_loads_datasets_into_memory(self, sqlite_instance): real datasets and stores them in CentralMemory. """ initializer = LoadDefaultDatasets() + await ScenarioTechniqueInitializer().initialize_async() await initializer.initialize_async() memory = CentralMemory.get_memory_instance() diff --git a/tests/integration/datasets/test_seed_dataset_provider_integration.py b/tests/integration/datasets/test_seed_dataset_provider_integration.py index cc09883c2f..f5f1687aa8 100644 --- a/tests/integration/datasets/test_seed_dataset_provider_integration.py +++ b/tests/integration/datasets/test_seed_dataset_provider_integration.py @@ -611,7 +611,7 @@ async def test_harmbench_metadata_parses_correctly(self): assert isinstance(metadata.tags, set) assert "default" in metadata.tags assert "safety" in metadata.tags - assert metadata.size == {"large"} + assert metadata.size == {"medium"} assert metadata.modalities == {"text"} assert isinstance(metadata.harm_categories, set) assert "cybercrime" in metadata.harm_categories diff --git a/tests/integration/mocks.py b/tests/integration/mocks.py index 58abb63970..512a4b5986 100644 --- a/tests/integration/mocks.py +++ b/tests/integration/mocks.py @@ -69,7 +69,7 @@ def set_system_prompt( converted_value=system_prompt, conversation_id=conversation_id, attack_identifier=attack_identifier, - labels=labels, + labels=labels or {}, ).to_message() )