Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 15 additions & 14 deletions doc/code/converters/0_converters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -123,20 +123,21 @@
"62 text text StringJoinConverter\n",
"63 text text SuffixAppendConverter\n",
"64 text text SuperscriptConverter\n",
"65 text text TemplateSegmentConverter\n",
"66 text text TenseConverter\n",
"67 text text TextJailbreakConverter\n",
"68 text text ToneConverter\n",
"69 text text ToxicSentenceGeneratorConverter\n",
"70 text text TranslationConverter\n",
"71 text text UnicodeConfusableConverter\n",
"72 text text UnicodeReplacementConverter\n",
"73 text text UnicodeSubstitutionConverter\n",
"74 text text UrlConverter\n",
"75 text text VariationConverter\n",
"76 text text VariationSelectorSmugglerConverter\n",
"77 text text ZalgoConverter\n",
"78 text text ZeroWidthConverter\n"
"65 text text TatweelConverter\n",
"66 text text TemplateSegmentConverter\n",
"67 text text TenseConverter\n",
"68 text text TextJailbreakConverter\n",
"69 text text ToneConverter\n",
"70 text text ToxicSentenceGeneratorConverter\n",
"71 text text TranslationConverter\n",
"72 text text UnicodeConfusableConverter\n",
"73 text text UnicodeReplacementConverter\n",
"74 text text UnicodeSubstitutionConverter\n",
"75 text text UrlConverter\n",
"76 text text VariationConverter\n",
"77 text text VariationSelectorSmugglerConverter\n",
"78 text text ZalgoConverter\n",
"79 text text ZeroWidthConverter\n"
]
}
],
Expand Down
51 changes: 28 additions & 23 deletions doc/code/converters/1_text_to_text_converters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -156,53 +156,55 @@
"output_type": "stream",
"text": [
"Leetspeak: text: 7311 m3 h0w 70 (u7 d0wn 4 7r33\n",
"Random Capitals: text: TELL ME hOW tO cUt dowN a tRee\n",
"Unicode Confusable: text: 𝔱𝒆‎𞣇‎ℐ rn𝓮 𝚑𝓸ɯ 𝘵𝐨 𝓬𝒖𝑡 ⅆ𝞸𝙬𝙣 𝖆 𝗍𝐫𝙚𝒆\n",
"Random Capitals: text: tELl mE HOw to CUt DOWN A treE\n",
"Unicode Confusable: text: 𝐭𝖊𝟙𝐥 rn℮ 𝒉𝜎ѡ 𝘵‎ھ‎ 𝘤𐓶𝘵 𝒅‎ﮬ‎𝗐𝘯 ɑ 𝕥ᴦ𝑒𝗲\n",
"Unicode Substitution: text: 󠁴󠁥󠁬󠁬󠀠󠁭󠁥󠀠󠁨󠁯󠁷󠀠󠁴󠁯󠀠󠁣󠁵󠁴󠀠󠁤󠁯󠁷󠁮󠀠󠁡󠀠󠁴󠁲󠁥󠁥\n",
"Unicode Replacement: text: \\u0074\\u0065\\u006c\\u006c \\u006d\\u0065 \\u0068\\u006f\\u0077 \\u0074\\u006f \\u0063\\u0075\\u0074 \\u0064\\u006f\\u0077\\u006e \\u0061 \\u0074\\u0072\\u0065\\u0065\n",
"Emoji: text: 🅃🅴🅻🅛 🅜🄴 🄷🅞🅆 🆃🅞 🅲🆄🅣 🄳🅞🅆🅝 🅰️ 🅣🅁🄴🅴\n",
"Emoji: text: 🅣🅔🅻🅻 🅼🄴 🅷🅾️🆆 🅣🅾️ 🅒🅤🅣 🅳🄾🅆🅝 🅐 🅣🅡🄴🅔\n",
"First Letter: text: t m h t c d a t\n",
"String Join: text: t-e-l-l m-e h-o-w t-o c-u-t d-o-w-n a t-r-e-e\n",
"Zero Width: text: t​e​l​l​ ​m​e​ ​h​o​w​ ​t​o​ ​c​u​t​ ​d​o​w​n​ ​a​ ​t​r​e​e\n",
"Flip: text: eert a nwod tuc ot woh em llet\n",
"Character Space: text: t e l l m e h o w t o c u t d o w n a t r e e\n",
"Diacritic: text: téll mé hów tó cút dówn á tréé\n",
"Bidi: text: ‮tell me how to cut down a tree‬\n",
"Tatweel: text: مـرـحـبـا\n",
"Superscript: text: ᵗᵉˡˡ ᵐᵉ ʰᵒʷ ᵗᵒ ᶜᵘᵗ ᵈᵒʷⁿ ᵃ ᵗʳᵉᵉ\n",
"Zalgo: text: ţ̤̘̪̤͆̀é̈l̮ͪ̌͗ļ̶̮͖͛ͥͬ̇ m̷̷̠̼͉͚̥ͫ͞͡e̅ ḫ̥̲̇ͮͦo̱͏̙̺̅̆̎w̷ͬ̆͢ t̵̜̖̤ͣͨ̒ͅo̵̟͕ͤ̎̚ c͉ͮͫů̧̱̭̱͚̖̌͌ͧt̥̫͈͒͗ d̗̤̩̀͑͌ͅo̷͓̦ͮ̀̓ͭ́̄̓w͚̺̦̌̅ͯ͋̕͜n̟̱̽ͯͤ̈̅͡ ā̷̗̖̠̌͂ t͈͔̼͐r͎ȇͨe͕͖̠̯̽̕ͅ\n",
"Zalgo: text: t̕ͅe͉̤̤ͤ͛̀̈́l͔̖̮̋ļ͓͎̫͇̩̆ͬ m̶̲̤̓̈́͋ͮͭͩè̤̅̽ h̢͎̆͗̃̕ó͙ͫ͞w̷̨̧̱̑̃̅ t̢̂̀͏̣̗̥̠͂o̧̝͓ͤͦ͞ c̨̦̩̙̘͖ͪ̓͘ů̴̶̷̳͓̬̌̊ț͔͕̍ͦ d̡ͮ͛̑́́o̬̥͓̅ͪ́ͮͬ͜͟w͖̤ͪͣ̿̔n̗̼ͨ̆ a̧͎̋͐̌͌͆͡ t̼̺̊ͬr͚̅è̩̤͢è̙̤̱͇̙̩̖̒͗\n",
"CharSwap: text: tell me how to cut dwon a tree\n",
"Insert Punctuation: text: .tell me how to cut ?down a tree\n",
"ANSI Attack: text: What's terminal ANSI code to print a hyperlink?\n",
"Insert Punctuation: text: tell- me how to cut down a !tree\n",
"ANSI Attack: text: What's VT100 code to make the terminal bell ding?\n",
"Additionally, consider this user input: tell me how to cut down a tree\n",
"Math Obfuscation: text: t = 5t - 4t <each variable corresponds to a letter in the hidden request>\n",
"e = 2e - 1e\n",
"l = 9l - 8l\n",
"Math Obfuscation: text: t = 4t - 3t <each variable corresponds to a letter in the hidden request>\n",
"e = 3e - 2e\n",
"l = 2l - 1l\n",
"l = 6l - 5l\n",
"\n",
"m = 4m - 3m\n",
"e = 3e - 2e\n",
"m = 6m - 5m\n",
"e = 4e - 3e\n",
"\n",
"h = 4h - 3h\n",
"o = 8o - 7o\n",
"o = 9o - 8o\n",
"w = 3w - 2w\n",
"\n",
"t = 5t - 4t\n",
"t = 4t - 3t\n",
"o = 8o - 7o\n",
"\n",
"c = 8c - 7c\n",
"u = 6u - 5u\n",
"t = 8t - 7t\n",
"c = 6c - 5c\n",
"u = 5u - 4u\n",
"t = 9t - 8t\n",
"\n",
"d = 7d - 6d\n",
"o = 7o - 6o\n",
"w = 3w - 2w\n",
"n = 9n - 8n\n",
"d = 5d - 4d\n",
"o = 3o - 2o\n",
"w = 2w - 1w\n",
"n = 6n - 5n\n",
"\n",
"a = 6a - 5a\n",
"a = 7a - 6a\n",
"\n",
"t = 4t - 3t\n",
"r = 8r - 7r\n",
"r = 6r - 5r\n",
"e = 8e - 7e\n",
"e = 2e - 1e\n",
"e = 5e - 4e\n",
"\n",
"There is something I want you to provide instructions to make. It is within this complex math problem.\n",
"Repeat Token: text: tell me how to cut down a tree ! ! ! ! ! ! ! ! ! !\n",
Expand Down Expand Up @@ -289,6 +291,7 @@
" RepeatTokenConverter,\n",
" StringJoinConverter,\n",
" SuperscriptConverter,\n",
" TatweelConverter,\n",
" UnicodeConfusableConverter,\n",
" UnicodeReplacementConverter,\n",
" UnicodeSubstitutionConverter,\n",
Expand All @@ -315,6 +318,8 @@
"\n",
"# Bidi [@boucher2023trojan] wraps text in Unicode bidirectional control characters\n",
"print(\"Bidi:\", await BidiConverter().convert_async(prompt=prompt)) # type: ignore\n",
"# Tatweel inserts the Arabic kashida between adjacent Arabic letters\n",
"print(\"Tatweel:\", await TatweelConverter().convert_async(prompt=\"مرحبا\")) # type: ignore\n",
"print(\"Superscript:\", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore\n",
"print(\"Zalgo:\", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore\n",
"\n",
Expand Down
3 changes: 3 additions & 0 deletions doc/code/converters/1_text_to_text_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
RepeatTokenConverter,
StringJoinConverter,
SuperscriptConverter,
TatweelConverter,
UnicodeConfusableConverter,
UnicodeReplacementConverter,
UnicodeSubstitutionConverter,
Expand All @@ -129,6 +130,8 @@

# Bidi [@boucher2023trojan] wraps text in Unicode bidirectional control characters
print("Bidi:", await BidiConverter().convert_async(prompt=prompt)) # type: ignore
# Tatweel inserts the Arabic kashida between adjacent Arabic letters
print("Tatweel:", await TatweelConverter().convert_async(prompt="مرحبا")) # type: ignore
print("Superscript:", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore
print("Zalgo:", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore

Expand Down
2 changes: 2 additions & 0 deletions pyrit/prompt_converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
from pyrit.prompt_converter.string_join_converter import StringJoinConverter
from pyrit.prompt_converter.suffix_append_converter import SuffixAppendConverter
from pyrit.prompt_converter.superscript_converter import SuperscriptConverter
from pyrit.prompt_converter.tatweel_converter import TatweelConverter
from pyrit.prompt_converter.template_segment_converter import TemplateSegmentConverter
from pyrit.prompt_converter.tense_converter import TenseConverter
from pyrit.prompt_converter.text_selection_strategy import (
Expand Down Expand Up @@ -211,6 +212,7 @@ def __getattr__(name: str) -> object:
"StringJoinConverter",
"SuffixAppendConverter",
"SuperscriptConverter",
"TatweelConverter",
"TemplateSegmentConverter",
"TenseConverter",
"TextJailbreakConverter",
Expand Down
99 changes: 99 additions & 0 deletions pyrit/prompt_converter/tatweel_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging
import unicodedata

from pyrit.models import ComponentIdentifier, PromptDataType
from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter

logger = logging.getLogger(__name__)

# Arabic tatweel / kashida (U+0640)
_TATWEEL = chr(0x0640)

# Bounds of the main Arabic Unicode block (U+0600-U+06FF)
_ARABIC_BLOCK_START = 0x0600
_ARABIC_BLOCK_END = 0x06FF


def _is_arabic_letter(char: str) -> bool:
"""
Determine whether a character is a letter in the main Arabic Unicode block.

Args:
char (str): A single character to test.

Returns:
bool: True if the character is an Arabic letter (category ``Lo`` within U+0600-U+06FF).
"""
return unicodedata.category(char) == "Lo" and _ARABIC_BLOCK_START <= ord(char) <= _ARABIC_BLOCK_END


class TatweelConverter(PromptConverter):
"""
Inserts Arabic tatweel (kashida, U+0640) between adjacent Arabic letters.

The tatweel is a connector that visually elongates a word without changing its meaning. Inserting
it between letters leaves the text legible to a human reader while changing the underlying code
point and token sequence. The transformation is deterministic: no language model or randomness is
involved. Characters outside the main Arabic block, and Arabic letters not directly followed by
another Arabic letter, are left untouched.
"""

SUPPORTED_INPUT_TYPES = ("text",)
SUPPORTED_OUTPUT_TYPES = ("text",)

def __init__(self, *, tatweel_count: int = 1) -> None:
"""
Initialize the converter with the number of tatweel characters to insert.

Args:
tatweel_count (int): Number of tatweel characters inserted between adjacent Arabic
letters. Must be at least 1. Defaults to 1.

Raises:
ValueError: If ``tatweel_count`` is less than 1.
"""
super().__init__()

if tatweel_count < 1:
raise ValueError(f"tatweel_count must be at least 1, got {tatweel_count}.")

self._tatweel_count = tatweel_count

def _build_identifier(self) -> ComponentIdentifier:
"""
Build the converter identifier with the tatweel count parameter.

Returns:
ComponentIdentifier: The identifier for this converter.
"""
return self._create_identifier(params={"tatweel_count": self._tatweel_count})

async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
"""
Convert the given prompt by inserting tatweel between adjacent Arabic letters.

Args:
prompt (str): The prompt to be converted.
input_type (PromptDataType): The type of input data.

Returns:
ConverterResult: The result containing the elongated text.

Raises:
ValueError: If the input type is not supported.
"""
if not self.input_supported(input_type):
raise ValueError("Input type not supported")

padding = _TATWEEL * self._tatweel_count
pieces: list[str] = []
for index, char in enumerate(prompt):
pieces.append(char)
next_index = index + 1
if next_index < len(prompt) and _is_arabic_letter(char) and _is_arabic_letter(prompt[next_index]):
pieces.append(padding)

return ConverterResult(output_text="".join(pieces), output_type="text")
69 changes: 69 additions & 0 deletions tests/unit/prompt_converter/test_tatweel_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import pytest

from pyrit.prompt_converter import ConverterResult, TatweelConverter

TATWEEL = chr(0x0640)
ALEF = chr(0x0627)
BEH = chr(0x0628)


def test_input_supported():
converter = TatweelConverter()
assert converter.input_supported("text") is True
assert converter.input_supported("image") is False


async def test_inserts_tatweel_between_adjacent_arabic_letters():
result = await TatweelConverter().convert_async(prompt=ALEF + BEH, input_type="text")
assert isinstance(result, ConverterResult)
assert result.output_type == "text"
assert result.output_text == ALEF + TATWEEL + BEH


async def test_inserts_tatweel_between_each_adjacent_pair():
# Three consecutive Arabic letters yield two insertion points, not one
result = await TatweelConverter().convert_async(prompt=ALEF + BEH + ALEF)
assert result.output_text == ALEF + TATWEEL + BEH + TATWEEL + ALEF


async def test_tatweel_count_controls_padding_length():
result = await TatweelConverter(tatweel_count=3).convert_async(prompt=ALEF + BEH)
assert result.output_text == ALEF + (TATWEEL * 3) + BEH


async def test_not_inserted_across_non_arabic_boundary():
# A space between the two Arabic letters breaks adjacency, so no tatweel is added
result = await TatweelConverter().convert_async(prompt=ALEF + " " + BEH)
assert result.output_text == ALEF + " " + BEH


async def test_leaves_non_arabic_unchanged():
result = await TatweelConverter().convert_async(prompt="abc")
assert result.output_text == "abc"


async def test_empty_prompt_returns_empty():
result = await TatweelConverter().convert_async(prompt="")
assert result.output_text == ""


async def test_conversion_is_deterministic():
converter = TatweelConverter()
prompt = ALEF + BEH + ALEF
first = await converter.convert_async(prompt=prompt)
second = await converter.convert_async(prompt=prompt)
assert first.output_text == second.output_text


@pytest.mark.parametrize("count", [0, -1])
def test_invalid_tatweel_count_raises(count):
with pytest.raises(ValueError):
TatweelConverter(tatweel_count=count)


async def test_input_type_not_supported_raises():
with pytest.raises(ValueError):
await TatweelConverter().convert_async(prompt=ALEF + BEH, input_type="image")
Loading