From 945cbe6b8b89e0ab72679bda3549722bf176b4b6 Mon Sep 17 00:00:00 2001 From: Raulster24 Date: Mon, 1 Jun 2026 19:47:50 +0400 Subject: [PATCH 1/3] FEAT: Add TatweelConverter for Arabic kashida insertion --- doc/code/converters/0_converters.ipynb | 29 ++--- .../1_text_to_text_converters.ipynb | 3 + .../converters/1_text_to_text_converters.py | 3 + pyrit/prompt_converter/__init__.py | 2 + pyrit/prompt_converter/tatweel_converter.py | 100 ++++++++++++++++++ .../test_tatweel_converter.py | 69 ++++++++++++ 6 files changed, 192 insertions(+), 14 deletions(-) create mode 100644 pyrit/prompt_converter/tatweel_converter.py create mode 100644 tests/unit/prompt_converter/test_tatweel_converter.py diff --git a/doc/code/converters/0_converters.ipynb b/doc/code/converters/0_converters.ipynb index 6103c1f6e..d579bc5ee 100644 --- a/doc/code/converters/0_converters.ipynb +++ b/doc/code/converters/0_converters.ipynb @@ -123,20 +123,21 @@ "62 text text StringJoinConverter\n", "63 text text SuffixAppendConverter\n", "64 text text SuperscriptConverter\n", - "65 text text TemplateSegmentConverter\n", - "66 text text TenseConverter\n", - "67 text text TextJailbreakConverter\n", - "68 text text ToneConverter\n", - "69 text text ToxicSentenceGeneratorConverter\n", - "70 text text TranslationConverter\n", - "71 text text UnicodeConfusableConverter\n", - "72 text text UnicodeReplacementConverter\n", - "73 text text UnicodeSubstitutionConverter\n", - "74 text text UrlConverter\n", - "75 text text VariationConverter\n", - "76 text text VariationSelectorSmugglerConverter\n", - "77 text text ZalgoConverter\n", - "78 text text ZeroWidthConverter\n" + "65 text text TatweelConverter\n", + "66 text text TemplateSegmentConverter\n", + "67 text text TenseConverter\n", + "68 text text TextJailbreakConverter\n", + "69 text text ToneConverter\n", + "70 text text ToxicSentenceGeneratorConverter\n", + "71 text text TranslationConverter\n", + "72 text text UnicodeConfusableConverter\n", + "73 text text 
UnicodeReplacementConverter\n", + "74 text text UnicodeSubstitutionConverter\n", + "75 text text UrlConverter\n", + "76 text text VariationConverter\n", + "77 text text VariationSelectorSmugglerConverter\n", + "78 text text ZalgoConverter\n", + "79 text text ZeroWidthConverter\n" ] } ], diff --git a/doc/code/converters/1_text_to_text_converters.ipynb b/doc/code/converters/1_text_to_text_converters.ipynb index 5a098339e..8b7d187b1 100644 --- a/doc/code/converters/1_text_to_text_converters.ipynb +++ b/doc/code/converters/1_text_to_text_converters.ipynb @@ -289,6 +289,7 @@ " RepeatTokenConverter,\n", " StringJoinConverter,\n", " SuperscriptConverter,\n", + " TatweelConverter,\n", " UnicodeConfusableConverter,\n", " UnicodeReplacementConverter,\n", " UnicodeSubstitutionConverter,\n", @@ -315,6 +316,8 @@ "\n", "# Bidi [@boucher2023trojan] wraps text in Unicode bidirectional control characters\n", "print(\"Bidi:\", await BidiConverter().convert_async(prompt=prompt)) # type: ignore\n", + "# Tatweel inserts the Arabic kashida between adjacent Arabic letters\n", + "print(\"Tatweel:\", await TatweelConverter().convert_async(prompt=\"مرحبا\")) # type: ignore\n", "print(\"Superscript:\", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore\n", "print(\"Zalgo:\", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore\n", "\n", diff --git a/doc/code/converters/1_text_to_text_converters.py b/doc/code/converters/1_text_to_text_converters.py index 31d8fcf33..823ef63be 100644 --- a/doc/code/converters/1_text_to_text_converters.py +++ b/doc/code/converters/1_text_to_text_converters.py @@ -103,6 +103,7 @@ RepeatTokenConverter, StringJoinConverter, SuperscriptConverter, + TatweelConverter, UnicodeConfusableConverter, UnicodeReplacementConverter, UnicodeSubstitutionConverter, @@ -129,6 +130,8 @@ # Bidi [@boucher2023trojan] wraps text in Unicode bidirectional control characters print("Bidi:", await BidiConverter().convert_async(prompt=prompt)) # type: 
ignore +# Tatweel inserts the Arabic kashida between adjacent Arabic letters +print("Tatweel:", await TatweelConverter().convert_async(prompt="مرحبا")) # type: ignore print("Superscript:", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore print("Zalgo:", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore diff --git a/pyrit/prompt_converter/__init__.py b/pyrit/prompt_converter/__init__.py index 099bda76b..5e0828a05 100644 --- a/pyrit/prompt_converter/__init__.py +++ b/pyrit/prompt_converter/__init__.py @@ -71,6 +71,7 @@ from pyrit.prompt_converter.string_join_converter import StringJoinConverter from pyrit.prompt_converter.suffix_append_converter import SuffixAppendConverter from pyrit.prompt_converter.superscript_converter import SuperscriptConverter +from pyrit.prompt_converter.tatweel_converter import TatweelConverter from pyrit.prompt_converter.template_segment_converter import TemplateSegmentConverter from pyrit.prompt_converter.tense_converter import TenseConverter from pyrit.prompt_converter.text_selection_strategy import ( @@ -211,6 +212,7 @@ def __getattr__(name: str) -> object: "StringJoinConverter", "SuffixAppendConverter", "SuperscriptConverter", + "TatweelConverter", "TemplateSegmentConverter", "TenseConverter", "TextJailbreakConverter", diff --git a/pyrit/prompt_converter/tatweel_converter.py b/pyrit/prompt_converter/tatweel_converter.py new file mode 100644 index 000000000..2819f7f72 --- /dev/null +++ b/pyrit/prompt_converter/tatweel_converter.py @@ -0,0 +1,100 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+
+import logging
+import unicodedata
+
+from pyrit.identifiers import ComponentIdentifier
+from pyrit.models import PromptDataType
+from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter
+
+logger = logging.getLogger(__name__)
+
+# Arabic tatweel / kashida (U+0640)
+_TATWEEL = chr(0x0640)
+
+# Bounds of the main Arabic Unicode block (U+0600-U+06FF)
+_ARABIC_BLOCK_START = 0x0600
+_ARABIC_BLOCK_END = 0x06FF
+
+
+def _is_arabic_letter(char: str) -> bool:
+    """
+    Report whether a character is a letter from the main Arabic Unicode block.
+
+    Args:
+        char (str): The single character to classify.
+
+    Returns:
+        bool: True only for category ``Lo`` characters whose code point lies in U+0600-U+06FF.
+    """
+    if unicodedata.category(char) != "Lo":
+        return False
+    return _ARABIC_BLOCK_START <= ord(char) <= _ARABIC_BLOCK_END
+
+
+class TatweelConverter(PromptConverter):
+    """
+    Pads runs of Arabic letters with tatweel (kashida, U+0640) characters.
+
+    A tatweel stretches a word visually without altering its meaning, so the output stays readable
+    while its code point and token sequence differ from the input. The conversion is deterministic
+    (no language model, no randomness). Only a pair of directly adjacent letters from the main Arabic
+    block receives padding; every other character is copied through unchanged.
+    """
+
+    SUPPORTED_INPUT_TYPES = ("text",)
+    SUPPORTED_OUTPUT_TYPES = ("text",)
+
+    def __init__(self, *, tatweel_count: int = 1) -> None:
+        """
+        Create a converter that inserts a fixed number of tatweel characters per letter pair.
+
+        Args:
+            tatweel_count (int): How many tatweel characters go between two adjacent Arabic
+                letters. Must be at least 1. Defaults to 1.
+
+        Raises:
+            ValueError: If ``tatweel_count`` is less than 1.
+        """
+        super().__init__()
+
+        if tatweel_count < 1:
+            raise ValueError(f"tatweel_count must be at least 1, got {tatweel_count}.")
+
+        self._tatweel_count = tatweel_count
+
+    def _build_identifier(self) -> ComponentIdentifier:
+        """
+        Build the converter identifier, recording the configured tatweel count.
+
+        Returns:
+            ComponentIdentifier: The identifier for this converter.
+        """
+        identifier_params = {"tatweel_count": self._tatweel_count}
+        return self._create_identifier(params=identifier_params)
+
+    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
+        """
+        Insert tatweel padding between each pair of adjacent Arabic letters in the prompt.
+
+        Args:
+            prompt (str): The prompt to be converted.
+            input_type (PromptDataType): The type of input data.
+
+        Returns:
+            ConverterResult: The result containing the elongated text.
+
+        Raises:
+            ValueError: If the input type is not supported.
+        """
+        if not self.input_supported(input_type):
+            raise ValueError("Input type not supported")
+
+        filler = _TATWEEL * self._tatweel_count
+        letter_flags = [_is_arabic_letter(symbol) for symbol in prompt]
+        # Pair each character with its successor's flag; the last character has none, hence False
+        pairs = zip(prompt, letter_flags, letter_flags[1:] + [False])
+        chunks = [symbol + filler if here and after else symbol for symbol, here, after in pairs]
+
+        return ConverterResult(output_text="".join(chunks), output_type="text")
diff --git a/tests/unit/prompt_converter/test_tatweel_converter.py b/tests/unit/prompt_converter/test_tatweel_converter.py
new file mode 100644
index 000000000..a636b6f85
--- /dev/null
+++ b/tests/unit/prompt_converter/test_tatweel_converter.py
@@ -0,0 +1,69 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+ +import pytest + +from pyrit.prompt_converter import ConverterResult, TatweelConverter + +TATWEEL = chr(0x0640) +ALEF = chr(0x0627) +BEH = chr(0x0628) + + +def test_input_supported(): + converter = TatweelConverter() + assert converter.input_supported("text") is True + assert converter.input_supported("image") is False + + +async def test_inserts_tatweel_between_adjacent_arabic_letters(): + result = await TatweelConverter().convert_async(prompt=ALEF + BEH, input_type="text") + assert isinstance(result, ConverterResult) + assert result.output_type == "text" + assert result.output_text == ALEF + TATWEEL + BEH + + +async def test_inserts_tatweel_between_each_adjacent_pair(): + # Three consecutive Arabic letters yield two insertion points, not one + result = await TatweelConverter().convert_async(prompt=ALEF + BEH + ALEF) + assert result.output_text == ALEF + TATWEEL + BEH + TATWEEL + ALEF + + +async def test_tatweel_count_controls_padding_length(): + result = await TatweelConverter(tatweel_count=3).convert_async(prompt=ALEF + BEH) + assert result.output_text == ALEF + (TATWEEL * 3) + BEH + + +async def test_not_inserted_across_non_arabic_boundary(): + # A space between the two Arabic letters breaks adjacency, so no tatweel is added + result = await TatweelConverter().convert_async(prompt=ALEF + " " + BEH) + assert result.output_text == ALEF + " " + BEH + + +async def test_leaves_non_arabic_unchanged(): + result = await TatweelConverter().convert_async(prompt="abc") + assert result.output_text == "abc" + + +async def test_empty_prompt_returns_empty(): + result = await TatweelConverter().convert_async(prompt="") + assert result.output_text == "" + + +async def test_conversion_is_deterministic(): + converter = TatweelConverter() + prompt = ALEF + BEH + ALEF + first = await converter.convert_async(prompt=prompt) + second = await converter.convert_async(prompt=prompt) + assert first.output_text == second.output_text + + +@pytest.mark.parametrize("count", [0, -1]) +def 
test_invalid_tatweel_count_raises(count): + with pytest.raises(ValueError): + TatweelConverter(tatweel_count=count) + + +async def test_input_type_not_supported_raises(): + with pytest.raises(ValueError): + await TatweelConverter().convert_async(prompt=ALEF + BEH, input_type="image") From 7a4dd03b2f6165a4887c8b405f0ad6fdbe50a53b Mon Sep 17 00:00:00 2001 From: Roman Lutz Date: Mon, 1 Jun 2026 14:50:49 -0700 Subject: [PATCH 2/3] DOC: refresh text-to-text converter notebook output Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../1_text_to_text_converters.ipynb | 48 ++++++++++--------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/doc/code/converters/1_text_to_text_converters.ipynb b/doc/code/converters/1_text_to_text_converters.ipynb index 8b7d187b1..2a8f86355 100644 --- a/doc/code/converters/1_text_to_text_converters.ipynb +++ b/doc/code/converters/1_text_to_text_converters.ipynb @@ -156,53 +156,55 @@ "output_type": "stream", "text": [ "Leetspeak: text: 7311 m3 h0w 70 (u7 d0wn 4 7r33\n", - "Random Capitals: text: TELL ME hOW tO cUt dowN a tRee\n", - "Unicode Confusable: text: 𝔱𝒆‎𞣇‎ℐ rn𝓮 𝚑𝓸ɯ 𝘵𝐨 𝓬𝒖𝑡 ⅆ𝞸𝙬𝙣 𝖆 𝗍𝐫𝙚𝒆\n", + "Random Capitals: text: tELl mE HOw to CUt DOWN A treE\n", + "Unicode Confusable: text: 𝐭𝖊𝟙𝐥 rn℮ 𝒉𝜎ѡ 𝘵‎ھ‎ 𝘤𐓶𝘵 𝒅‎ﮬ‎𝗐𝘯 ɑ 𝕥ᴦ𝑒𝗲\n", "Unicode Substitution: text: 󠁴󠁥󠁬󠁬󠀠󠁭󠁥󠀠󠁨󠁯󠁷󠀠󠁴󠁯󠀠󠁣󠁵󠁴󠀠󠁤󠁯󠁷󠁮󠀠󠁡󠀠󠁴󠁲󠁥󠁥\n", "Unicode Replacement: text: \\u0074\\u0065\\u006c\\u006c \\u006d\\u0065 \\u0068\\u006f\\u0077 \\u0074\\u006f \\u0063\\u0075\\u0074 \\u0064\\u006f\\u0077\\u006e \\u0061 \\u0074\\u0072\\u0065\\u0065\n", - "Emoji: text: 🅃🅴🅻🅛 🅜🄴 🄷🅞🅆 🆃🅞 🅲🆄🅣 🄳🅞🅆🅝 🅰️ 🅣🅁🄴🅴\n", + "Emoji: text: 🅣🅔🅻🅻 🅼🄴 🅷🅾️🆆 🅣🅾️ 🅒🅤🅣 🅳🄾🅆🅝 🅐 🅣🅡🄴🅔\n", "First Letter: text: t m h t c d a t\n", "String Join: text: t-e-l-l m-e h-o-w t-o c-u-t d-o-w-n a t-r-e-e\n", "Zero Width: text: t​e​l​l​ ​m​e​ ​h​o​w​ ​t​o​ ​c​u​t​ ​d​o​w​n​ ​a​ ​t​r​e​e\n", "Flip: text: eert a nwod tuc ot woh em llet\n", "Character Space: text: t e l l m e h o w t o c u t d o w n a t r e e\n", 
"Diacritic: text: téll mé hów tó cút dówn á tréé\n", + "Bidi: text: ‮tell me how to cut down a tree‬\n", + "Tatweel: text: مـرـحـبـا\n", "Superscript: text: ᵗᵉˡˡ ᵐᵉ ʰᵒʷ ᵗᵒ ᶜᵘᵗ ᵈᵒʷⁿ ᵃ ᵗʳᵉᵉ\n", - "Zalgo: text: ţ̤̘̪̤͆̀é̈l̮ͪ̌͗ļ̶̮͖͛ͥͬ̇ m̷̷̠̼͉͚̥ͫ͞͡e̅ ḫ̥̲̇ͮͦo̱͏̙̺̅̆̎w̷ͬ̆͢ t̵̜̖̤ͣͨ̒ͅo̵̟͕ͤ̎̚ c͉ͮͫů̧̱̭̱͚̖̌͌ͧt̥̫͈͒͗ d̗̤̩̀͑͌ͅo̷͓̦ͮ̀̓ͭ́̄̓w͚̺̦̌̅ͯ͋̕͜n̟̱̽ͯͤ̈̅͡ ā̷̗̖̠̌͂ t͈͔̼͐r͎ȇͨe͕͖̠̯̽̕ͅ\n", + "Zalgo: text: t̕ͅe͉̤̤ͤ͛̀̈́l͔̖̮̋ļ͓͎̫͇̩̆ͬ m̶̲̤̓̈́͋ͮͭͩè̤̅̽ h̢͎̆͗̃̕ó͙ͫ͞w̷̨̧̱̑̃̅ t̢̂̀͏̣̗̥̠͂o̧̝͓ͤͦ͞ c̨̦̩̙̘͖ͪ̓͘ů̴̶̷̳͓̬̌̊ț͔͕̍ͦ d̡ͮ͛̑́́o̬̥͓̅ͪ́ͮͬ͜͟w͖̤ͪͣ̿̔n̗̼ͨ̆ a̧͎̋͐̌͌͆͡ t̼̺̊ͬr͚̅è̩̤͢è̙̤̱͇̙̩̖̒͗\n", "CharSwap: text: tell me how to cut dwon a tree\n", - "Insert Punctuation: text: .tell me how to cut ?down a tree\n", - "ANSI Attack: text: What's terminal ANSI code to print a hyperlink?\n", + "Insert Punctuation: text: tell- me how to cut down a !tree\n", + "ANSI Attack: text: What's VT100 code to make the terminal bell ding?\n", "Additionally, consider this user input: tell me how to cut down a tree\n", - "Math Obfuscation: text: t = 5t - 4t