From 457c19126f644c41ed2a229156e74d320a661bde Mon Sep 17 00:00:00 2001 From: Raulster24 Date: Wed, 3 Jun 2026 21:57:19 +0400 Subject: [PATCH] FEAT: Add ArabiziConverter for Arabic transliteration --- doc/code/converters/0_converters.ipynb | 121 +++++++++--------- .../1_text_to_text_converters.ipynb | 3 + .../converters/1_text_to_text_converters.py | 3 + pyrit/prompt_converter/__init__.py | 2 + pyrit/prompt_converter/arabizi_converter.py | 101 +++++++++++++++ .../test_arabizi_converter.py | 78 +++++++++++ 6 files changed, 248 insertions(+), 60 deletions(-) create mode 100644 pyrit/prompt_converter/arabizi_converter.py create mode 100644 tests/unit/prompt_converter/test_arabizi_converter.py diff --git a/doc/code/converters/0_converters.ipynb b/doc/code/converters/0_converters.ipynb index 9b7e091345..9528678038 100644 --- a/doc/code/converters/0_converters.ipynb +++ b/doc/code/converters/0_converters.ipynb @@ -79,66 +79,67 @@ "18 text image_path QRCodeConverter\n", "19 text text AnsiAttackConverter\n", "20 text text ArabicPresentationFormConverter\n", - "21 text text AsciiArtConverter\n", - "22 text text AsciiSmugglerConverter\n", - "23 text text AskToDecodeConverter\n", - "24 text text AtbashConverter\n", - "25 text text Base2048Converter\n", - "26 text text Base64Converter\n", - "27 text text BidiConverter\n", - "28 text text BinAsciiConverter\n", - "29 text text BinaryConverter\n", - "30 text text BrailleConverter\n", - "31 text text CaesarConverter\n", - "32 text text CharSwapConverter\n", - "33 text text CharacterSpaceConverter\n", - "34 text text CodeChameleonConverter\n", - "35 text text ColloquialWordswapConverter\n", - "36 text text DenylistConverter\n", - "37 text text DiacriticConverter\n", - "38 text text EcojiConverter\n", - "39 text text EmojiConverter\n", - "40 text text FirstLetterConverter\n", - "41 text text FlipConverter\n", - "42 text text ImagePromptStyleConverter\n", - "43 text text InsertPunctuationConverter\n", - "44 text text 
JsonStringConverter\n", - "45 text text LLMGenericTextConverter\n", - "46 text text LeetspeakConverter\n", - "47 text text MaliciousQuestionGeneratorConverter\n", - "48 text text MathObfuscationConverter\n", - "49 text text MathPromptConverter\n", - "50 text text MorseConverter\n", - "51 text text NatoConverter\n", - "52 text text NegationTrapConverter\n", - "53 text text NoiseConverter\n", - "54 text text PersuasionConverter\n", - "55 text text ROT13Converter\n", - "56 text text RandomCapitalLettersConverter\n", - "57 text text RandomTranslationConverter\n", - "58 text text RepeatTokenConverter\n", - "59 text text ScientificTranslationConverter\n", - "60 text text SearchReplaceConverter\n", - "61 text text SelectiveTextConverter\n", - "62 text text SneakyBitsSmugglerConverter\n", - "63 text text StringJoinConverter\n", - "64 text text SuffixAppendConverter\n", - "65 text text SuperscriptConverter\n", - "66 text text TatweelConverter\n", - "67 text text TemplateSegmentConverter\n", - "68 text text TenseConverter\n", - "69 text text TextJailbreakConverter\n", - "70 text text ToneConverter\n", - "71 text text ToxicSentenceGeneratorConverter\n", - "72 text text TranslationConverter\n", - "73 text text UnicodeConfusableConverter\n", - "74 text text UnicodeReplacementConverter\n", - "75 text text UnicodeSubstitutionConverter\n", - "76 text text UrlConverter\n", - "77 text text VariationConverter\n", - "78 text text VariationSelectorSmugglerConverter\n", - "79 text text ZalgoConverter\n", - "80 text text ZeroWidthConverter\n" + "21 text text ArabiziConverter\n", + "22 text text AsciiArtConverter\n", + "23 text text AsciiSmugglerConverter\n", + "24 text text AskToDecodeConverter\n", + "25 text text AtbashConverter\n", + "26 text text Base2048Converter\n", + "27 text text Base64Converter\n", + "28 text text BidiConverter\n", + "29 text text BinAsciiConverter\n", + "30 text text BinaryConverter\n", + "31 text text BrailleConverter\n", + "32 text text CaesarConverter\n", + 
"33 text text CharSwapConverter\n", + "34 text text CharacterSpaceConverter\n", + "35 text text CodeChameleonConverter\n", + "36 text text ColloquialWordswapConverter\n", + "37 text text DenylistConverter\n", + "38 text text DiacriticConverter\n", + "39 text text EcojiConverter\n", + "40 text text EmojiConverter\n", + "41 text text FirstLetterConverter\n", + "42 text text FlipConverter\n", + "43 text text ImagePromptStyleConverter\n", + "44 text text InsertPunctuationConverter\n", + "45 text text JsonStringConverter\n", + "46 text text LLMGenericTextConverter\n", + "47 text text LeetspeakConverter\n", + "48 text text MaliciousQuestionGeneratorConverter\n", + "49 text text MathObfuscationConverter\n", + "50 text text MathPromptConverter\n", + "51 text text MorseConverter\n", + "52 text text NatoConverter\n", + "53 text text NegationTrapConverter\n", + "54 text text NoiseConverter\n", + "55 text text PersuasionConverter\n", + "56 text text ROT13Converter\n", + "57 text text RandomCapitalLettersConverter\n", + "58 text text RandomTranslationConverter\n", + "59 text text RepeatTokenConverter\n", + "60 text text ScientificTranslationConverter\n", + "61 text text SearchReplaceConverter\n", + "62 text text SelectiveTextConverter\n", + "63 text text SneakyBitsSmugglerConverter\n", + "64 text text StringJoinConverter\n", + "65 text text SuffixAppendConverter\n", + "66 text text SuperscriptConverter\n", + "67 text text TatweelConverter\n", + "68 text text TemplateSegmentConverter\n", + "69 text text TenseConverter\n", + "70 text text TextJailbreakConverter\n", + "71 text text ToneConverter\n", + "72 text text ToxicSentenceGeneratorConverter\n", + "73 text text TranslationConverter\n", + "74 text text UnicodeConfusableConverter\n", + "75 text text UnicodeReplacementConverter\n", + "76 text text UnicodeSubstitutionConverter\n", + "77 text text UrlConverter\n", + "78 text text VariationConverter\n", + "79 text text VariationSelectorSmugglerConverter\n", + "80 text text 
ZalgoConverter\n", + "81 text text ZeroWidthConverter\n" ] } ], diff --git a/doc/code/converters/1_text_to_text_converters.ipynb b/doc/code/converters/1_text_to_text_converters.ipynb index 96488ef619..62d40c705f 100644 --- a/doc/code/converters/1_text_to_text_converters.ipynb +++ b/doc/code/converters/1_text_to_text_converters.ipynb @@ -277,6 +277,7 @@ "from pyrit.prompt_converter import (\n", " AnsiAttackConverter,\n", " ArabicPresentationFormConverter,\n", + " ArabiziConverter,\n", " BidiConverter,\n", " CharacterSpaceConverter,\n", " CharSwapConverter,\n", @@ -327,6 +328,8 @@ "print(\"Tatweel:\", await TatweelConverter().convert_async(prompt=arabic_prompt)) # type: ignore\n", "# Arabic presentation form substitutes Arabic letters with their isolated glyphs\n", "print(\"Arabic Presentation Form:\", await ArabicPresentationFormConverter().convert_async(prompt=arabic_prompt)) # type: ignore\n", + "# Arabizi transliterates Arabic script into Latin-script chat Arabic\n", + "print(\"Arabizi:\", await ArabiziConverter().convert_async(prompt=arabic_prompt)) # type: ignore\n", "print(\"Superscript:\", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore\n", "print(\"Zalgo:\", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore\n", "\n", diff --git a/doc/code/converters/1_text_to_text_converters.py b/doc/code/converters/1_text_to_text_converters.py index 88df7fec22..e4a7a52bc2 100644 --- a/doc/code/converters/1_text_to_text_converters.py +++ b/doc/code/converters/1_text_to_text_converters.py @@ -88,6 +88,7 @@ from pyrit.prompt_converter import ( AnsiAttackConverter, ArabicPresentationFormConverter, + ArabiziConverter, BidiConverter, CharacterSpaceConverter, CharSwapConverter, @@ -138,6 +139,8 @@ print("Tatweel:", await TatweelConverter().convert_async(prompt=arabic_prompt)) # type: ignore # Arabic presentation form substitutes Arabic letters with their isolated glyphs print("Arabic Presentation Form:", await 
ArabicPresentationFormConverter().convert_async(prompt=arabic_prompt)) # type: ignore +# Arabizi transliterates Arabic script into Latin-script chat Arabic +print("Arabizi:", await ArabiziConverter().convert_async(prompt=arabic_prompt)) # type: ignore print("Superscript:", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore print("Zalgo:", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore diff --git a/pyrit/prompt_converter/__init__.py b/pyrit/prompt_converter/__init__.py index 961e55028f..c186123879 100644 --- a/pyrit/prompt_converter/__init__.py +++ b/pyrit/prompt_converter/__init__.py @@ -19,6 +19,7 @@ from pyrit.prompt_converter.add_text_image_converter import AddTextImageConverter from pyrit.prompt_converter.ansi_escape.ansi_attack_converter import AnsiAttackConverter from pyrit.prompt_converter.arabic_presentation_form_converter import ArabicPresentationFormConverter +from pyrit.prompt_converter.arabizi_converter import ArabiziConverter from pyrit.prompt_converter.ascii_art_converter import AsciiArtConverter from pyrit.prompt_converter.ask_to_decode_converter import AskToDecodeConverter from pyrit.prompt_converter.atbash_converter import AtbashConverter @@ -147,6 +148,7 @@ def __getattr__(name: str) -> object: "AllWordsSelectionStrategy", "AnsiAttackConverter", "ArabicPresentationFormConverter", + "ArabiziConverter", "AsciiArtConverter", "AsciiSmugglerConverter", "AskToDecodeConverter", diff --git a/pyrit/prompt_converter/arabizi_converter.py b/pyrit/prompt_converter/arabizi_converter.py new file mode 100644 index 0000000000..9663e64188 --- /dev/null +++ b/pyrit/prompt_converter/arabizi_converter.py @@ -0,0 +1,101 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+
+import logging
+import re
+import unicodedata
+
+from pyrit.models import PromptDataType
+from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter
+
+logger = logging.getLogger(__name__)
+
+# Arabic script mapped to Arabizi (Latin-script "chat Arabic"), using the widely documented Arabic
+# chat alphabet with Gulf-leaning conventions. Keys are built from code points so the source file
+# stays pure ASCII. The mapping is intentionally lossy (for example THEH and THAL both map to "th"),
+# which mirrors how Arabizi is actually written.
+_ARABIC_TO_ARABIZI: dict[str, str] = {
+    chr(0x0627): "a",  # ALEF
+    chr(0x0628): "b",  # BEH
+    chr(0x062A): "t",  # TEH
+    chr(0x062B): "th",  # THEH
+    chr(0x062C): "j",  # JEEM
+    chr(0x062D): "7",  # HAH
+    chr(0x062E): "5",  # KHAH
+    chr(0x062F): "d",  # DAL
+    chr(0x0630): "th",  # THAL
+    chr(0x0631): "r",  # REH
+    chr(0x0632): "z",  # ZAIN
+    chr(0x0633): "s",  # SEEN
+    chr(0x0634): "sh",  # SHEEN
+    chr(0x0635): "9",  # SAD
+    chr(0x0636): "d",  # DAD
+    chr(0x0637): "6",  # TAH
+    chr(0x0638): "z",  # ZAH
+    chr(0x0639): "3",  # AIN
+    chr(0x063A): "gh",  # GHAIN
+    chr(0x0641): "f",  # FEH
+    chr(0x0642): "8",  # QAF
+    chr(0x0643): "k",  # KAF
+    chr(0x0644): "l",  # LAM
+    chr(0x0645): "m",  # MEEM
+    chr(0x0646): "n",  # NOON
+    chr(0x0647): "h",  # HEH
+    chr(0x0648): "w",  # WAW
+    chr(0x064A): "y",  # YEH
+    chr(0x0621): "2",  # HAMZA
+    chr(0x0622): "2a",  # ALEF WITH MADDA ABOVE
+    chr(0x0623): "a",  # ALEF WITH HAMZA ABOVE
+    chr(0x0625): "a",  # ALEF WITH HAMZA BELOW
+    chr(0x0624): "2",  # WAW WITH HAMZA ABOVE
+    chr(0x0626): "2",  # YEH WITH HAMZA ABOVE
+    chr(0x0629): "a",  # TEH MARBUTA
+    chr(0x0649): "a",  # ALEF MAKSURA
+    chr(0x0640): "",  # TATWEEL (connector, dropped)
+    chr(0x064B): "",  # FATHATAN (short-vowel marks are dropped)
+    chr(0x064C): "",  # DAMMATAN
+    chr(0x064D): "",  # KASRATAN
+    chr(0x064E): "",  # FATHA
+    chr(0x064F): "",  # DAMMA
+    chr(0x0650): "",  # KASRA
+    chr(0x0651): "",  # SHADDA
+    chr(0x0652): "",  # SUKUN
+}
+
+# Translation table built once at import time so each conversion is a single str.translate pass
+# instead of a per-character dictionary lookup in Python code.
+_ARABIZI_TRANSLATION_TABLE: dict[int, str] = str.maketrans(_ARABIC_TO_ARABIZI)
+
+# Runs of characters from the Arabic block (U+0600-U+06FF). Only these runs are NFC-normalized, so a
+# decomposed letter such as ALEF + COMBINING MADDAH ABOVE (U+0653) is composed into the single code
+# point the mapping knows about, while text outside the Arabic block is never rewritten.
+_ARABIC_RUN_PATTERN = re.compile(r"[\u0600-\u06FF]+")
+
+
+def _compose_arabic_run(match: re.Match[str]) -> str:
+    """Return the matched run of Arabic-block characters in NFC (canonically composed) form."""
+    return unicodedata.normalize("NFC", match.group())
+
+
+class ArabiziConverter(PromptConverter):
+    """
+    Transliterates Arabic script into Arabizi (Latin-script "chat Arabic").
+
+    Arabizi is the everyday Latin-script encoding of Arabic used in chat and social media, where
+    letters that have no Latin equivalent are written with digits that resemble their shape (for
+    example HAH becomes 7, AIN becomes 3, and QAF becomes 8). This converter applies a deterministic
+    per-character mapping with Gulf-leaning conventions: no language model is involved, so the same
+    input always produces the same output. The attack surface targeted is tokenizer and safety
+    classifier handling of transliterated Arabic, not the language itself.
+
+    Short-vowel diacritics and the tatweel connector are dropped, and characters outside the Arabic
+    block (Latin text, digits, punctuation) are left unchanged. Arabic-block characters that have no
+    entry in the mapping are passed through as they are. The mapping is intentionally lossy,
+    mirroring how Arabizi is actually written.
+
+    Decomposed hamza and madda sequences (a base letter followed by U+0653, U+0654 or U+0655) are
+    composed to their single-code-point form before mapping, so canonically equivalent inputs
+    produce the same transliteration.
+    """
+
+    SUPPORTED_INPUT_TYPES = ("text",)
+    SUPPORTED_OUTPUT_TYPES = ("text",)
+
+    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
+        """
+        Convert the given prompt by transliterating Arabic script into Arabizi.
+
+        Args:
+            prompt (str): The prompt to be converted.
+            input_type (PromptDataType): The type of input data.
+
+        Returns:
+            ConverterResult: The result containing the transliterated text.
+
+        Raises:
+            ValueError: If the input type is not supported.
+        """
+        if not self.input_supported(input_type):
+            raise ValueError("Input type not supported")
+
+        # Compose decomposed hamza/madda sequences first; their combining marks have no table entry
+        # and would otherwise leak into the Latin output next to the transliterated base letter.
+        composed_prompt = _ARABIC_RUN_PATTERN.sub(_compose_arabic_run, prompt)
+        converted_text = composed_prompt.translate(_ARABIZI_TRANSLATION_TABLE)
+        return ConverterResult(output_text=converted_text, output_type="text")
diff --git a/tests/unit/prompt_converter/test_arabizi_converter.py b/tests/unit/prompt_converter/test_arabizi_converter.py
new file mode 100644
index 0000000000..212721eb50
--- /dev/null
+++ b/tests/unit/prompt_converter/test_arabizi_converter.py
@@ -0,0 +1,78 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import pytest
+
+from pyrit.prompt_converter import ArabiziConverter, ConverterResult
+
+# Arabic code points used below, spelled with chr() so that this file contains only ASCII.
+ALEF = chr(0x0627)
+BEH = chr(0x0628)
+HAH = chr(0x062D)  # expected output: 7
+KHAH = chr(0x062E)  # expected output: 5
+REH = chr(0x0631)
+SHEEN = chr(0x0634)  # expected output: sh
+AIN = chr(0x0639)  # expected output: 3
+QAF = chr(0x0642)  # expected output: 8
+MEEM = chr(0x0645)
+ALEF_MADDA = chr(0x0622)  # expected output: 2a
+FATHA = chr(0x064E)  # short-vowel mark, expected to be removed
+TATWEEL = chr(0x0640)  # elongation connector, expected to be removed
+
+
+def test_input_supported():
+    is_supported = ArabiziConverter().input_supported
+    assert is_supported("text") is True
+    assert is_supported("image") is False
+
+
+async def test_transliterates_word():
+    """MEEM REH HAH BEH ALEF ("marhaba") must come out letter by letter as m, r, 7, b, a."""
+    converted = await ArabiziConverter().convert_async(prompt=MEEM + REH + HAH + BEH + ALEF, input_type="text")
+    assert isinstance(converted, ConverterResult)
+    assert (converted.output_type, converted.output_text) == ("text", "mr7ba")
+
+
+async def test_number_letters():
+    converted = await ArabiziConverter().convert_async(prompt="".join((HAH, KHAH, AIN, QAF)))
+    assert converted.output_text == "7538"
+
+
+async def test_multi_character_mappings():
+    converted = await ArabiziConverter().convert_async(prompt=f"{SHEEN}{ALEF_MADDA}")
+    assert converted.output_text == "sh2a"
+
+
+async def test_diacritics_and_tatweel_are_dropped():
+    """A FATHA after BEH and a TATWEEL between two BEH letters both vanish from the output."""
+    voweled = await ArabiziConverter().convert_async(prompt=BEH + FATHA)
+    stretched = await ArabiziConverter().convert_async(prompt=BEH + TATWEEL + BEH)
+    assert (voweled.output_text, stretched.output_text) == ("b", "bb")
+
+
+async def test_leaves_non_arabic_unchanged():
+    latin_text = "hello 123!"
+    assert (await ArabiziConverter().convert_async(prompt=latin_text)).output_text == latin_text
+
+
+async def test_mixed_text():
+    converted = await ArabiziConverter().convert_async(prompt=f"ok {BEH}")
+    assert converted.output_text == "ok b"
+
+
+async def test_empty_prompt_returns_empty():
+    converted = await ArabiziConverter().convert_async(prompt="")
+    assert converted.output_text == ""
+
+
+async def test_conversion_is_deterministic():
+    """Two conversions of the same prompt on one converter instance must agree."""
+    arabizi = ArabiziConverter()
+    marhaba = "".join((MEEM, REH, HAH, BEH, ALEF))
+    outputs = [(await arabizi.convert_async(prompt=marhaba)).output_text for _ in range(2)]
+    assert outputs[0] == outputs[1]
+
+
+async def test_input_type_not_supported_raises():
+    with pytest.raises(ValueError):
+        await ArabiziConverter().convert_async(prompt=BEH, input_type="image")