From 457c19126f644c41ed2a229156e74d320a661bde Mon Sep 17 00:00:00 2001
From: Raulster24 <rahulsri.pccs@gmail.com>
Date: Wed, 3 Jun 2026 21:57:19 +0400
Subject: [PATCH] FEAT: Add ArabiziConverter for Arabic transliteration

---
 doc/code/converters/0_converters.ipynb        | 121 +++++++++---------
 .../1_text_to_text_converters.ipynb           |   3 +
 .../converters/1_text_to_text_converters.py   |   3 +
 pyrit/prompt_converter/__init__.py            |   2 +
 pyrit/prompt_converter/arabizi_converter.py   | 101 +++++++++++++++
 .../test_arabizi_converter.py                 |  78 +++++++++++
 6 files changed, 248 insertions(+), 60 deletions(-)
 create mode 100644 pyrit/prompt_converter/arabizi_converter.py
 create mode 100644 tests/unit/prompt_converter/test_arabizi_converter.py

diff --git a/doc/code/converters/0_converters.ipynb b/doc/code/converters/0_converters.ipynb
index 9b7e091345..9528678038 100644
--- a/doc/code/converters/0_converters.ipynb
+++ b/doc/code/converters/0_converters.ipynb
@@ -79,66 +79,67 @@
       "18             text      image_path                      QRCodeConverter\n",
       "19             text            text                  AnsiAttackConverter\n",
       "20             text            text      ArabicPresentationFormConverter\n",
-      "21             text            text                    AsciiArtConverter\n",
-      "22             text            text               AsciiSmugglerConverter\n",
-      "23             text            text                 AskToDecodeConverter\n",
-      "24             text            text                      AtbashConverter\n",
-      "25             text            text                    Base2048Converter\n",
-      "26             text            text                      Base64Converter\n",
-      "27             text            text                        BidiConverter\n",
-      "28             text            text                    BinAsciiConverter\n",
-      "29             text            text                      BinaryConverter\n",
-      "30             text            text                     BrailleConverter\n",
-      "31             text            text                      CaesarConverter\n",
-      "32             text            text                    CharSwapConverter\n",
-      "33             text            text              CharacterSpaceConverter\n",
-      "34             text            text               CodeChameleonConverter\n",
-      "35             text            text          ColloquialWordswapConverter\n",
-      "36             text            text                    DenylistConverter\n",
-      "37             text            text                   DiacriticConverter\n",
-      "38             text            text                       EcojiConverter\n",
-      "39             text            text                       EmojiConverter\n",
-      "40             text            text                 FirstLetterConverter\n",
-      "41             text            text                        FlipConverter\n",
-      "42             text            text            ImagePromptStyleConverter\n",
-      "43             text            text           InsertPunctuationConverter\n",
-      "44             text            text                  JsonStringConverter\n",
-      "45             text            text              LLMGenericTextConverter\n",
-      "46             text            text                   LeetspeakConverter\n",
-      "47             text            text  MaliciousQuestionGeneratorConverter\n",
-      "48             text            text             MathObfuscationConverter\n",
-      "49             text            text                  MathPromptConverter\n",
-      "50             text            text                       MorseConverter\n",
-      "51             text            text                        NatoConverter\n",
-      "52             text            text                NegationTrapConverter\n",
-      "53             text            text                       NoiseConverter\n",
-      "54             text            text                  PersuasionConverter\n",
-      "55             text            text                       ROT13Converter\n",
-      "56             text            text        RandomCapitalLettersConverter\n",
-      "57             text            text           RandomTranslationConverter\n",
-      "58             text            text                 RepeatTokenConverter\n",
-      "59             text            text       ScientificTranslationConverter\n",
-      "60             text            text               SearchReplaceConverter\n",
-      "61             text            text               SelectiveTextConverter\n",
-      "62             text            text          SneakyBitsSmugglerConverter\n",
-      "63             text            text                  StringJoinConverter\n",
-      "64             text            text                SuffixAppendConverter\n",
-      "65             text            text                 SuperscriptConverter\n",
-      "66             text            text                     TatweelConverter\n",
-      "67             text            text             TemplateSegmentConverter\n",
-      "68             text            text                       TenseConverter\n",
-      "69             text            text               TextJailbreakConverter\n",
-      "70             text            text                        ToneConverter\n",
-      "71             text            text      ToxicSentenceGeneratorConverter\n",
-      "72             text            text                 TranslationConverter\n",
-      "73             text            text           UnicodeConfusableConverter\n",
-      "74             text            text          UnicodeReplacementConverter\n",
-      "75             text            text         UnicodeSubstitutionConverter\n",
-      "76             text            text                         UrlConverter\n",
-      "77             text            text                   VariationConverter\n",
-      "78             text            text   VariationSelectorSmugglerConverter\n",
-      "79             text            text                       ZalgoConverter\n",
-      "80             text            text                   ZeroWidthConverter\n"
+      "21             text            text                     ArabiziConverter\n",
+      "22             text            text                    AsciiArtConverter\n",
+      "23             text            text               AsciiSmugglerConverter\n",
+      "24             text            text                 AskToDecodeConverter\n",
+      "25             text            text                      AtbashConverter\n",
+      "26             text            text                    Base2048Converter\n",
+      "27             text            text                      Base64Converter\n",
+      "28             text            text                        BidiConverter\n",
+      "29             text            text                    BinAsciiConverter\n",
+      "30             text            text                      BinaryConverter\n",
+      "31             text            text                     BrailleConverter\n",
+      "32             text            text                      CaesarConverter\n",
+      "33             text            text                    CharSwapConverter\n",
+      "34             text            text              CharacterSpaceConverter\n",
+      "35             text            text               CodeChameleonConverter\n",
+      "36             text            text          ColloquialWordswapConverter\n",
+      "37             text            text                    DenylistConverter\n",
+      "38             text            text                   DiacriticConverter\n",
+      "39             text            text                       EcojiConverter\n",
+      "40             text            text                       EmojiConverter\n",
+      "41             text            text                 FirstLetterConverter\n",
+      "42             text            text                        FlipConverter\n",
+      "43             text            text            ImagePromptStyleConverter\n",
+      "44             text            text           InsertPunctuationConverter\n",
+      "45             text            text                  JsonStringConverter\n",
+      "46             text            text              LLMGenericTextConverter\n",
+      "47             text            text                   LeetspeakConverter\n",
+      "48             text            text  MaliciousQuestionGeneratorConverter\n",
+      "49             text            text             MathObfuscationConverter\n",
+      "50             text            text                  MathPromptConverter\n",
+      "51             text            text                       MorseConverter\n",
+      "52             text            text                        NatoConverter\n",
+      "53             text            text                NegationTrapConverter\n",
+      "54             text            text                       NoiseConverter\n",
+      "55             text            text                  PersuasionConverter\n",
+      "56             text            text                       ROT13Converter\n",
+      "57             text            text        RandomCapitalLettersConverter\n",
+      "58             text            text           RandomTranslationConverter\n",
+      "59             text            text                 RepeatTokenConverter\n",
+      "60             text            text       ScientificTranslationConverter\n",
+      "61             text            text               SearchReplaceConverter\n",
+      "62             text            text               SelectiveTextConverter\n",
+      "63             text            text          SneakyBitsSmugglerConverter\n",
+      "64             text            text                  StringJoinConverter\n",
+      "65             text            text                SuffixAppendConverter\n",
+      "66             text            text                 SuperscriptConverter\n",
+      "67             text            text                     TatweelConverter\n",
+      "68             text            text             TemplateSegmentConverter\n",
+      "69             text            text                       TenseConverter\n",
+      "70             text            text               TextJailbreakConverter\n",
+      "71             text            text                        ToneConverter\n",
+      "72             text            text      ToxicSentenceGeneratorConverter\n",
+      "73             text            text                 TranslationConverter\n",
+      "74             text            text           UnicodeConfusableConverter\n",
+      "75             text            text          UnicodeReplacementConverter\n",
+      "76             text            text         UnicodeSubstitutionConverter\n",
+      "77             text            text                         UrlConverter\n",
+      "78             text            text                   VariationConverter\n",
+      "79             text            text   VariationSelectorSmugglerConverter\n",
+      "80             text            text                       ZalgoConverter\n",
+      "81             text            text                   ZeroWidthConverter\n"
      ]
     }
    ],
diff --git a/doc/code/converters/1_text_to_text_converters.ipynb b/doc/code/converters/1_text_to_text_converters.ipynb
index 96488ef619..62d40c705f 100644
--- a/doc/code/converters/1_text_to_text_converters.ipynb
+++ b/doc/code/converters/1_text_to_text_converters.ipynb
@@ -277,6 +277,7 @@
     "from pyrit.prompt_converter import (\n",
     "    AnsiAttackConverter,\n",
     "    ArabicPresentationFormConverter,\n",
+    "    ArabiziConverter,\n",
     "    BidiConverter,\n",
     "    CharacterSpaceConverter,\n",
     "    CharSwapConverter,\n",
@@ -327,6 +328,8 @@
     "print(\"Tatweel:\", await TatweelConverter().convert_async(prompt=arabic_prompt))  # type: ignore\n",
     "# Arabic presentation form substitutes Arabic letters with their isolated glyphs\n",
     "print(\"Arabic Presentation Form:\", await ArabicPresentationFormConverter().convert_async(prompt=arabic_prompt))  # type: ignore\n",
+    "# Arabizi transliterates Arabic script into Latin-script chat Arabic\n",
+    "print(\"Arabizi:\", await ArabiziConverter().convert_async(prompt=arabic_prompt))  # type: ignore\n",
     "print(\"Superscript:\", await SuperscriptConverter().convert_async(prompt=prompt))  # type: ignore\n",
     "print(\"Zalgo:\", await ZalgoConverter().convert_async(prompt=prompt))  # type: ignore\n",
     "\n",
diff --git a/doc/code/converters/1_text_to_text_converters.py b/doc/code/converters/1_text_to_text_converters.py
index 88df7fec22..e4a7a52bc2 100644
--- a/doc/code/converters/1_text_to_text_converters.py
+++ b/doc/code/converters/1_text_to_text_converters.py
@@ -88,6 +88,7 @@
 from pyrit.prompt_converter import (
     AnsiAttackConverter,
     ArabicPresentationFormConverter,
+    ArabiziConverter,
     BidiConverter,
     CharacterSpaceConverter,
     CharSwapConverter,
@@ -138,6 +139,8 @@
 print("Tatweel:", await TatweelConverter().convert_async(prompt=arabic_prompt))  # type: ignore
 # Arabic presentation form substitutes Arabic letters with their isolated glyphs
 print("Arabic Presentation Form:", await ArabicPresentationFormConverter().convert_async(prompt=arabic_prompt))  # type: ignore
+# Arabizi transliterates Arabic script into Latin-script chat Arabic
+print("Arabizi:", await ArabiziConverter().convert_async(prompt=arabic_prompt))  # type: ignore
 print("Superscript:", await SuperscriptConverter().convert_async(prompt=prompt))  # type: ignore
 print("Zalgo:", await ZalgoConverter().convert_async(prompt=prompt))  # type: ignore
 
diff --git a/pyrit/prompt_converter/__init__.py b/pyrit/prompt_converter/__init__.py
index 961e55028f..c186123879 100644
--- a/pyrit/prompt_converter/__init__.py
+++ b/pyrit/prompt_converter/__init__.py
@@ -19,6 +19,7 @@
 from pyrit.prompt_converter.add_text_image_converter import AddTextImageConverter
 from pyrit.prompt_converter.ansi_escape.ansi_attack_converter import AnsiAttackConverter
 from pyrit.prompt_converter.arabic_presentation_form_converter import ArabicPresentationFormConverter
+from pyrit.prompt_converter.arabizi_converter import ArabiziConverter
 from pyrit.prompt_converter.ascii_art_converter import AsciiArtConverter
 from pyrit.prompt_converter.ask_to_decode_converter import AskToDecodeConverter
 from pyrit.prompt_converter.atbash_converter import AtbashConverter
@@ -147,6 +148,7 @@ def __getattr__(name: str) -> object:
     "AllWordsSelectionStrategy",
     "AnsiAttackConverter",
     "ArabicPresentationFormConverter",
+    "ArabiziConverter",
     "AsciiArtConverter",
     "AsciiSmugglerConverter",
     "AskToDecodeConverter",
diff --git a/pyrit/prompt_converter/arabizi_converter.py b/pyrit/prompt_converter/arabizi_converter.py
new file mode 100644
index 0000000000..9663e64188
--- /dev/null
+++ b/pyrit/prompt_converter/arabizi_converter.py
@@ -0,0 +1,101 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import logging
+
+from pyrit.models import PromptDataType
+from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter
+
+logger = logging.getLogger(__name__)
+
+# Arabic script mapped to Arabizi (Latin-script "chat Arabic"), using the widely documented Arabic
+# chat alphabet with Gulf-leaning conventions. Keys are built from code points so the source file
+# stays pure ASCII. The mapping is intentionally lossy (for example THEH and THAL both map to "th"),
+# which mirrors how Arabizi is actually written.
+_ARABIC_TO_ARABIZI: dict[str, str] = {
+    chr(0x0627): "a",  # ALEF
+    chr(0x0628): "b",  # BEH
+    chr(0x062A): "t",  # TEH
+    chr(0x062B): "th",  # THEH
+    chr(0x062C): "j",  # JEEM
+    chr(0x062D): "7",  # HAH
+    chr(0x062E): "5",  # KHAH
+    chr(0x062F): "d",  # DAL
+    chr(0x0630): "th",  # THAL
+    chr(0x0631): "r",  # REH
+    chr(0x0632): "z",  # ZAIN
+    chr(0x0633): "s",  # SEEN
+    chr(0x0634): "sh",  # SHEEN
+    chr(0x0635): "9",  # SAD
+    chr(0x0636): "d",  # DAD
+    chr(0x0637): "6",  # TAH
+    chr(0x0638): "z",  # ZAH
+    chr(0x0639): "3",  # AIN
+    chr(0x063A): "gh",  # GHAIN
+    chr(0x0641): "f",  # FEH
+    chr(0x0642): "8",  # QAF
+    chr(0x0643): "k",  # KAF
+    chr(0x0644): "l",  # LAM
+    chr(0x0645): "m",  # MEEM
+    chr(0x0646): "n",  # NOON
+    chr(0x0647): "h",  # HEH
+    chr(0x0648): "w",  # WAW
+    chr(0x064A): "y",  # YEH
+    chr(0x0621): "2",  # HAMZA
+    chr(0x0622): "2a",  # ALEF WITH MADDA ABOVE
+    chr(0x0623): "a",  # ALEF WITH HAMZA ABOVE
+    chr(0x0625): "a",  # ALEF WITH HAMZA BELOW
+    chr(0x0624): "2",  # WAW WITH HAMZA ABOVE
+    chr(0x0626): "2",  # YEH WITH HAMZA ABOVE
+    chr(0x0629): "a",  # TEH MARBUTA
+    chr(0x0649): "a",  # ALEF MAKSURA
+    chr(0x0640): "",  # TATWEEL (connector, dropped)
+    chr(0x064B): "",  # FATHATAN (short-vowel marks are dropped)
+    chr(0x064C): "",  # DAMMATAN
+    chr(0x064D): "",  # KASRATAN
+    chr(0x064E): "",  # FATHA
+    chr(0x064F): "",  # DAMMA
+    chr(0x0650): "",  # KASRA
+    chr(0x0651): "",  # SHADDA
+    chr(0x0652): "",  # SUKUN
+}
+
+
+class ArabiziConverter(PromptConverter):
+    """
+    Transliterates Arabic script into Arabizi (Latin-script "chat Arabic").
+
+    Arabizi is the everyday Latin-script encoding of Arabic used in chat and social media, where
+    letters that have no Latin equivalent are written with digits that resemble their shape (for
+    example HAH becomes 7, AIN becomes 3, and QAF becomes 8). This converter applies a deterministic
+    per-character mapping with Gulf-leaning conventions: no language model is involved, so the same
+    input always produces the same output. The attack surface targeted is tokenizer and safety
+    classifier handling of transliterated Arabic, not the language itself.
+
+    Diacritics (short vowels, tanween, shadda, sukun) and the tatweel connector are dropped. Any character
+    without a mapping entry is left unchanged: Latin text, ASCII digits, punctuation, and unmapped Arabic-block
+    code points such as Arabic-Indic digits. The mapping is intentionally lossy, mirroring how Arabizi is written.
+    """
+
+    SUPPORTED_INPUT_TYPES = ("text",)
+    SUPPORTED_OUTPUT_TYPES = ("text",)
+
+    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
+        """
+        Convert the given prompt by transliterating Arabic script into Arabizi.
+
+        Args:
+            prompt (str): The prompt to be converted.
+            input_type (PromptDataType): The type of input data.
+
+        Returns:
+            ConverterResult: The result containing the transliterated text.
+
+        Raises:
+            ValueError: If the input type is not supported.
+        """
+        if not self.input_supported(input_type):
+            raise ValueError("Input type not supported")
+
+        converted_text = "".join(_ARABIC_TO_ARABIZI.get(char, char) for char in prompt)
+        return ConverterResult(output_text=converted_text, output_type="text")
diff --git a/tests/unit/prompt_converter/test_arabizi_converter.py b/tests/unit/prompt_converter/test_arabizi_converter.py
new file mode 100644
index 0000000000..212721eb50
--- /dev/null
+++ b/tests/unit/prompt_converter/test_arabizi_converter.py
@@ -0,0 +1,78 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import pytest
+
+from pyrit.prompt_converter import ArabiziConverter, ConverterResult
+
+# Arabic letters (built from code points to keep this file pure ASCII).
+ALEF = chr(0x0627)
+BEH = chr(0x0628)
+HAH = chr(0x062D)  # -> 7
+KHAH = chr(0x062E)  # -> 5
+REH = chr(0x0631)
+SHEEN = chr(0x0634)  # -> sh
+AIN = chr(0x0639)  # -> 3
+QAF = chr(0x0642)  # -> 8
+MEEM = chr(0x0645)
+ALEF_MADDA = chr(0x0622)  # -> 2a
+FATHA = chr(0x064E)  # short-vowel diacritic, dropped
+TATWEEL = chr(0x0640)  # connector, dropped
+
+
+def test_input_supported():
+    converter = ArabiziConverter()
+    assert converter.input_supported("text") is True
+    assert converter.input_supported("image") is False
+
+
+async def test_transliterates_word():
+    # marhaba: MEEM REH HAH BEH ALEF -> m r 7 b a
+    result = await ArabiziConverter().convert_async(prompt=MEEM + REH + HAH + BEH + ALEF, input_type="text")
+    assert isinstance(result, ConverterResult)
+    assert result.output_type == "text"
+    assert result.output_text == "mr7ba"
+
+
+async def test_number_letters():
+    result = await ArabiziConverter().convert_async(prompt=HAH + KHAH + AIN + QAF)
+    assert result.output_text == "753" + "8"
+
+
+async def test_multi_character_mappings():
+    result = await ArabiziConverter().convert_async(prompt=SHEEN + ALEF_MADDA)
+    assert result.output_text == "sh2a"
+
+
+async def test_diacritics_and_tatweel_are_dropped():
+    # BEH + FATHA -> "b" (diacritic dropped); BEH + TATWEEL + BEH -> "bb"
+    assert (await ArabiziConverter().convert_async(prompt=BEH + FATHA)).output_text == "b"
+    assert (await ArabiziConverter().convert_async(prompt=BEH + TATWEEL + BEH)).output_text == "bb"
+
+
+async def test_leaves_non_arabic_unchanged():
+    result = await ArabiziConverter().convert_async(prompt="hello 123!")
+    assert result.output_text == "hello 123!"
+
+
+async def test_mixed_text():
+    result = await ArabiziConverter().convert_async(prompt="ok " + BEH)
+    assert result.output_text == "ok b"
+
+
+async def test_empty_prompt_returns_empty():
+    result = await ArabiziConverter().convert_async(prompt="")
+    assert result.output_text == ""
+
+
+async def test_conversion_is_deterministic():
+    converter = ArabiziConverter()
+    prompt = MEEM + REH + HAH + BEH + ALEF
+    first = await converter.convert_async(prompt=prompt)
+    second = await converter.convert_async(prompt=prompt)
+    assert first.output_text == second.output_text
+
+
+async def test_input_type_not_supported_raises():
+    with pytest.raises(ValueError):
+        await ArabiziConverter().convert_async(prompt=BEH, input_type="image")