Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 61 additions & 60 deletions doc/code/converters/0_converters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,66 +79,67 @@
"18 text image_path QRCodeConverter\n",
"19 text text AnsiAttackConverter\n",
"20 text text ArabicPresentationFormConverter\n",
"21 text text AsciiArtConverter\n",
"22 text text AsciiSmugglerConverter\n",
"23 text text AskToDecodeConverter\n",
"24 text text AtbashConverter\n",
"25 text text Base2048Converter\n",
"26 text text Base64Converter\n",
"27 text text BidiConverter\n",
"28 text text BinAsciiConverter\n",
"29 text text BinaryConverter\n",
"30 text text BrailleConverter\n",
"31 text text CaesarConverter\n",
"32 text text CharSwapConverter\n",
"33 text text CharacterSpaceConverter\n",
"34 text text CodeChameleonConverter\n",
"35 text text ColloquialWordswapConverter\n",
"36 text text DenylistConverter\n",
"37 text text DiacriticConverter\n",
"38 text text EcojiConverter\n",
"39 text text EmojiConverter\n",
"40 text text FirstLetterConverter\n",
"41 text text FlipConverter\n",
"42 text text ImagePromptStyleConverter\n",
"43 text text InsertPunctuationConverter\n",
"44 text text JsonStringConverter\n",
"45 text text LLMGenericTextConverter\n",
"46 text text LeetspeakConverter\n",
"47 text text MaliciousQuestionGeneratorConverter\n",
"48 text text MathObfuscationConverter\n",
"49 text text MathPromptConverter\n",
"50 text text MorseConverter\n",
"51 text text NatoConverter\n",
"52 text text NegationTrapConverter\n",
"53 text text NoiseConverter\n",
"54 text text PersuasionConverter\n",
"55 text text ROT13Converter\n",
"56 text text RandomCapitalLettersConverter\n",
"57 text text RandomTranslationConverter\n",
"58 text text RepeatTokenConverter\n",
"59 text text ScientificTranslationConverter\n",
"60 text text SearchReplaceConverter\n",
"61 text text SelectiveTextConverter\n",
"62 text text SneakyBitsSmugglerConverter\n",
"63 text text StringJoinConverter\n",
"64 text text SuffixAppendConverter\n",
"65 text text SuperscriptConverter\n",
"66 text text TatweelConverter\n",
"67 text text TemplateSegmentConverter\n",
"68 text text TenseConverter\n",
"69 text text TextJailbreakConverter\n",
"70 text text ToneConverter\n",
"71 text text ToxicSentenceGeneratorConverter\n",
"72 text text TranslationConverter\n",
"73 text text UnicodeConfusableConverter\n",
"74 text text UnicodeReplacementConverter\n",
"75 text text UnicodeSubstitutionConverter\n",
"76 text text UrlConverter\n",
"77 text text VariationConverter\n",
"78 text text VariationSelectorSmugglerConverter\n",
"79 text text ZalgoConverter\n",
"80 text text ZeroWidthConverter\n"
"21 text text ArabiziConverter\n",
"22 text text AsciiArtConverter\n",
"23 text text AsciiSmugglerConverter\n",
"24 text text AskToDecodeConverter\n",
"25 text text AtbashConverter\n",
"26 text text Base2048Converter\n",
"27 text text Base64Converter\n",
"28 text text BidiConverter\n",
"29 text text BinAsciiConverter\n",
"30 text text BinaryConverter\n",
"31 text text BrailleConverter\n",
"32 text text CaesarConverter\n",
"33 text text CharSwapConverter\n",
"34 text text CharacterSpaceConverter\n",
"35 text text CodeChameleonConverter\n",
"36 text text ColloquialWordswapConverter\n",
"37 text text DenylistConverter\n",
"38 text text DiacriticConverter\n",
"39 text text EcojiConverter\n",
"40 text text EmojiConverter\n",
"41 text text FirstLetterConverter\n",
"42 text text FlipConverter\n",
"43 text text ImagePromptStyleConverter\n",
"44 text text InsertPunctuationConverter\n",
"45 text text JsonStringConverter\n",
"46 text text LLMGenericTextConverter\n",
"47 text text LeetspeakConverter\n",
"48 text text MaliciousQuestionGeneratorConverter\n",
"49 text text MathObfuscationConverter\n",
"50 text text MathPromptConverter\n",
"51 text text MorseConverter\n",
"52 text text NatoConverter\n",
"53 text text NegationTrapConverter\n",
"54 text text NoiseConverter\n",
"55 text text PersuasionConverter\n",
"56 text text ROT13Converter\n",
"57 text text RandomCapitalLettersConverter\n",
"58 text text RandomTranslationConverter\n",
"59 text text RepeatTokenConverter\n",
"60 text text ScientificTranslationConverter\n",
"61 text text SearchReplaceConverter\n",
"62 text text SelectiveTextConverter\n",
"63 text text SneakyBitsSmugglerConverter\n",
"64 text text StringJoinConverter\n",
"65 text text SuffixAppendConverter\n",
"66 text text SuperscriptConverter\n",
"67 text text TatweelConverter\n",
"68 text text TemplateSegmentConverter\n",
"69 text text TenseConverter\n",
"70 text text TextJailbreakConverter\n",
"71 text text ToneConverter\n",
"72 text text ToxicSentenceGeneratorConverter\n",
"73 text text TranslationConverter\n",
"74 text text UnicodeConfusableConverter\n",
"75 text text UnicodeReplacementConverter\n",
"76 text text UnicodeSubstitutionConverter\n",
"77 text text UrlConverter\n",
"78 text text VariationConverter\n",
"79 text text VariationSelectorSmugglerConverter\n",
"80 text text ZalgoConverter\n",
"81 text text ZeroWidthConverter\n"
]
}
],
Expand Down
3 changes: 3 additions & 0 deletions doc/code/converters/1_text_to_text_converters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@
"from pyrit.prompt_converter import (\n",
" AnsiAttackConverter,\n",
" ArabicPresentationFormConverter,\n",
" ArabiziConverter,\n",
" BidiConverter,\n",
" CharacterSpaceConverter,\n",
" CharSwapConverter,\n",
Expand Down Expand Up @@ -327,6 +328,8 @@
"print(\"Tatweel:\", await TatweelConverter().convert_async(prompt=arabic_prompt)) # type: ignore\n",
"# Arabic presentation form substitutes Arabic letters with their isolated glyphs\n",
"print(\"Arabic Presentation Form:\", await ArabicPresentationFormConverter().convert_async(prompt=arabic_prompt)) # type: ignore\n",
"# Arabizi transliterates Arabic script into Latin-script chat Arabic\n",
"print(\"Arabizi:\", await ArabiziConverter().convert_async(prompt=arabic_prompt)) # type: ignore\n",
"print(\"Superscript:\", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore\n",
"print(\"Zalgo:\", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore\n",
"\n",
Expand Down
3 changes: 3 additions & 0 deletions doc/code/converters/1_text_to_text_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
from pyrit.prompt_converter import (
AnsiAttackConverter,
ArabicPresentationFormConverter,
ArabiziConverter,
BidiConverter,
CharacterSpaceConverter,
CharSwapConverter,
Expand Down Expand Up @@ -138,6 +139,8 @@
print("Tatweel:", await TatweelConverter().convert_async(prompt=arabic_prompt)) # type: ignore
# Arabic presentation form substitutes Arabic letters with their isolated glyphs
print("Arabic Presentation Form:", await ArabicPresentationFormConverter().convert_async(prompt=arabic_prompt)) # type: ignore
# Arabizi transliterates Arabic script into Latin-script chat Arabic
print("Arabizi:", await ArabiziConverter().convert_async(prompt=arabic_prompt)) # type: ignore
print("Superscript:", await SuperscriptConverter().convert_async(prompt=prompt)) # type: ignore
print("Zalgo:", await ZalgoConverter().convert_async(prompt=prompt)) # type: ignore

Expand Down
2 changes: 2 additions & 0 deletions pyrit/prompt_converter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from pyrit.prompt_converter.add_text_image_converter import AddTextImageConverter
from pyrit.prompt_converter.ansi_escape.ansi_attack_converter import AnsiAttackConverter
from pyrit.prompt_converter.arabic_presentation_form_converter import ArabicPresentationFormConverter
from pyrit.prompt_converter.arabizi_converter import ArabiziConverter
from pyrit.prompt_converter.ascii_art_converter import AsciiArtConverter
from pyrit.prompt_converter.ask_to_decode_converter import AskToDecodeConverter
from pyrit.prompt_converter.atbash_converter import AtbashConverter
Expand Down Expand Up @@ -147,6 +148,7 @@ def __getattr__(name: str) -> object:
"AllWordsSelectionStrategy",
"AnsiAttackConverter",
"ArabicPresentationFormConverter",
"ArabiziConverter",
"AsciiArtConverter",
"AsciiSmugglerConverter",
"AskToDecodeConverter",
Expand Down
101 changes: 101 additions & 0 deletions pyrit/prompt_converter/arabizi_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import logging

from pyrit.models import PromptDataType
from pyrit.prompt_converter.prompt_converter import ConverterResult, PromptConverter

logger = logging.getLogger(__name__)

# Lookup table from Arabic script to Arabizi (Latin-script "chat Arabic"), following the widely
# documented Arabic chat alphabet with Gulf-leaning conventions. Entries are written as
# (code point, replacement) pairs and converted to characters with chr(), so this source file
# stays pure ASCII. The table is deliberately lossy (THEH and THAL both become "th", for
# instance), which mirrors how Arabizi is actually written.
_ARABIC_TO_ARABIZI: dict[str, str] = {
    chr(code_point): latin
    for code_point, latin in (
        # Base letters.
        (0x0627, "a"),  # ALEF
        (0x0628, "b"),  # BEH
        (0x062A, "t"),  # TEH
        (0x062B, "th"),  # THEH
        (0x062C, "j"),  # JEEM
        (0x062D, "7"),  # HAH
        (0x062E, "5"),  # KHAH
        (0x062F, "d"),  # DAL
        (0x0630, "th"),  # THAL
        (0x0631, "r"),  # REH
        (0x0632, "z"),  # ZAIN
        (0x0633, "s"),  # SEEN
        (0x0634, "sh"),  # SHEEN
        (0x0635, "9"),  # SAD
        (0x0636, "d"),  # DAD
        (0x0637, "6"),  # TAH
        (0x0638, "z"),  # ZAH
        (0x0639, "3"),  # AIN
        (0x063A, "gh"),  # GHAIN
        (0x0641, "f"),  # FEH
        (0x0642, "8"),  # QAF
        (0x0643, "k"),  # KAF
        (0x0644, "l"),  # LAM
        (0x0645, "m"),  # MEEM
        (0x0646, "n"),  # NOON
        (0x0647, "h"),  # HEH
        (0x0648, "w"),  # WAW
        (0x064A, "y"),  # YEH
        # Hamza, hamza carriers and letter variants.
        (0x0621, "2"),  # HAMZA
        (0x0622, "2a"),  # ALEF WITH MADDA ABOVE
        (0x0623, "a"),  # ALEF WITH HAMZA ABOVE
        (0x0625, "a"),  # ALEF WITH HAMZA BELOW
        (0x0624, "2"),  # WAW WITH HAMZA ABOVE
        (0x0626, "2"),  # YEH WITH HAMZA ABOVE
        (0x0629, "a"),  # TEH MARBUTA
        (0x0649, "a"),  # ALEF MAKSURA
        # Characters mapped to the empty string, i.e. removed from the output.
        (0x0640, ""),  # TATWEEL (connector)
        (0x064B, ""),  # FATHATAN (short-vowel marks start here)
        (0x064C, ""),  # DAMMATAN
        (0x064D, ""),  # KASRATAN
        (0x064E, ""),  # FATHA
        (0x064F, ""),  # DAMMA
        (0x0650, ""),  # KASRA
        (0x0651, ""),  # SHADDA
        (0x0652, ""),  # SUKUN
    )
}


class ArabiziConverter(PromptConverter):
    """
    Transliterates Arabic script into Arabizi (Latin-script "chat Arabic").

    Arabizi is the everyday Latin-script encoding of Arabic used in chat and social media, where
    letters that have no Latin equivalent are written with digits that resemble their shape (for
    example HAH becomes 7, AIN becomes 3, and QAF becomes 8). This converter applies a deterministic
    per-character mapping with Gulf-leaning conventions: no language model is involved, so the same
    input always produces the same output. The attack surface targeted is tokenizer and safety
    classifier handling of transliterated Arabic, not the language itself.

    Short-vowel diacritics and the tatweel connector are dropped. Any character that has no entry
    in the mapping is left unchanged: this covers Latin text, digits and punctuation, and also
    Arabic-block characters the mapping does not list. The mapping is intentionally lossy,
    mirroring how Arabizi is actually written.
    """

    SUPPORTED_INPUT_TYPES = ("text",)
    SUPPORTED_OUTPUT_TYPES = ("text",)

    # Translation table derived once from the module-level mapping. str.translate applies it in a
    # single pass; entries whose replacement is the empty string delete the character.
    _TRANSLATION_TABLE: dict[int, str] = str.maketrans(_ARABIC_TO_ARABIZI)

    async def convert_async(self, *, prompt: str, input_type: PromptDataType = "text") -> ConverterResult:
        """
        Convert the given prompt by transliterating Arabic script into Arabizi.

        Args:
            prompt (str): The prompt to be converted.
            input_type (PromptDataType): The type of input data.

        Returns:
            ConverterResult: The result containing the transliterated text, with output type "text".

        Raises:
            ValueError: If the input type is not supported.
        """
        if not self.input_supported(input_type):
            raise ValueError("Input type not supported")

        # Characters without a table entry pass through untouched.
        converted_text = prompt.translate(self._TRANSLATION_TABLE)
        return ConverterResult(output_text=converted_text, output_type="text")
78 changes: 78 additions & 0 deletions tests/unit/prompt_converter/test_arabizi_converter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import pytest

from pyrit.prompt_converter import ArabiziConverter, ConverterResult

# Arabic letters, spelled with Unicode named escapes so this file stays pure ASCII.
ALEF = "\N{ARABIC LETTER ALEF}"
BEH = "\N{ARABIC LETTER BEH}"
HAH = "\N{ARABIC LETTER HAH}"  # -> 7
KHAH = "\N{ARABIC LETTER KHAH}"  # -> 5
REH = "\N{ARABIC LETTER REH}"
SHEEN = "\N{ARABIC LETTER SHEEN}"  # -> sh
AIN = "\N{ARABIC LETTER AIN}"  # -> 3
QAF = "\N{ARABIC LETTER QAF}"  # -> 8
MEEM = "\N{ARABIC LETTER MEEM}"
ALEF_MADDA = "\N{ARABIC LETTER ALEF WITH MADDA ABOVE}"  # -> 2a
FATHA = "\N{ARABIC FATHA}"  # short-vowel diacritic, dropped
TATWEEL = "\N{ARABIC TATWEEL}"  # connector, dropped


def test_input_supported():
    """The converter accepts "text" input and rejects "image" input."""
    arabizi = ArabiziConverter()
    expectations = {"text": True, "image": False}
    for data_type, expected in expectations.items():
        assert arabizi.input_supported(data_type) is expected


async def test_transliterates_word():
    """A whole word is transliterated letter by letter."""
    # marhaba: MEEM REH HAH BEH ALEF -> m r 7 b a
    marhaba = "".join([MEEM, REH, HAH, BEH, ALEF])
    converted = await ArabiziConverter().convert_async(prompt=marhaba, input_type="text")
    assert isinstance(converted, ConverterResult)
    assert converted.output_type == "text"
    assert converted.output_text == "mr7ba"


async def test_number_letters():
    """Letters without a Latin equivalent are written as look-alike digits."""
    digit_letters = [HAH, KHAH, AIN, QAF]
    expected = "".join(["7", "5", "3", "8"])  # one digit per letter, in the same order
    converted = await ArabiziConverter().convert_async(prompt="".join(digit_letters))
    assert converted.output_text == expected


async def test_multi_character_mappings():
    """A single Arabic letter may expand to more than one Latin character."""
    # SHEEN -> "sh", ALEF WITH MADDA ABOVE -> "2a"
    converter = ArabiziConverter()
    converted = await converter.convert_async(prompt=f"{SHEEN}{ALEF_MADDA}")
    assert converted.output_text == "sh2a"


async def test_diacritics_and_tatweel_are_dropped():
    """Short-vowel marks and the tatweel connector do not appear in the output."""
    # BEH + FATHA -> "b" (diacritic dropped)
    with_fatha = await ArabiziConverter().convert_async(prompt=BEH + FATHA)
    assert with_fatha.output_text == "b"

    # BEH + TATWEEL + BEH -> "bb" (connector dropped)
    with_tatweel = await ArabiziConverter().convert_async(prompt=BEH + TATWEEL + BEH)
    assert with_tatweel.output_text == "bb"


async def test_leaves_non_arabic_unchanged():
    """Latin letters, digits and punctuation pass through untouched."""
    latin_only = "hello 123!"
    converted = await ArabiziConverter().convert_async(prompt=latin_only)
    assert converted.output_text == "hello 123!"


async def test_mixed_text():
    """Only the Arabic part of a mixed Latin/Arabic prompt is rewritten."""
    mixed_prompt = f"ok {BEH}"
    converted = await ArabiziConverter().convert_async(prompt=mixed_prompt)
    assert converted.output_text == "ok b"


async def test_empty_prompt_returns_empty():
    """An empty prompt yields an empty output string."""
    converter = ArabiziConverter()
    converted = await converter.convert_async(prompt="")
    assert converted.output_text == ""


async def test_conversion_is_deterministic():
    """Converting the same prompt twice with one converter gives the same text."""
    converter = ArabiziConverter()
    word = "".join((MEEM, REH, HAH, BEH, ALEF))
    outputs = [(await converter.convert_async(prompt=word)).output_text for _ in range(2)]
    assert outputs[0] == outputs[1]


async def test_input_type_not_supported_raises():
    """An unsupported input type is rejected with ValueError."""
    converter = ArabiziConverter()
    with pytest.raises(ValueError):
        await converter.convert_async(prompt=BEH, input_type="image")
Loading