Skip to content

Commit c05c6a4

Browse files
stephentoub, Copilot, and danmoseley
authored
Reduce backtracking for greedy loops followed by subsumed literals (#125636)
When a greedy character loop (like `\w+`, `\d+`, `[a-z]+`) is followed by a literal that's part of the loop's character class, backtracking normally requires repeated `LastIndexOf` calls to find viable positions. However, if whatever comes *after* that literal is disjoint from the loop's class, then only the very last position consumed by the loop can possibly succeed — at every earlier position, the character following the literal is one the loop itself consumed (and so is in the loop's class), whereas the disjoint subsequent construct requires something outside that class.

For example, in `\b\w+n\b`, the `\w+` loop is followed by `n` (which is in `\w`), and `n` is followed by `\b`. Since the loop only matches word characters, any position in the middle of the loop's consumed range would have a word character after the `n`, and the `\b` boundary wouldn't be satisfied there. Only the very last consumed position can work, so backtracking can skip directly to it rather than searching backward one position at a time.

---------

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Co-authored-by: Dan Moseley <danmose@microsoft.com>
1 parent 4687f9a commit c05c6a4

5 files changed

Lines changed: 381 additions & 129 deletions

File tree

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 49 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -3482,19 +3482,59 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
34823482
subsequent?.FindStartingLiteralNode() is RegexNode literalNode &&
34833483
TryEmitIndexOf(requiredHelpers, literalNode, useLast: true, negate: false, out int literalLength, out string? indexOfExpr))
34843484
{
3485-
writer.WriteLine($"if ({startingPos} >= {endingPos} ||");
3485+
// If CanReduceLoopBacktrackingToSinglePosition determines only the last consumed character
3486+
// can succeed, we can just check it directly instead of repeatedly searching with LastIndexOf.
3487+
if (subsequent is not null && RegexNode.CanReduceLoopBacktrackingToSinglePosition(node, subsequent))
3488+
{
3489+
using (EmitBlock(writer, $"if ({startingPos} >= {endingPos})"))
3490+
{
3491+
Goto(doneLabel);
3492+
}
3493+
writer.WriteLine($"{endingPos}--;");
34863494

3487-
string setEndingPosCondition = $" ({endingPos} = inputSpan.Slice({startingPos}, ";
3488-
setEndingPosCondition = literalLength > 1 ?
3489-
$"{setEndingPosCondition}Math.Min(inputSpan.Length, {endingPos} + {literalLength - 1}) - {startingPos})" :
3490-
$"{setEndingPosCondition}{endingPos} - {startingPos})";
3495+
string charAccessExpr = $"inputSpan[{endingPos}]";
3496+
string condition = literalNode.Kind switch
3497+
{
3498+
RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy =>
3499+
$"{charAccessExpr} != {Literal(literalNode.Ch)}",
3500+
RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy =>
3501+
$"{charAccessExpr} == {Literal(literalNode.Ch)}",
3502+
RegexNodeKind.Multi =>
3503+
$"{charAccessExpr} != {Literal(literalNode.Str![0])}",
3504+
_ => // Set, Setloop, Setloopatomic, Setlazy
3505+
$"!{MatchCharacterClass(charAccessExpr, literalNode.Str!, negate: false, additionalDeclarations, requiredHelpers)}",
3506+
};
3507+
using (EmitBlock(writer, $"if ({condition})"))
3508+
{
3509+
Goto(doneLabel);
3510+
}
34913511

3492-
using (EmitBlock(writer, $"{setEndingPosCondition}.{indexOfExpr}) < 0)"))
3512+
writer.WriteLine($"pos = {endingPos};");
3513+
3514+
// We've now checked the only backtrack position that can succeed. Force any
3515+
// subsequent backtrack to fail immediately by setting endingPos to 0, which
3516+
// guarantees the "startingPos >= endingPos" guard (emitted above) will be true
3517+
// on re-entry. Note: if the emitter's backtracking structure changes (e.g. the
3518+
// guard condition or how endingPos is used beyond the guard and stack save/restore),
3519+
// this assumption would need to be revisited.
3520+
writer.WriteLine($"{endingPos} = 0;");
3521+
}
3522+
else
34933523
{
3494-
Goto(doneLabel);
3524+
writer.WriteLine($"if ({startingPos} >= {endingPos} ||");
3525+
3526+
string setEndingPosCondition = $" ({endingPos} = inputSpan.Slice({startingPos}, ";
3527+
setEndingPosCondition = literalLength > 1 ?
3528+
$"{setEndingPosCondition}Math.Min(inputSpan.Length, {endingPos} + {literalLength - 1}) - {startingPos})" :
3529+
$"{setEndingPosCondition}{endingPos} - {startingPos})";
3530+
3531+
using (EmitBlock(writer, $"{setEndingPosCondition}.{indexOfExpr}) < 0)"))
3532+
{
3533+
Goto(doneLabel);
3534+
}
3535+
writer.WriteLine($"{endingPos} += {startingPos};");
3536+
writer.WriteLine($"pos = {endingPos};");
34953537
}
3496-
writer.WriteLine($"{endingPos} += {startingPos};");
3497-
writer.WriteLine($"pos = {endingPos};");
34983538
}
34993539
else
35003540
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCharClass.cs

Lines changed: 66 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -879,6 +879,72 @@ static bool MayOverlapByEnumeration(string set1, string set2)
879879
}
880880
}
881881

882+
/// <summary>
883+
/// Conservatively determines whether every character matched by <paramref name="subset"/> is also matched by <paramref name="superset"/>.
884+
/// A <see langword="true"/> result is definitive; <see langword="false"/> means either that some character of subset is not in superset or that the relationship could not be proven by the checks performed here.
885+
/// </summary>
886+
public static bool IsSubsetOf(string subset, string superset)
887+
{
888+
// Fast path: identical set strings describe the same set, which is trivially a subset of itself.
889+
if (subset == superset)
890+
{
891+
return true;
892+
}
893+
894+
// Fast path: if superset is the universal set (AnyClass), everything is a subset and subset need not be inspected.
895+
if (superset == AnyClass)
896+
{
897+
return true;
898+
}
899+
900+
// If subset is not negated and can be easily enumerated, walk its (start, exclusive end) range pairs and require that every character is in superset; the first miss disproves the relationship.
901+
if (!IsNegated(subset) && CanEasilyEnumerateSetContents(subset))
902+
{
903+
for (int i = SetStartIndex; i < SetStartIndex + subset[SetLengthIndex]; i += 2)
904+
{
905+
int curSetEnd = subset[i + 1];
906+
for (int c = subset[i]; c < curSetEnd; c++)
907+
{
908+
if (!CharInClass((char)c, superset))
909+
{
910+
return false;
911+
}
912+
}
913+
}
914+
915+
return true;
916+
}
917+
918+
// Otherwise, if both sets are composed entirely of Unicode categories and neither is negated,
919+
// report a subset only when each category in subset is also present among superset's categories.
920+
Span<UnicodeCategory> categories1 = stackalloc UnicodeCategory[16], categories2 = stackalloc UnicodeCategory[16];
921+
if (TryGetOnlyCategories(subset, categories1, out int numCategories1, out bool negated1) && !negated1 &&
922+
TryGetOnlyCategories(superset, categories2, out int numCategories2, out bool negated2) && !negated2)
923+
{
924+
foreach (UnicodeCategory cat1 in categories1.Slice(0, numCategories1))
925+
{
926+
bool found = false;
927+
foreach (UnicodeCategory cat2 in categories2.Slice(0, numCategories2))
928+
{
929+
if (cat1 == cat2)
930+
{
931+
found = true;
932+
break;
933+
}
934+
}
935+
936+
if (!found)
937+
{
938+
return false;
939+
}
940+
}
941+
942+
return true;
943+
}
944+
945+
return false;
946+
}
947+
882948
/// <summary>
883949
/// Gets whether the specified set is a named set with a reasonably small count
884950
/// of Unicode characters.
@@ -1127,55 +1193,6 @@ public static bool IsWordChar(char ch)
11271193
(WordCategoriesMask & (1 << (int)CharUnicodeInfo.GetUnicodeCategory(ch))) != 0;
11281194
}
11291195

1130-
/// <summary>Determines whether the characters that match the specified set are known to all be word characters.</summary>
1131-
public static bool IsKnownWordClassSubset(string set)
1132-
{
1133-
// Check for common sets that we know to be subsets of \w.
1134-
if (set is
1135-
WordClass or DigitClass or LetterClass or LetterOrDigitClass or
1136-
AsciiLetterClass or AsciiLetterOrDigitClass or
1137-
HexDigitClass or HexDigitUpperClass or HexDigitLowerClass)
1138-
{
1139-
return true;
1140-
}
1141-
1142-
// Check for sets composed of Unicode categories that are part of \w.
1143-
Span<UnicodeCategory> categories = stackalloc UnicodeCategory[16];
1144-
if (TryGetOnlyCategories(set, categories, out int numCategories, out bool negated) && !negated)
1145-
{
1146-
foreach (UnicodeCategory cat in categories.Slice(0, numCategories))
1147-
{
1148-
if (!IsWordCategory(cat))
1149-
{
1150-
return false;
1151-
}
1152-
}
1153-
1154-
return true;
1155-
}
1156-
1157-
// If we can enumerate every character in the set quickly, do so, checking to see whether they're all in \w.
1158-
if (CanEasilyEnumerateSetContents(set))
1159-
{
1160-
for (int i = SetStartIndex; i < SetStartIndex + set[SetLengthIndex]; i += 2)
1161-
{
1162-
int curSetEnd = set[i + 1];
1163-
for (int c = set[i]; c < curSetEnd; c++)
1164-
{
1165-
if (!CharInClass((char)c, WordClass))
1166-
{
1167-
return false;
1168-
}
1169-
}
1170-
}
1171-
1172-
return true;
1173-
}
1174-
1175-
// Unlikely to be a subset of \w, and we don't know for sure.
1176-
return false;
1177-
}
1178-
11791196
/// <summary>Determines whether a character is considered a word character for the purposes of testing a word character boundary.</summary>
11801197
public static bool IsBoundaryWordChar(char ch)
11811198
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 92 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -3655,44 +3655,103 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
36553655
subsequent?.FindStartingLiteralNode() is RegexNode literal &&
36563656
CanEmitIndexOf(literal, out int literalLength))
36573657
{
3658-
// endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal);
3659-
// if (endingPos < 0)
3660-
// {
3661-
// goto doneLabel;
3662-
// }
3663-
Ldloca(inputSpan);
3664-
Ldloc(startingPos);
3665-
if (literalLength > 1)
3658+
// If CanReduceLoopBacktrackingToSinglePosition determines only the last consumed character
3659+
// can succeed, we can just check it directly instead of repeatedly searching with LastIndexOf.
3660+
if (subsequent is not null && RegexNode.CanReduceLoopBacktrackingToSinglePosition(node, subsequent))
36663661
{
3667-
// Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos
3662+
// endingPos--;
3663+
Ldloc(endingPos);
3664+
Ldc(1);
3665+
Sub();
3666+
Stloc(endingPos);
3667+
3668+
// if (!literal.Matches(inputSpan[endingPos])) goto doneLabel;
36683669
Ldloca(inputSpan);
3669-
Call(SpanGetLengthMethod);
36703670
Ldloc(endingPos);
3671-
Ldc(literalLength - 1);
3672-
Add();
3673-
Call(MathMinIntIntMethod);
3671+
Call(SpanGetItemMethod);
3672+
LdindU2();
3673+
switch (literal.Kind)
3674+
{
3675+
case RegexNodeKind.One or RegexNodeKind.Oneloop or RegexNodeKind.Oneloopatomic or RegexNodeKind.Onelazy:
3676+
Ldc(literal.Ch);
3677+
BneFar(doneLabel);
3678+
break;
3679+
3680+
case RegexNodeKind.Notone or RegexNodeKind.Notoneloop or RegexNodeKind.Notoneloopatomic or RegexNodeKind.Notonelazy:
3681+
Ldc(literal.Ch);
3682+
BeqFar(doneLabel);
3683+
break;
3684+
3685+
case RegexNodeKind.Multi:
3686+
Ldc(literal.Str![0]);
3687+
BneFar(doneLabel);
3688+
break;
3689+
3690+
default: // Set, Setloop, Setloopatomic, Setlazy
3691+
EmitMatchCharacterClass(literal.Str!);
3692+
BrfalseFar(doneLabel);
3693+
break;
3694+
}
3695+
3696+
// pos = endingPos;
3697+
Ldloc(endingPos);
3698+
Stloc(pos);
3699+
3700+
// We've now checked the only backtrack position that can succeed. Force any
3701+
// subsequent backtrack to fail immediately by setting endingPos to 0, which
3702+
// guarantees the "startingPos >= endingPos" guard (emitted above) will be true
3703+
// on re-entry. Note: if the emitter's backtracking structure changes (e.g. the
3704+
// guard condition or how endingPos is used beyond the guard and stack save/restore),
3705+
// this assumption would need to be revisited.
3706+
// endingPos = 0;
3707+
Ldc(0);
3708+
Stloc(endingPos);
36743709
}
36753710
else
36763711
{
3677-
// endingPos - startingPos
3678-
Ldloc(endingPos);
3679-
}
3680-
Ldloc(startingPos);
3681-
Sub();
3682-
Call(SpanSliceIntIntMethod);
3712+
// endingPos = inputSpan.Slice(startingPos, Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos).LastIndexOf(literal);
3713+
// if (endingPos < 0)
3714+
// {
3715+
// goto doneLabel;
3716+
// }
3717+
Ldloca(inputSpan);
3718+
Ldloc(startingPos);
3719+
if (literalLength > 1)
3720+
{
3721+
// Math.Min(inputSpan.Length, endingPos + literal.Length - 1) - startingPos
3722+
Ldloca(inputSpan);
3723+
Call(SpanGetLengthMethod);
3724+
Ldloc(endingPos);
3725+
Ldc(literalLength - 1);
3726+
Add();
3727+
Call(MathMinIntIntMethod);
3728+
}
3729+
else
3730+
{
3731+
// endingPos - startingPos
3732+
Ldloc(endingPos);
3733+
}
3734+
Ldloc(startingPos);
3735+
Sub();
3736+
Call(SpanSliceIntIntMethod);
36833737

3684-
EmitIndexOf(literal, useLast: true, negate: false);
3685-
Stloc(endingPos);
3738+
EmitIndexOf(literal, useLast: true, negate: false);
3739+
Stloc(endingPos);
36863740

3687-
Ldloc(endingPos);
3688-
Ldc(0);
3689-
BltFar(doneLabel);
3741+
Ldloc(endingPos);
3742+
Ldc(0);
3743+
BltFar(doneLabel);
36903744

3691-
// endingPos += startingPos;
3692-
Ldloc(endingPos);
3693-
Ldloc(startingPos);
3694-
Add();
3695-
Stloc(endingPos);
3745+
// endingPos += startingPos;
3746+
Ldloc(endingPos);
3747+
Ldloc(startingPos);
3748+
Add();
3749+
Stloc(endingPos);
3750+
3751+
// pos = endingPos;
3752+
Ldloc(endingPos);
3753+
Stloc(pos);
3754+
}
36963755
}
36973756
else
36983757
{
@@ -3701,11 +3760,11 @@ void EmitSingleCharLoop(RegexNode node, RegexNode? subsequent = null, bool emitL
37013760
Ldc(!rtl ? 1 : -1);
37023761
Sub();
37033762
Stloc(endingPos);
3704-
}
37053763

3706-
// pos = endingPos;
3707-
Ldloc(endingPos);
3708-
Stloc(pos);
3764+
// pos = endingPos;
3765+
Ldloc(endingPos);
3766+
Stloc(pos);
3767+
}
37093768

37103769
if (!rtl)
37113770
{

0 commit comments

Comments
 (0)