Skip to content

Commit 0ccbbe7

Browse files
authored
Broaden use of SearchValues in TryFindNextPossibleStartingPosition in Regex (#89205)
SearchValues has been updated to have an ASCII fast-path even for sets of values that are not exclusively ASCII. This means we can simplify TryFindNextPossibleStartingPosition in Regex to not track AsciiSet specially and instead just increase the number of characters we query the set for (from 5 to 128). That way, we'll use SearchValues rather than emitting our own helper up until a (semi-arbitrary) point where we deem it impossible or infeasible to enumerate all the chars that make up the set.
1 parent 84b7c61 commit 0ccbbe7

4 files changed

Lines changed: 91 additions & 80 deletions

File tree

src/libraries/System.Text.RegularExpressions/gen/RegexGenerator.Emitter.cs

Lines changed: 67 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -399,54 +399,65 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictio
399399
}
400400

401401
/// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
402-
private static string EmitSearchValues(char[] asciiChars, Dictionary<string, string[]> requiredHelpers)
402+
private static string EmitSearchValues(char[] chars, Dictionary<string, string[]> requiredHelpers)
403403
{
404-
Debug.Assert(RegexCharClass.IsAscii(asciiChars));
404+
Array.Sort(chars);
405405

406-
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
407-
byte[] bitmap = new byte[16];
408-
foreach (char c in asciiChars)
406+
string fieldName;
407+
if (RegexCharClass.IsAscii(chars))
409408
{
410-
bitmap[c >> 3] |= (byte)(1 << (c & 7));
409+
// The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
410+
var bitmap = new byte[16];
411+
foreach (char c in chars)
412+
{
413+
bitmap[c >> 3] |= (byte)(1 << (c & 7));
414+
}
415+
416+
string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
417+
418+
fieldName = hexBitmap switch
419+
{
420+
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
421+
"000000000000FF030000000000000000" => "s_asciiDigits",
422+
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
423+
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
424+
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
425+
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
426+
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
427+
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
428+
"00000000010000000000000000000000" => "s_asciiSeparators",
429+
"00000000100800700000004001000050" => "s_asciiSymbols",
430+
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
431+
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
432+
433+
"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
434+
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
435+
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
436+
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
437+
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
438+
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
439+
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
440+
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
441+
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
442+
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
443+
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
444+
445+
_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
446+
};
411447
}
412-
413-
string hexBitmap = BitConverter.ToString(bitmap).Replace("-", string.Empty);
414-
415-
string fieldName = hexBitmap switch
448+
else
416449
{
417-
"FFFFFFFF000000000000000000000080" => "s_asciiControl",
418-
"000000000000FF030000000000000000" => "s_asciiDigits",
419-
"0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters",
420-
"000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits",
421-
"000000000000FF037E0000007E000000" => "s_asciiHexDigits",
422-
"000000000000FF03000000007E000000" => "s_asciiHexDigitsLower",
423-
"000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper",
424-
"00000000EEF7008C010000B800000028" => "s_asciiPunctuation",
425-
"00000000010000000000000000000000" => "s_asciiSeparators",
426-
"00000000100800700000004001000050" => "s_asciiSymbols",
427-
"003E0000010000000000000000000000" => "s_asciiWhiteSpace",
428-
"000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars",
429-
430-
"00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl",
431-
"FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits",
432-
"FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters",
433-
"FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits",
434-
"FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower",
435-
"FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation",
436-
"FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators",
437-
"FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols",
438-
"FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper",
439-
"FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace",
440-
"FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars",
441-
442-
_ => $"s_ascii_{hexBitmap.TrimStart('0')}"
443-
};
450+
using (SHA256 sha = SHA256.Create())
451+
{
452+
#pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
453+
fieldName = $"s_nonAscii_{BitConverter.ToString(sha.ComputeHash(Encoding.UTF8.GetBytes(chars))).Replace("-", "")}";
454+
#pragma warning restore CA1850
455+
}
456+
}
444457

445458
if (!requiredHelpers.ContainsKey(fieldName))
446459
{
447-
Array.Sort(asciiChars);
448-
449-
string setLiteral = Literal(new string(asciiChars));
460+
string setLiteral = Literal(new string(chars));
450461

451462
requiredHelpers.Add(fieldName, new string[]
452463
{
@@ -465,12 +476,12 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
465476
// a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
466477
// characters _not_ contained in the set, and then do a search for the inverse of that, which will be
467478
// all of the target ASCII characters and all of non-ASCII.
468-
var asciiChars = new List<char>();
479+
var excludedAsciiChars = new List<char>();
469480
for (int i = 0; i < 128; i++)
470481
{
471482
if (!RegexCharClass.CharInClass((char)i, set))
472483
{
473-
asciiChars.Add((char)i);
484+
excludedAsciiChars.Add((char)i);
474485
}
475486
}
476487

@@ -538,9 +549,9 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
538549
lines.Add($"internal static int {helperName}(this ReadOnlySpan<char> span)");
539550
lines.Add($"{{");
540551
int uncheckedStart = lines.Count;
541-
lines.Add(asciiChars.Count == 128 ?
552+
lines.Add(excludedAsciiChars.Count == 128 ?
542553
$" int i = span.IndexOfAnyExceptInRange('\0', '\u007f');" :
543-
$" int i = span.IndexOfAnyExcept({EmitSearchValues(asciiChars.ToArray(), requiredHelpers)});");
554+
$" int i = span.IndexOfAnyExcept({EmitSearchValues(excludedAsciiChars.ToArray(), requiredHelpers)});");
544555
lines.Add($" if ((uint)i < (uint)span.Length)");
545556
lines.Add($" {{");
546557
lines.Add($" if (char.IsAscii(span[i]))");
@@ -1067,6 +1078,8 @@ void EmitFixedSet_LeftToRight()
10671078
string indexOf;
10681079
if (primarySet.Chars is not null)
10691080
{
1081+
Debug.Assert(primarySet.Chars.Length > 0);
1082+
10701083
// We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
10711084
string indexOfName = "IndexOf", indexOfAnyName = "IndexOfAny";
10721085
if (primarySet.Negated)
@@ -1076,18 +1089,19 @@ void EmitFixedSet_LeftToRight()
10761089

10771090
indexOf = primarySet.Chars.Length switch
10781091
{
1092+
// 1, 2, 3 have dedicated optimized IndexOfAny overloads
10791093
1 => $"{span}.{indexOfName}({Literal(primarySet.Chars[0])})",
10801094
2 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])})",
10811095
3 => $"{span}.{indexOfAnyName}({Literal(primarySet.Chars[0])}, {Literal(primarySet.Chars[1])}, {Literal(primarySet.Chars[2])})",
1082-
_ => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
1096+
1097+
// 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
1098+
// but can also be handled via SearchValues
1099+
4 or 5 => $"{span}.{indexOfAnyName}({EmitSearchValuesOrLiteral(primarySet.Chars, requiredHelpers)})",
1100+
1101+
// > 5 can only be handled efficiently via SearchValues
1102+
_ => $"{span}.{indexOfAnyName}({EmitSearchValues(primarySet.Chars, requiredHelpers)})",
10831103
};
10841104
}
1085-
else if (primarySet.AsciiSet is not null)
1086-
{
1087-
// We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
1088-
Debug.Assert(!primarySet.Negated);
1089-
indexOf = $"{span}.IndexOfAny({EmitSearchValues(primarySet.AsciiSet, requiredHelpers)})";
1090-
}
10911105
else if (primarySet.Range is not null)
10921106
{
10931107
// We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
@@ -1102,8 +1116,8 @@ void EmitFixedSet_LeftToRight()
11021116
}
11031117
else
11041118
{
1105-
// We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
1106-
// will perform the search as efficiently as possible.
1119+
// We have an arbitrary set of characters that's really large or otherwise not enumerable.
1120+
// We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
11071121
indexOf = $"{span}.{EmitIndexOfAnyCustomHelper(primarySet.Set, requiredHelpers, checkOverflow)}()";
11081122
}
11091123

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexCompiler.cs

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -903,6 +903,7 @@ void EmitFixedSet_LeftToRight()
903903

904904
if (primarySet.Chars is not null)
905905
{
906+
Debug.Assert(primarySet.Chars.Length > 0);
906907
switch (primarySet.Chars.Length)
907908
{
908909
case 1:
@@ -926,19 +927,23 @@ void EmitFixedSet_LeftToRight()
926927
Call(primarySet.Negated ? s_spanIndexOfAnyExceptCharCharChar : s_spanIndexOfAnyCharCharChar);
927928
break;
928929

929-
default:
930+
case 4 or 5:
931+
// tmp = ...IndexOfAny("abcd");
932+
// Note that this case differs slightly from the source generator, where it might choose to use
933+
// SearchValues instead of a literal, but there's extra cost to doing so for RegexCompiler so
934+
// it just always uses IndexOfAny(span).
930935
Ldstr(new string(primarySet.Chars));
931936
Call(s_stringAsSpanMethod);
932937
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSpan : s_spanIndexOfAnySpan);
933938
break;
939+
940+
default:
941+
// tmp = ...IndexOfAny(s_searchValues);
942+
LoadSearchValues(primarySet.Chars);
943+
Call(primarySet.Negated ? s_spanIndexOfAnyExceptSearchValues : s_spanIndexOfAnySearchValues);
944+
break;
934945
}
935946
}
936-
else if (primarySet.AsciiSet is not null)
937-
{
938-
Debug.Assert(!primarySet.Negated);
939-
LoadSearchValues(primarySet.AsciiSet);
940-
Call(s_spanIndexOfAnySearchValues);
941-
}
942947
else if (primarySet.Range is not null)
943948
{
944949
if (primarySet.Range.Value.LowInclusive == primarySet.Range.Value.HighInclusive)

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexFindOptimizations.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,8 +271,6 @@ public FixedDistanceSet(char[]? chars, string set, int distance)
271271
public int Distance;
272272
/// <summary>As an alternative to <see cref="Chars"/>, a description of the single range the set represents, if it does.</summary>
273273
public (char LowInclusive, char HighInclusive)? Range;
274-
/// <summary>As an alternative to <see cref="Chars"/>, a description of the set of ASCII characters it represents, if it does.</summary>
275-
public char[]? AsciiSet;
276274
}
277275

278276
/// <summary>When in literal after set loop node, gets the literal to search for and the RegexNode representing the leading loop.</summary>
@@ -593,7 +591,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
593591
char[]? chars = primarySet.Chars;
594592

595593
ReadOnlySpan<char> span = textSpan.Slice(pos);
596-
if (chars is not null)
594+
if (chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except} without SearchValues
597595
{
598596
int i = primarySet.Negated ? span.IndexOfAnyExcept(chars) : span.IndexOfAny(chars);
599597
if (i >= 0)
@@ -660,7 +658,7 @@ public bool TryFindNextStartingPositionLeftToRight(ReadOnlySpan<char> textSpan,
660658

661659
int endMinusRequiredLength = textSpan.Length - Math.Max(1, MinRequiredLength);
662660

663-
if (primarySet.Chars is not null)
661+
if (primarySet.Chars is { Length: <= 5 }) // 5 == currently the max length efficiently handled by IndexOfAny{Except}
664662
{
665663
for (int inputPosition = pos; inputPosition <= endMinusRequiredLength; inputPosition++)
666664
{

src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexPrefixAnalyzer.cs

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
194194
TryFindRawFixedSets(root, results, ref distance, thorough);
195195
#if DEBUG
196196
results.ForEach(r => Debug.Assert(
197-
!r.Negated && r.Chars is null && r.AsciiSet is null && r.Range is null,
197+
!r.Negated && r.Chars is null && r.Range is null,
198198
$"{nameof(TryFindRawFixedSets)} should have only populated {nameof(r.Set)} and {nameof(r.Distance)}"));
199199
#endif
200200

@@ -225,31 +225,25 @@ static bool Process(RegexNode node, ref ValueStringBuilder vsb)
225225

226226
// For every entry, try to get the chars that make up the set, if there are few enough.
227227
// For any for which we couldn't get the small chars list, see if we can get other useful info.
228-
Span<char> scratch = stackalloc char[5]; // max efficiently optimized by IndexOfAny today
228+
Span<char> scratch = stackalloc char[128]; // limit based on what's currently efficiently handled by SearchValues
229229
for (int i = 0; i < results.Count; i++)
230230
{
231231
RegexFindOptimizations.FixedDistanceSet result = results[i];
232232
result.Negated = RegexCharClass.IsNegated(result.Set);
233233

234234
int count = RegexCharClass.GetSetChars(result.Set, scratch);
235-
236235
if (count > 0)
237236
{
238237
result.Chars = scratch.Slice(0, count).ToArray();
239238
}
240239

241-
if (thorough)
240+
// Prefer IndexOfAnyInRange over IndexOfAny, except for tiny sets (1 or 2 chars) that IndexOfAny handles more efficiently.
241+
if (thorough &&
242+
(result.Chars is null || result.Chars.Length > 2) &&
243+
RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
242244
{
243-
// Prefer IndexOfAnyInRange over IndexOfAny for sets of 3-5 values that fit in a single range.
244-
if ((result.Chars is null || count > 2) && RegexCharClass.TryGetSingleRange(result.Set, out char lowInclusive, out char highInclusive))
245-
{
246-
result.Chars = null;
247-
result.Range = (lowInclusive, highInclusive);
248-
}
249-
else if (result.Chars is null && !result.Negated && RegexCharClass.TryGetAsciiSetChars(result.Set, out char[]? asciiChars))
250-
{
251-
result.AsciiSet = asciiChars;
252-
}
245+
result.Chars = null;
246+
result.Range = (lowInclusive, highInclusive);
253247
}
254248

255249
results[i] = result;
@@ -472,8 +466,8 @@ public static void SortFixedDistanceSetsByQuality(List<RegexFindOptimizations.Fi
472466
// for the fastest and that have the best chance of matching as few false positives as possible.
473467
results.Sort(static (s1, s2) =>
474468
{
475-
char[]? s1Chars = s1.Chars ?? s1.AsciiSet;
476-
char[]? s2Chars = s2.Chars ?? s2.AsciiSet;
469+
char[]? s1Chars = s1.Chars;
470+
char[]? s2Chars = s2.Chars;
477471
int s1CharsLength = s1Chars?.Length ?? 0;
478472
int s2CharsLength = s2Chars?.Length ?? 0;
479473
bool s1Negated = s1.Negated;

0 commit comments

Comments
 (0)