@@ -399,54 +399,65 @@ private static string EmitSearchValuesOrLiteral(ReadOnlySpan<char> chars, Dictio
399399 }
400400
401401 /// <summary>Adds a SearchValues instance declaration to the required helpers collection.</summary>
402- private static string EmitSearchValues ( char [ ] asciiChars , Dictionary < string , string [ ] > requiredHelpers )
402+ private static string EmitSearchValues ( char [ ] chars , Dictionary < string , string [ ] > requiredHelpers )
403403 {
404- Debug . Assert ( RegexCharClass . IsAscii ( asciiChars ) ) ;
404+ Array . Sort ( chars ) ;
405405
406- // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
407- byte [ ] bitmap = new byte [ 16 ] ;
408- foreach ( char c in asciiChars )
406+ string fieldName ;
407+ if ( RegexCharClass . IsAscii ( chars ) )
409408 {
410- bitmap [ c >> 3 ] |= ( byte ) ( 1 << ( c & 7 ) ) ;
409+ // The set of ASCII characters can be represented as a 128-bit bitmap. Use the 16-byte hex string as the key.
410+ var bitmap = new byte [ 16 ] ;
411+ foreach ( char c in chars )
412+ {
413+ bitmap [ c >> 3 ] |= ( byte ) ( 1 << ( c & 7 ) ) ;
414+ }
415+
416+ string hexBitmap = BitConverter . ToString ( bitmap ) . Replace ( "-" , string . Empty ) ;
417+
418+ fieldName = hexBitmap switch
419+ {
420+ "FFFFFFFF000000000000000000000080" => "s_asciiControl" ,
421+ "000000000000FF030000000000000000" => "s_asciiDigits" ,
422+ "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters" ,
423+ "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits" ,
424+ "000000000000FF037E0000007E000000" => "s_asciiHexDigits" ,
425+ "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower" ,
426+ "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper" ,
427+ "00000000EEF7008C010000B800000028" => "s_asciiPunctuation" ,
428+ "00000000010000000000000000000000" => "s_asciiSeparators" ,
429+ "00000000100800700000004001000050" => "s_asciiSymbols" ,
430+ "003E0000010000000000000000000000" => "s_asciiWhiteSpace" ,
431+ "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars" ,
432+
433+ "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl" ,
434+ "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits" ,
435+ "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters" ,
436+ "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits" ,
437+ "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower" ,
438+ "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation" ,
439+ "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators" ,
440+ "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols" ,
441+ "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper" ,
442+ "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace" ,
443+ "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars" ,
444+
445+ _ => $ "s_ascii_{ hexBitmap . TrimStart ( '0' ) } "
446+ } ;
411447 }
412-
413- string hexBitmap = BitConverter . ToString ( bitmap ) . Replace ( "-" , string . Empty ) ;
414-
415- string fieldName = hexBitmap switch
448+ else
416449 {
417- "FFFFFFFF000000000000000000000080" => "s_asciiControl" ,
418- "000000000000FF030000000000000000" => "s_asciiDigits" ,
419- "0000000000000000FEFFFF07FEFFFF07" => "s_asciiLetters" ,
420- "000000000000FF03FEFFFF07FEFFFF07" => "s_asciiLettersAndDigits" ,
421- "000000000000FF037E0000007E000000" => "s_asciiHexDigits" ,
422- "000000000000FF03000000007E000000" => "s_asciiHexDigitsLower" ,
423- "000000000000FF037E00000000000000" => "s_asciiHexDigitsUpper" ,
424- "00000000EEF7008C010000B800000028" => "s_asciiPunctuation" ,
425- "00000000010000000000000000000000" => "s_asciiSeparators" ,
426- "00000000100800700000004001000050" => "s_asciiSymbols" ,
427- "003E0000010000000000000000000000" => "s_asciiWhiteSpace" ,
428- "000000000000FF03FEFFFF87FEFFFF07" => "s_asciiWordChars" ,
429-
430- "00000000FFFFFFFFFFFFFFFFFFFFFF7F" => "s_asciiExceptControl" ,
431- "FFFFFFFFFFFF00FCFFFFFFFFFFFFFFFF" => "s_asciiExceptDigits" ,
432- "FFFFFFFFFFFFFFFF010000F8010000F8" => "s_asciiExceptLetters" ,
433- "FFFFFFFFFFFF00FC010000F8010000F8" => "s_asciiExceptLettersAndDigits" ,
434- "FFFFFFFFFFFFFFFFFFFFFFFF010000F8" => "s_asciiExceptLower" ,
435- "FFFFFFFF1108FF73FEFFFF47FFFFFFD7" => "s_asciiExceptPunctuation" ,
436- "FFFFFFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptSeparators" ,
437- "FFFFFFFFEFF7FF8FFFFFFFBFFEFFFFAF" => "s_asciiExceptSymbols" ,
438- "FFFFFFFFFFFFFFFF010000F8FFFFFFFF" => "s_asciiExceptUpper" ,
439- "FFC1FFFFFEFFFFFFFFFFFFFFFFFFFFFF" => "s_asciiExceptWhiteSpace" ,
440- "FFFFFFFFFFFF00FC01000078010000F8" => "s_asciiExceptWordChars" ,
441-
442- _ => $ "s_ascii_{ hexBitmap . TrimStart ( '0' ) } "
443- } ;
450+ using ( SHA256 sha = SHA256 . Create ( ) )
451+ {
452+ #pragma warning disable CA1850 // SHA256.HashData isn't available on netstandard2.0
453+ fieldName = $ "s_nonAscii_{ BitConverter . ToString ( sha . ComputeHash ( Encoding . UTF8 . GetBytes ( chars ) ) ) . Replace ( "-" , "" ) } ";
454+ #pragma warning restore CA1850
455+ }
456+ }
444457
445458 if ( ! requiredHelpers . ContainsKey ( fieldName ) )
446459 {
447- Array . Sort ( asciiChars ) ;
448-
449- string setLiteral = Literal ( new string ( asciiChars ) ) ;
460+ string setLiteral = Literal ( new string ( chars ) ) ;
450461
451462 requiredHelpers . Add ( fieldName , new string [ ]
452463 {
@@ -465,12 +476,12 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
465476 // a sequential walk). In order to do that search, we actually build up a set for all of the ASCII
466477 // characters _not_ contained in the set, and then do a search for the inverse of that, which will be
467478 // all of the target ASCII characters and all of non-ASCII.
468- var asciiChars = new List < char > ( ) ;
479+ var excludedAsciiChars = new List < char > ( ) ;
469480 for ( int i = 0 ; i < 128 ; i ++ )
470481 {
471482 if ( ! RegexCharClass . CharInClass ( ( char ) i , set ) )
472483 {
473- asciiChars . Add ( ( char ) i ) ;
484+ excludedAsciiChars . Add ( ( char ) i ) ;
474485 }
475486 }
476487
@@ -538,9 +549,9 @@ private static string EmitIndexOfAnyCustomHelper(string set, Dictionary<string,
538549 lines . Add ( $ "internal static int { helperName } (this ReadOnlySpan<char> span)") ;
539550 lines . Add ( $ "{{") ;
540551 int uncheckedStart = lines . Count ;
541- lines . Add ( asciiChars . Count == 128 ?
552+ lines . Add ( excludedAsciiChars . Count == 128 ?
542553 $ " int i = span.IndexOfAnyExceptInRange('\0 ', '\u007f ');" :
543- $ " int i = span.IndexOfAnyExcept({ EmitSearchValues ( asciiChars . ToArray ( ) , requiredHelpers ) } );") ;
554+ $ " int i = span.IndexOfAnyExcept({ EmitSearchValues ( excludedAsciiChars . ToArray ( ) , requiredHelpers ) } );") ;
544555 lines . Add ( $ " if ((uint)i < (uint)span.Length)") ;
545556 lines . Add ( $ " {{") ;
546557 lines . Add ( $ " if (char.IsAscii(span[i]))") ;
@@ -1067,6 +1078,8 @@ void EmitFixedSet_LeftToRight()
10671078 string indexOf ;
10681079 if ( primarySet . Chars is not null )
10691080 {
1081+ Debug . Assert ( primarySet . Chars . Length > 0 ) ;
1082+
10701083 // We have a chars array, so we can use IndexOf{Any}{Except} to search for it. Choose the best overload.
10711084 string indexOfName = "IndexOf" , indexOfAnyName = "IndexOfAny" ;
10721085 if ( primarySet . Negated )
@@ -1076,18 +1089,19 @@ void EmitFixedSet_LeftToRight()
10761089
10771090 indexOf = primarySet . Chars . Length switch
10781091 {
1092+ // 1, 2, 3 have dedicated optimized IndexOfAny overloads
10791093 1 => $ "{ span } .{ indexOfName } ({ Literal ( primarySet . Chars [ 0 ] ) } )",
10801094 2 => $ "{ span } .{ indexOfAnyName } ({ Literal ( primarySet . Chars [ 0 ] ) } , { Literal ( primarySet . Chars [ 1 ] ) } )",
10811095 3 => $ "{ span } .{ indexOfAnyName } ({ Literal ( primarySet . Chars [ 0 ] ) } , { Literal ( primarySet . Chars [ 1 ] ) } , { Literal ( primarySet . Chars [ 2 ] ) } )",
1082- _ => $ "{ span } .{ indexOfAnyName } ({ EmitSearchValuesOrLiteral ( primarySet . Chars , requiredHelpers ) } )",
1096+
1097+ // 4, 5 have dedicated optimized IndexOfAny overloads accessible via the ReadOnlySpan<char> overload,
1098+ // but can also be handled via SearchValues
1099+ 4 or 5 => $ "{ span } .{ indexOfAnyName } ({ EmitSearchValuesOrLiteral ( primarySet . Chars , requiredHelpers ) } )",
1100+
1101+ // > 5 can only be handled efficiently via SearchValues
1102+ _ => $ "{ span } .{ indexOfAnyName } ({ EmitSearchValues ( primarySet . Chars , requiredHelpers ) } )",
10831103 } ;
10841104 }
1085- else if ( primarySet . AsciiSet is not null )
1086- {
1087- // We have a set of ASCII chars, so we can use IndexOfAny(SearchValues) to search for it.
1088- Debug . Assert ( ! primarySet . Negated ) ;
1089- indexOf = $ "{ span } .IndexOfAny({ EmitSearchValues ( primarySet . AsciiSet , requiredHelpers ) } )";
1090- }
10911105 else if ( primarySet . Range is not null )
10921106 {
10931107 // We have a range, so we can use IndexOfAny{Except}InRange to search for it. In the corner case,
@@ -1102,8 +1116,8 @@ void EmitFixedSet_LeftToRight()
11021116 }
11031117 else
11041118 {
1105- // We have an arbitrary set of characters that includes at least one non-ASCII char. We use a custom IndexOfAny helper that
1106- // will perform the search as efficiently as possible.
1119+ // We have an arbitrary set of characters that's really large or otherwise not enumerable.
1120+ // We use a custom IndexOfAny helper that will perform the search as efficiently as possible.
11071121 indexOf = $ "{ span } .{ EmitIndexOfAnyCustomHelper ( primarySet . Set , requiredHelpers , checkOverflow ) } ()";
11081122 }
11091123
0 commit comments