diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
index 76075a5e66dc48..c1ad8b87781ab5 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
@@ -42,6 +42,33 @@ private static bool AllCharsInUInt64AreAscii(ulong value)
             return (value & ~0x007F007F_007F007Ful) == 0;
         }
 
+        /// <summary>
+        /// Returns the index of the first byte of <paramref name="value"/> whose most significant bit is set,
+        /// or 16 when no byte has it set. <paramref name="bitmask"/> must hold 0x01 in even byte lanes and
+        /// 0x10 in odd byte lanes so that, after the pairwise add, byte i maps to bit 4 * i of the mask.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static int GetIndexOfFirstNonAsciiByteInLane_AdvSimd(Vector128<byte> value, Vector128<byte> bitmask)
+        {
+            if (!AdvSimd.Arm64.IsSupported || !BitConverter.IsLittleEndian)
+            {
+                throw new PlatformNotSupportedException();
+            }
+
+            // extractedBits[i] = (value[i] >> 7) & (1 << (4 * (i % 2)));
+            Vector128<byte> mostSignificantBitIsSet = AdvSimd.ShiftRightArithmetic(value.AsSByte(), 7).AsByte();
+            Vector128<byte> extractedBits = AdvSimd.And(mostSignificantBitIsSet, bitmask);
+
+            // collapse mask to lower bits
+            extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits);
+            ulong mask = extractedBits.AsUInt64().ToScalar();
+
+            // calculate the index
+            int index = BitOperations.TrailingZeroCount(mask) >> 2;
+            Debug.Assert((mask != 0) ? index < 16 : index >= 16);
+            return index;
+        }
+
         /// <summary>
         /// Given a DWORD which represents two packed chars in machine-endian order,
         /// <see langword="true"/> iff the first char (in machine-endian order) is ASCII.
@@ -67,8 +94,8 @@ public static unsafe nuint GetIndexOfFirstNonAsciiByte(byte* pBuffer, nuint buff
             // pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
             // this method is running.
 
-            return (Sse2.IsSupported)
-                ? GetIndexOfFirstNonAsciiByte_Sse2(pBuffer, bufferLength)
+            return (Sse2.IsSupported || (AdvSimd.Arm64.IsSupported && BitConverter.IsLittleEndian))
+                ? GetIndexOfFirstNonAsciiByte_Intrinsified(pBuffer, bufferLength)
                 : GetIndexOfFirstNonAsciiByte_Default(pBuffer, bufferLength);
         }
 
@@ -215,17 +242,48 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Default(byte* pBuffer, n
             goto Finish;
         }
 
-        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuint bufferLength)
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="sseMask"/> (an SSE2 MoveMask result) is
+        /// non-zero, meaning at least one byte of the inspected chunk had its high bit set.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool ContainsNonAsciiByte_Sse2(uint sseMask)
+        {
+            Debug.Assert(sseMask != uint.MaxValue);
+            Debug.Assert(Sse2.IsSupported);
+            return sseMask != 0;
+        }
+
+        /// <summary>
+        /// Returns <see langword="true"/> iff <paramref name="advSimdIndex"/>, as computed by
+        /// GetIndexOfFirstNonAsciiByteInLane_AdvSimd, lies inside the 16-byte vector.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private static bool ContainsNonAsciiByte_AdvSimd(uint advSimdIndex)
+        {
+            Debug.Assert(advSimdIndex != uint.MaxValue);
+            Debug.Assert(AdvSimd.Arm64.IsSupported);
+            return advSimdIndex < 16;
+        }
+
+        private static unsafe nuint GetIndexOfFirstNonAsciiByte_Intrinsified(byte* pBuffer, nuint bufferLength)
         {
             // JIT turns the below into constants
 
             uint SizeOfVector128 = (uint)Unsafe.SizeOf<Vector128<byte>>();
             nuint MaskOfAllBitsInVector128 = (nuint)(SizeOfVector128 - 1);
 
-            Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
-            Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
+            Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required.");
+            Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian.");
 
-            uint currentMask, secondMask;
+            // Byte-lane mask for the AdvSimd path: 0x01 in even byte lanes, 0x10 in odd byte lanes.
+            Vector128<byte> bitmask = BitConverter.IsLittleEndian ?
+                Vector128.Create((ushort)0x1001).AsByte() :
+                Vector128.Create((ushort)0x0110).AsByte();
+
+            // uint.MaxValue is a sentinel initial value; the ContainsNonAsciiByte_* helpers assert they never receive it.
+            uint currentSseMask = uint.MaxValue, secondSseMask = uint.MaxValue;
+            uint currentAdvSimdIndex = uint.MaxValue, secondAdvSimdIndex = uint.MaxValue;
             byte* pOriginalBuffer = pBuffer;
 
             // This method is written such that control generally flows top-to-bottom, avoiding
@@ -240,11 +298,25 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
 
             // Read the first vector unaligned.
 
-            currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
-
-            if (currentMask != 0)
+            if (Sse2.IsSupported)
             {
-                goto FoundNonAsciiDataInCurrentMask;
+                currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
+                if (ContainsNonAsciiByte_Sse2(currentSseMask))
+                {
+                    goto FoundNonAsciiDataInCurrentChunk;
+                }
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load
+                if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                {
+                    goto FoundNonAsciiDataInCurrentChunk;
+                }
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
             }
 
             // If we have less than 32 bytes to process, just go straight to the final unaligned
@@ -281,15 +353,33 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
 
             do
             {
-                Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
-                Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
+                if (Sse2.IsSupported)
+                {
+                    Vector128<byte> firstVector = Sse2.LoadAlignedVector128(pBuffer);
+                    Vector128<byte> secondVector = Sse2.LoadAlignedVector128(pBuffer + SizeOfVector128);
 
-                currentMask = (uint)Sse2.MoveMask(firstVector);
-                secondMask = (uint)Sse2.MoveMask(secondVector);
+                    currentSseMask = (uint)Sse2.MoveMask(firstVector);
+                    secondSseMask = (uint)Sse2.MoveMask(secondVector);
+                    if (ContainsNonAsciiByte_Sse2(currentSseMask | secondSseMask))
+                    {
+                        goto FoundNonAsciiDataInInnerLoop;
+                    }
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    Vector128<byte> firstVector = AdvSimd.LoadVector128(pBuffer);
+                    Vector128<byte> secondVector = AdvSimd.LoadVector128(pBuffer + SizeOfVector128);
 
-                if ((currentMask | secondMask) != 0)
+                    currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(firstVector, bitmask);
+                    secondAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(secondVector, bitmask);
+                    if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex) || ContainsNonAsciiByte_AdvSimd(secondAdvSimdIndex))
+                    {
+                        goto FoundNonAsciiDataInInnerLoop;
+                    }
+                }
+                else
                 {
-                    goto FoundNonAsciiDataInInnerLoop;
+                    throw new PlatformNotSupportedException();
                 }
 
                 pBuffer += 2 * SizeOfVector128;
@@ -313,10 +403,25 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
                 // At least one full vector's worth of data remains, so we can safely read it.
                 // Remember, at this point pBuffer is still aligned.
 
-                currentMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
-                if (currentMask != 0)
+                if (Sse2.IsSupported)
                 {
-                    goto FoundNonAsciiDataInCurrentMask;
+                    currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadAlignedVector128(pBuffer));
+                    if (ContainsNonAsciiByte_Sse2(currentSseMask))
+                    {
+                        goto FoundNonAsciiDataInCurrentChunk;
+                    }
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask);
+                    if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                    {
+                        goto FoundNonAsciiDataInCurrentChunk;
+                    }
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
                 }
 
             IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:
@@ -332,17 +437,33 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
 
                 pBuffer += (bufferLength & MaskOfAllBitsInVector128) - SizeOfVector128;
 
-                currentMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
-                if (currentMask != 0)
+                if (Sse2.IsSupported)
                 {
-                    goto FoundNonAsciiDataInCurrentMask;
+                    currentSseMask = (uint)Sse2.MoveMask(Sse2.LoadVector128(pBuffer)); // unaligned load
+                    if (ContainsNonAsciiByte_Sse2(currentSseMask))
+                    {
+                        goto FoundNonAsciiDataInCurrentChunk;
+                    }
+
+                }
+                else if (AdvSimd.Arm64.IsSupported)
+                {
+                    currentAdvSimdIndex = (uint)GetIndexOfFirstNonAsciiByteInLane_AdvSimd(AdvSimd.LoadVector128(pBuffer), bitmask); // unaligned load
+                    if (ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                    {
+                        goto FoundNonAsciiDataInCurrentChunk;
+                    }
+
+                }
+                else
+                {
+                    throw new PlatformNotSupportedException();
                 }
 
                 pBuffer += SizeOfVector128;
             }
 
         Finish:
-
             return (nuint)pBuffer - (nuint)pOriginalBuffer; // and we're done!
 
         FoundNonAsciiDataInInnerLoop:
@@ -351,20 +472,46 @@ private static unsafe nuint GetIndexOfFirstNonAsciiByte_Sse2(byte* pBuffer, nuin
             // instead be the second mask. If so, skip the entire first mask and drain ASCII bytes
             // from the second mask.
 
-            if (currentMask == 0)
+            if (Sse2.IsSupported)
             {
-                pBuffer += SizeOfVector128;
-                currentMask = secondMask;
+                if (!ContainsNonAsciiByte_Sse2(currentSseMask))
+                {
+                    pBuffer += SizeOfVector128;
+                    currentSseMask = secondSseMask;
+                }
             }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                if (!ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex))
+                {
+                    pBuffer += SizeOfVector128;
+                    currentAdvSimdIndex = secondAdvSimdIndex;
+                }
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
+        FoundNonAsciiDataInCurrentChunk:
 
-        FoundNonAsciiDataInCurrentMask:
-
-            // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
-            // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
-            // available, we'll fall back to a normal loop.
 
-            Debug.Assert(currentMask != 0, "Shouldn't be here unless we see non-ASCII data.");
-            pBuffer += (uint)BitOperations.TrailingZeroCount(currentMask);
+            if (Sse2.IsSupported)
+            {
+                // The mask contains - from the LSB - a 0 for each ASCII byte we saw, and a 1 for each non-ASCII byte.
+                // Tzcnt is the correct operation to count the number of zero bits quickly. If this instruction isn't
+                // available, we'll fall back to a normal loop.
+                Debug.Assert(ContainsNonAsciiByte_Sse2(currentSseMask), "Shouldn't be here unless we see non-ASCII data.");
+                pBuffer += (uint)BitOperations.TrailingZeroCount(currentSseMask);
+            }
+            else if (AdvSimd.Arm64.IsSupported)
+            {
+                Debug.Assert(ContainsNonAsciiByte_AdvSimd(currentAdvSimdIndex), "Shouldn't be here unless we see non-ASCII data.");
+                pBuffer += currentAdvSimdIndex;
+            }
+            else
+            {
+                throw new PlatformNotSupportedException();
+            }
 
             goto Finish;
 