Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 140 additions & 19 deletions src/libraries/System.Private.CoreLib/src/System/Text/ASCIIUtility.cs
Original file line number Diff line number Diff line change
Expand Up @@ -489,8 +489,8 @@ public static unsafe nuint GetIndexOfFirstNonAsciiChar(char* pBuffer, nuint buff
// pmovmskb which we know are optimized, and (b) we can avoid downclocking the processor while
// this method is running.

return (Sse2.IsSupported)
? GetIndexOfFirstNonAsciiChar_Sse2(pBuffer, bufferLength)
return (BitConverter.IsLittleEndian && (Sse2.IsSupported || AdvSimd.Arm64.IsSupported))
? GetIndexOfFirstNonAsciiChar_Intrinsified(pBuffer, bufferLength)
: GetIndexOfFirstNonAsciiChar_Default(pBuffer, bufferLength);
}

Expand Down Expand Up @@ -630,9 +630,9 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Default(char* pBuffer, n
goto Finish;
}

private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuint bufferLength /* in chars */)
private static unsafe nuint GetIndexOfFirstNonAsciiChar_Intrinsified(char* pBuffer, nuint bufferLength /* in chars */)
{
// This method contains logic optimized for both SSE2 and SSE41. Much of the logic in this method
// This method contains logic optimized for SSE2, SSE41 and ARM64. Much of the logic in this method
// will be elided by JIT once we determine which specific ISAs we support.

// Quick check for empty inputs.
Expand All @@ -647,8 +647,12 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
uint SizeOfVector128InBytes = (uint)Unsafe.SizeOf<Vector128<byte>>();
uint SizeOfVector128InChars = SizeOfVector128InBytes / sizeof(char);

Debug.Assert(Sse2.IsSupported, "Should've been checked by caller.");
Debug.Assert(BitConverter.IsLittleEndian, "SSE2 assumes little-endian.");
Debug.Assert(Sse2.IsSupported || AdvSimd.Arm64.IsSupported, "Sse2 or AdvSimd64 required.");
Debug.Assert(BitConverter.IsLittleEndian, "This SSE2/Arm64 implementation assumes little-endian.");

Vector128<byte> bitmask = BitConverter.IsLittleEndian ?
Vector128.Create(0x80402010_08040201).AsByte() :
Vector128.Create(0x01020408_10204080).AsByte();

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is also true for other PRs that Carlos sent, but can you remind me why we use bitMask for !IsLittleEndian if it is not supported? Is the idea that when we start supporting, the code will just work?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yup, that's the idea.


Vector128<ushort> firstVector, secondVector;
uint currentMask;
Expand All @@ -673,13 +677,35 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin

// Read the first vector unaligned.

firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
if (Sse2.IsSupported)
{
firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
}
else if (AdvSimd.Arm64.IsSupported)
{
firstVector = AdvSimd.LoadVector128((ushort*)pBuffer); // unaligned load
}
else
{
throw new PlatformNotSupportedException();
}

// The operation below forces the 0x8000 bit of each WORD to be set iff the WORD element
// has value >= 0x0800 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
// has value >= 0x0080 (non-ASCII). Then we'll treat the vector as a BYTE vector in order
// to extract the mask. Reminder: the 0x0080 bit of each WORD should be ignored.

currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
if (Sse2.IsSupported)
{
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
}
else if (AdvSimd.Arm64.IsSupported)
{
currentMask = Unicode.Utf16Utility.GetNonAsciiBytes(AdvSimd.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte(), bitmask);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should check out excellent feedback from @TamarChristinaArm in #39050 (comment) and #39050 (comment) about optimizing this code. I think you should follow the same because in the end, all you are doing this calling BitOperations.TrailingZeroCount() on it.

@TamarChristinaArm TamarChristinaArm Aug 10, 2020

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed with splitting it off. In this case it looks like this code only cares about the index of the first non-zero element. So we can do even better here.

If you use a mask 0x0f00 and BIC you can get a much more efficient sequence (see https://github.com/ARM-software/optimized-routines/blob/224cb5f67b71757b99fe1e10b5a437c17a1d733c/string/aarch64/strlen.S#L164)

essentially

cmlt    v1.16b, v1.16b, #0
bic	v1.8h, 0x0f, lsl 8
umaxp	v1.16b, v1.16b, v1.16b
fmov	x1, d1
rbit	x0, x0
clz	x0, x1

as the sequence to get the first element that has the msb set. Of course for the cases where it's just doing if (currentMask != 0) you just need a maxp and fmov as I explained in the other post.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is for little endian btw, for big-endian you need a slight variantion. though this can be avoided loading with LD1 instead of LDR (which I think is what LoadVector128 does here).

}
else
{
throw new PlatformNotSupportedException();
}

if ((currentMask & NonAsciiDataSeenMask) != 0)
{
Expand Down Expand Up @@ -725,9 +751,23 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin

do
{
firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
secondVector = Sse2.LoadAlignedVector128((ushort*)pBuffer + SizeOfVector128InChars);
Vector128<ushort> combinedVector = Sse2.Or(firstVector, secondVector);
Vector128<ushort> combinedVector;
if (Sse2.IsSupported)
{
firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
secondVector = Sse2.LoadAlignedVector128((ushort*)pBuffer + SizeOfVector128InChars);
combinedVector = Sse2.Or(firstVector, secondVector);
}
else if (AdvSimd.Arm64.IsSupported)
{
firstVector = AdvSimd.LoadVector128((ushort*)pBuffer);

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a note for the future, this would be a great place for LoadPair #39243

secondVector = AdvSimd.LoadVector128((ushort*)pBuffer + SizeOfVector128InChars);
combinedVector = AdvSimd.Or(firstVector, secondVector);
}
else
{
throw new PlatformNotSupportedException();
}

if (Sse41.IsSupported)
{
Expand All @@ -738,7 +778,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInFirstOrSecondVector;
}
}
else
else if (Sse2.IsSupported)
{
// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(combinedVector, asciiMaskForAddSaturate).AsByte());
Expand All @@ -747,6 +787,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInFirstOrSecondVector;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
currentMask = Unicode.Utf16Utility.GetNonAsciiBytes(AdvSimd.AddSaturate(combinedVector, asciiMaskForAddSaturate).AsByte(), bitmask);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Minor nit: The goto FoundNonAsciiDataInFirstOrSecondVector; statement is replicated across many branches, which could make the logic harder to modify longer-term. Consider factoring SIMD logic into a predicate method, e.g. bool ContainNonAsciiDataInFirstOrSecondVector() and then consume like so:

if (ContainNonAsciiDataInFirstOrSecondVector(..args..))
{
    goto FoundNonAsciiDataInFirstOrSecondVector;
}

if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInFirstOrSecondVector;
}
}
else
{
throw new PlatformNotSupportedException();
}

pBuffer += 2 * SizeOfVector128InChars;
} while (pBuffer <= pFinalVectorReadPos);
Expand All @@ -770,7 +822,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
// At least one full vector's worth of data remains, so we can safely read it.
// Remember, at this point pBuffer is still aligned.

firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
if (Sse2.IsSupported)
{
firstVector = Sse2.LoadAlignedVector128((ushort*)pBuffer);
}
else if (AdvSimd.Arm64.IsSupported)
{
firstVector = AdvSimd.LoadVector128((ushort*)pBuffer);
}
else
{
throw new PlatformNotSupportedException();
}

if (Sse41.IsSupported)
{
Expand All @@ -781,7 +844,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInFirstVector;
}
}
else
else if (Sse2.IsSupported)
{
// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
Expand All @@ -790,6 +853,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInCurrentMask;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
currentMask = Unicode.Utf16Utility.GetNonAsciiBytes(AdvSimd.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte(), bitmask);
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
}
else
{
throw new PlatformNotSupportedException();
}

IncrementCurrentOffsetBeforeFinalUnalignedVectorRead:

Expand All @@ -803,7 +878,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
// We need to adjust the pointer because we're re-reading data.

pBuffer = (char*)((byte*)pBuffer + (bufferLength & (SizeOfVector128InBytes - 1)) - SizeOfVector128InBytes);
firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
if (Sse2.IsSupported)
{
firstVector = Sse2.LoadVector128((ushort*)pBuffer); // unaligned load
}
else if (AdvSimd.Arm64.IsSupported)
{
firstVector = AdvSimd.LoadVector128((ushort*)pBuffer); // unaligned load
}
else
{
throw new PlatformNotSupportedException();
}

if (Sse41.IsSupported)
{
Expand All @@ -814,7 +900,7 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInFirstVector;
}
}
else
else if (Sse2.IsSupported)
{
// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
Expand All @@ -823,6 +909,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInCurrentMask;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
currentMask = Unicode.Utf16Utility.GetNonAsciiBytes(AdvSimd.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte(), bitmask);
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
}
else
{
throw new PlatformNotSupportedException();
}

pBuffer += SizeOfVector128InChars;
}
Expand All @@ -846,14 +944,26 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
goto FoundNonAsciiDataInFirstVector;
}
}
else
else if (Sse2.IsSupported)
{
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
}
else if (AdvSimd.Arm64.IsSupported)
{
currentMask = Unicode.Utf16Utility.GetNonAsciiBytes(AdvSimd.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte(), bitmask);
if ((currentMask & NonAsciiDataSeenMask) != 0)
{
goto FoundNonAsciiDataInCurrentMask;
}
}
else
{
throw new PlatformNotSupportedException();
}

// Wasn't the first vector; must be the second.

Expand All @@ -863,7 +973,18 @@ private static unsafe nuint GetIndexOfFirstNonAsciiChar_Sse2(char* pBuffer, nuin
FoundNonAsciiDataInFirstVector:

// See comment earlier in the method for an explanation of how the below logic works.
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
if (Sse2.IsSupported)
{
currentMask = (uint)Sse2.MoveMask(Sse2.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte());
}
else if (AdvSimd.Arm64.IsSupported)
{
currentMask = Unicode.Utf16Utility.GetNonAsciiBytes(AdvSimd.AddSaturate(firstVector, asciiMaskForAddSaturate).AsByte(), bitmask);
}
else
{
throw new PlatformNotSupportedException();
}

FoundNonAsciiDataInCurrentMask:

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ static Utf16Utility()
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static uint GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
internal static uint GetNonAsciiBytes(Vector128<byte> value, Vector128<byte> bitMask128)
{
Debug.Assert(AdvSimd.Arm64.IsSupported);

Expand Down