diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems index 7c2939f84ebebc..1fc5d97aebdf3c 100644 --- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems +++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems @@ -1239,6 +1239,7 @@ + diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs index 05db4d2f96bf73..a9ac9452c215cb 100644 --- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs +++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs @@ -9,6 +9,7 @@ using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using System.Text; +using System.Text.Unicode; using static System.Buffers.StringSearchValuesHelper; namespace System.Buffers @@ -143,9 +144,9 @@ private static SearchValues CreateFromNormalizedValues( if (nonAsciiAffectedByCaseConversion) { - if (ContainsIncompleteSurrogatePairs(values)) + if (ContainsInvalidValues(values)) { - // Aho-Corasick can't deal with the matching semantics of standalone surrogate code units. + // Aho-Corasick can't deal with the matching semantics of invalid values. // We will use a slow but correct O(n * m) fallback implementation. return new MultiStringIgnoreCaseSearchValuesFallback(uniqueValues); } @@ -502,33 +503,13 @@ private static void AnalyzeValues( } } - private static bool ContainsIncompleteSurrogatePairs(ReadOnlySpan values) + private static bool ContainsInvalidValues(ReadOnlySpan values) { foreach (string value in values) { - int i = value.AsSpan().IndexOfAnyInRange(CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END); - if (i < 0) + if (!Utf16.IsValid(value)) { - continue; - } - - for (; (uint)i < (uint)value.Length; i++) - { - if (char.IsHighSurrogate(value[i])) - { - if ((uint)(i + 1) >= (uint)value.Length || !char.IsLowSurrogate(value[i + 1])) - { - // High surrogate not followed by a low surrogate. - return true; - } - - i++; - } - else if (char.IsLowSurrogate(value[i])) - { - // Low surrogate not preceded by a high surrogate. - return true; - } + return true; } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16.cs new file mode 100644 index 00000000000000..830669581fcfe0 --- /dev/null +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16.cs @@ -0,0 +1,27 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. + +namespace System.Text.Unicode +{ + /// + /// Provides static methods that validate UTF-16 strings. + /// + public static class Utf16 + { + /// + /// Validates that the value is well-formed UTF-16. + /// + /// The containing the UTF-16 input text to validate. + /// true if is well-formed UTF-16, false otherwise. + public static bool IsValid(ReadOnlySpan value) => + Utf16Utility.GetIndexOfFirstInvalidUtf16Sequence(value) < 0; + + /// + /// Finds the index of the first invalid UTF-16 subsequence. + /// + /// The containing the UTF-16 input text to examine. + /// The index of the first invalid UTF-16 subsequence, or -1 if the entire input is valid. + public static int IndexOfInvalidSubsequence(ReadOnlySpan value) => + Utf16Utility.GetIndexOfFirstInvalidUtf16Sequence(value); + } +} diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs index 9bcbf9390df28f..b36eecc1370ff0 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs @@ -3,6 +3,8 @@ using System.Diagnostics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + #if NET using System.Runtime.Intrinsics; #endif @@ -290,5 +292,23 @@ internal static bool AllCharsInVectorAreAscii(TVector vec) return (vec & TVector.Create(unchecked((ushort)~0x007F))) == TVector.Zero; } #endif + +#if NET + /// + /// Returns the char index in where the first invalid UTF-16 sequence begins, + /// or -1 if the buffer contains no invalid sequences. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe int GetIndexOfFirstInvalidUtf16Sequence(ReadOnlySpan utf16Data) + { + fixed (char* pValue = &MemoryMarshal.GetReference(utf16Data)) + { + char* pFirstInvalidChar = GetPointerToFirstInvalidChar(pValue, utf16Data.Length, out _, out _); + int index = (int)(pFirstInvalidChar - pValue); + + return (index < utf16Data.Length) ? index : -1; + } + } +#endif } } diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs index b36be6cab83dfa..c7abb36953607e 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs @@ -10,7 +10,7 @@ namespace System.Text.Unicode { /// - /// Provides static methods that convert chunked data between UTF-8 and UTF-16 encodings. + /// Provides static methods that convert chunked data between UTF-8 and UTF-16 encodings, and methods that validate UTF-8 sequences. /// public static class Utf8 { @@ -816,6 +816,14 @@ private bool Fail() return false; } } + + /// + /// Finds the index of the first invalid UTF-8 subsequence. + /// + /// The containing the UTF-8 input text to examine. + /// The index of the first invalid UTF-8 subsequence, or -1 if the entire input is valid. + public static int IndexOfInvalidSubsequence(ReadOnlySpan value) => + Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _); #endif /// diff --git a/src/libraries/System.Runtime/ref/System.Runtime.cs b/src/libraries/System.Runtime/ref/System.Runtime.cs index c38e09a06fb819..6bb8a8766c5e88 100644 --- a/src/libraries/System.Runtime/ref/System.Runtime.cs +++ b/src/libraries/System.Runtime/ref/System.Runtime.cs @@ -16344,9 +16344,15 @@ void System.IDisposable.Dispose() { } } namespace System.Text.Unicode { + public static partial class Utf16 + { + public static int IndexOfInvalidSubsequence(System.ReadOnlySpan value) { throw null; } + public static bool IsValid(System.ReadOnlySpan value) { throw null; } + } public static partial class Utf8 { public static System.Buffers.OperationStatus FromUtf16(System.ReadOnlySpan source, System.Span destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { throw null; } + public static int IndexOfInvalidSubsequence(System.ReadOnlySpan value) { throw null; } public static bool IsValid(System.ReadOnlySpan value) { throw null; } public static System.Buffers.OperationStatus ToUtf16(System.ReadOnlySpan source, System.Span destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { throw null; } public static bool TryWrite(System.Span destination, System.IFormatProvider? provider, [System.Runtime.CompilerServices.InterpolatedStringHandlerArgumentAttribute(new string[]{ "destination", "provider"})] ref System.Text.Unicode.Utf8.TryWriteInterpolatedStringHandler handler, out int bytesWritten) { throw null; } diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs index 9229fd32126d11..e7e27b97f5f223 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs @@ -237,11 +237,17 @@ private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[] actualRuneCount = (int)ptrDiff + scalarCountAdjustment; } + int actualIndexOfInvalidSubsequence = Utf16.IndexOfInvalidSubsequence(boundedMemory.Span); + bool actualIsValid = Utf16.IsValid(boundedMemory.Span); + // Assert Assert.Equal(expectedRetVal, actualRetVal); Assert.Equal(expectedRuneCount, actualRuneCount); - Assert.Equal(actualUtf8CodeUnitCount, expectedUtf8ByteCount); + Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount); + + Assert.Equal(expectedRetVal, actualIndexOfInvalidSubsequence); + Assert.Equal(expectedRetVal < 0, actualIsValid); } private static Lazy CreateGetPointerToFirstInvalidCharFn() diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs index 4730337b0878ed..8c56d52977a872 100644 --- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs +++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs @@ -385,13 +385,17 @@ private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] i actualSurrogatePairCount = totalUtf16CodeUnitCount - actualRuneCount; } + int actualIndexOfInvalidSubsequence = Utf8.IndexOfInvalidSubsequence(boundedMemory.Span); + bool actualIsValid = Utf8.IsValid(boundedMemory.Span); + // Assert Assert.Equal(expectedRetVal, actualRetVal); Assert.Equal(expectedRuneCount, actualRuneCount); Assert.Equal(expectedSurrogatePairCount, actualSurrogatePairCount); - Assert.True(Utf8.IsValid(boundedMemory.Span) == (expectedRetVal < 0)); + Assert.Equal(expectedRetVal, actualIndexOfInvalidSubsequence); + Assert.Equal(expectedRetVal < 0, actualIsValid); } private static Lazy CreateGetPointerToFirstInvalidByteFn() diff --git a/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs b/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs index 46227356574632..40ebf11f710355 100644 --- a/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs +++ b/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs @@ -8,6 +8,7 @@ using System.IO; using System.Net; using System.Text; +using System.Text.Unicode; namespace System.Web.Util { @@ -676,12 +677,9 @@ internal string GetString() Span chars = _charBuffer.Slice(0, _numChars); - const char HIGH_SURROGATE_START = '\ud800'; - const char LOW_SURROGATE_END = '\udfff'; - // Replace any invalid surrogate chars. - int idxOfFirstSurrogate = chars.IndexOfAnyInRange(HIGH_SURROGATE_START, LOW_SURROGATE_END); - for (int i = idxOfFirstSurrogate; (uint)i < (uint)chars.Length; i++) + int idxOfFirstInvalidSurrogate = Utf16.IndexOfInvalidSubsequence(chars); + for (int i = idxOfFirstInvalidSurrogate; (uint)i < (uint)chars.Length; i++) { if (char.IsHighSurrogate(chars[i])) {