diff --git a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
index 7c2939f84ebebc..1fc5d97aebdf3c 100644
--- a/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
+++ b/src/libraries/System.Private.CoreLib/src/System.Private.CoreLib.Shared.projitems
@@ -1239,6 +1239,7 @@
+    <Compile Include="$(MSBuildThisFileDirectory)System\Text\Unicode\Utf16.cs" />
diff --git a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
index 05db4d2f96bf73..a9ac9452c215cb 100644
--- a/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/SearchValues/Strings/StringSearchValues.cs
@@ -9,6 +9,7 @@
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using System.Text;
+using System.Text.Unicode;
using static System.Buffers.StringSearchValuesHelper;
namespace System.Buffers
@@ -143,9 +144,9 @@ private static SearchValues CreateFromNormalizedValues(
if (nonAsciiAffectedByCaseConversion)
{
- if (ContainsIncompleteSurrogatePairs(values))
+ if (ContainsInvalidValues(values))
{
- // Aho-Corasick can't deal with the matching semantics of standalone surrogate code units.
+ // Aho-Corasick can't deal with the matching semantics of invalid values.
// We will use a slow but correct O(n * m) fallback implementation.
return new MultiStringIgnoreCaseSearchValuesFallback(uniqueValues);
}
@@ -502,33 +503,13 @@ private static void AnalyzeValues(
}
}
- private static bool ContainsIncompleteSurrogatePairs(ReadOnlySpan values)
+ private static bool ContainsInvalidValues(ReadOnlySpan values)
{
foreach (string value in values)
{
- int i = value.AsSpan().IndexOfAnyInRange(CharUnicodeInfo.HIGH_SURROGATE_START, CharUnicodeInfo.LOW_SURROGATE_END);
- if (i < 0)
+ if (!Utf16.IsValid(value))
{
- continue;
- }
-
- for (; (uint)i < (uint)value.Length; i++)
- {
- if (char.IsHighSurrogate(value[i]))
- {
- if ((uint)(i + 1) >= (uint)value.Length || !char.IsLowSurrogate(value[i + 1]))
- {
- // High surrogate not followed by a low surrogate.
- return true;
- }
-
- i++;
- }
- else if (char.IsLowSurrogate(value[i]))
- {
- // Low surrogate not preceded by a high surrogate.
- return true;
- }
+ return true;
}
}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16.cs
new file mode 100644
index 00000000000000..830669581fcfe0
--- /dev/null
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16.cs
@@ -0,0 +1,27 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+namespace System.Text.Unicode
+{
+    /// <summary>
+    /// Provides static methods that validate UTF-16 strings.
+    /// </summary>
+    public static class Utf16
+    {
+        /// <summary>
+        /// Validates that the value is well-formed UTF-16.
+        /// </summary>
+        /// <param name="value">The <see cref="ReadOnlySpan{T}"/> containing the UTF-16 input text to validate.</param>
+        /// <returns><see langword="true"/> if <paramref name="value"/> is well-formed UTF-16, <see langword="false"/> otherwise.</returns>
+        public static bool IsValid(ReadOnlySpan<char> value) =>
+            Utf16Utility.GetIndexOfFirstInvalidUtf16Sequence(value) < 0;
+
+        /// <summary>
+        /// Finds the index of the first invalid UTF-16 subsequence.
+        /// </summary>
+        /// <param name="value">The <see cref="ReadOnlySpan{T}"/> containing the UTF-16 input text to examine.</param>
+        /// <returns>The index of the first invalid UTF-16 subsequence, or <c>-1</c> if the entire input is valid.</returns>
+        public static int IndexOfInvalidSubsequence(ReadOnlySpan<char> value) =>
+            Utf16Utility.GetIndexOfFirstInvalidUtf16Sequence(value);
+    }
+}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs
index 9bcbf9390df28f..b36eecc1370ff0 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf16Utility.cs
@@ -3,6 +3,8 @@
using System.Diagnostics;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
#if NET
using System.Runtime.Intrinsics;
#endif
@@ -290,5 +292,23 @@ internal static bool AllCharsInVectorAreAscii(TVector vec)
return (vec & TVector.Create(unchecked((ushort)~0x007F))) == TVector.Zero;
}
#endif
+
+#if NET
+        /// <summary>
+        /// Returns the char index in <paramref name="utf16Data"/> where the first invalid UTF-16 sequence begins,
+        /// or -1 if the buffer contains no invalid sequences.
+        /// </summary>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe int GetIndexOfFirstInvalidUtf16Sequence(ReadOnlySpan<char> utf16Data)
+        {
+            fixed (char* pValue = &MemoryMarshal.GetReference(utf16Data))
+            {
+                char* pFirstInvalidChar = GetPointerToFirstInvalidChar(pValue, utf16Data.Length, out _, out _);
+                int index = (int)(pFirstInvalidChar - pValue);
+                // An offset at or past the end of the buffer means no invalid char was found.
+                return (index < utf16Data.Length) ? index : -1;
+            }
+        }
+#endif
}
}
diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs
index b36be6cab83dfa..c7abb36953607e 100644
--- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs
+++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8.cs
@@ -10,7 +10,7 @@
namespace System.Text.Unicode
{
///
- /// Provides static methods that convert chunked data between UTF-8 and UTF-16 encodings.
+ /// Provides static methods that convert chunked data between UTF-8 and UTF-16 encodings, and methods that validate UTF-8 sequences.
///
public static class Utf8
{
@@ -816,6 +816,14 @@ private bool Fail()
return false;
}
}
+
+        /// <summary>
+        /// Finds the index of the first invalid UTF-8 subsequence.
+        /// </summary>
+        /// <param name="value">The <see cref="ReadOnlySpan{T}"/> containing the UTF-8 input text to examine.</param>
+        /// <returns>The index of the first invalid UTF-8 subsequence, or <c>-1</c> if the entire input is valid.</returns>
+        public static int IndexOfInvalidSubsequence(ReadOnlySpan<byte> value) =>
+            Utf8Utility.GetIndexOfFirstInvalidUtf8Sequence(value, out _);
#endif
///
diff --git a/src/libraries/System.Runtime/ref/System.Runtime.cs b/src/libraries/System.Runtime/ref/System.Runtime.cs
index c38e09a06fb819..6bb8a8766c5e88 100644
--- a/src/libraries/System.Runtime/ref/System.Runtime.cs
+++ b/src/libraries/System.Runtime/ref/System.Runtime.cs
@@ -16344,9 +16344,15 @@ void System.IDisposable.Dispose() { }
}
namespace System.Text.Unicode
{
+    public static partial class Utf16
+    {
+        public static int IndexOfInvalidSubsequence(System.ReadOnlySpan<char> value) { throw null; }
+        public static bool IsValid(System.ReadOnlySpan<char> value) { throw null; }
+    }
public static partial class Utf8
{
public static System.Buffers.OperationStatus FromUtf16(System.ReadOnlySpan source, System.Span destination, out int charsRead, out int bytesWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { throw null; }
+ public static int IndexOfInvalidSubsequence(System.ReadOnlySpan value) { throw null; }
public static bool IsValid(System.ReadOnlySpan value) { throw null; }
public static System.Buffers.OperationStatus ToUtf16(System.ReadOnlySpan source, System.Span destination, out int bytesRead, out int charsWritten, bool replaceInvalidSequences = true, bool isFinalBlock = true) { throw null; }
public static bool TryWrite(System.Span destination, System.IFormatProvider? provider, [System.Runtime.CompilerServices.InterpolatedStringHandlerArgumentAttribute(new string[]{ "destination", "provider"})] ref System.Text.Unicode.Utf8.TryWriteInterpolatedStringHandler handler, out int bytesWritten) { throw null; }
diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs
index 9229fd32126d11..e7e27b97f5f223 100644
--- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs
+++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf16UtilityTests.ValidateChars.cs
@@ -237,11 +237,17 @@ private static unsafe void GetIndexOfFirstInvalidUtf16Sequence_Test_Core(char[]
actualRuneCount = (int)ptrDiff + scalarCountAdjustment;
}
+ int actualIndexOfInvalidSubsequence = Utf16.IndexOfInvalidSubsequence(boundedMemory.Span);
+ bool actualIsValid = Utf16.IsValid(boundedMemory.Span);
+
// Assert
Assert.Equal(expectedRetVal, actualRetVal);
Assert.Equal(expectedRuneCount, actualRuneCount);
- Assert.Equal(actualUtf8CodeUnitCount, expectedUtf8ByteCount);
+ Assert.Equal(expectedUtf8ByteCount, actualUtf8CodeUnitCount);
+
+ Assert.Equal(expectedRetVal, actualIndexOfInvalidSubsequence);
+ Assert.Equal(expectedRetVal < 0, actualIsValid);
}
private static Lazy CreateGetPointerToFirstInvalidCharFn()
diff --git a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
index 4730337b0878ed..8c56d52977a872 100644
--- a/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
+++ b/src/libraries/System.Runtime/tests/System.Runtime.Tests/System/Text/Unicode/Utf8UtilityTests.ValidateBytes.cs
@@ -385,13 +385,17 @@ private static unsafe void GetIndexOfFirstInvalidUtf8Sequence_Test_Core(byte[] i
actualSurrogatePairCount = totalUtf16CodeUnitCount - actualRuneCount;
}
+ int actualIndexOfInvalidSubsequence = Utf8.IndexOfInvalidSubsequence(boundedMemory.Span);
+ bool actualIsValid = Utf8.IsValid(boundedMemory.Span);
+
// Assert
Assert.Equal(expectedRetVal, actualRetVal);
Assert.Equal(expectedRuneCount, actualRuneCount);
Assert.Equal(expectedSurrogatePairCount, actualSurrogatePairCount);
- Assert.True(Utf8.IsValid(boundedMemory.Span) == (expectedRetVal < 0));
+ Assert.Equal(expectedRetVal, actualIndexOfInvalidSubsequence);
+ Assert.Equal(expectedRetVal < 0, actualIsValid);
}
private static Lazy CreateGetPointerToFirstInvalidByteFn()
diff --git a/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs b/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs
index 46227356574632..40ebf11f710355 100644
--- a/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs
+++ b/src/libraries/System.Web.HttpUtility/src/System/Web/Util/HttpEncoder.cs
@@ -8,6 +8,7 @@
using System.IO;
using System.Net;
using System.Text;
+using System.Text.Unicode;
namespace System.Web.Util
{
@@ -676,12 +677,9 @@ internal string GetString()
Span chars = _charBuffer.Slice(0, _numChars);
- const char HIGH_SURROGATE_START = '\ud800';
- const char LOW_SURROGATE_END = '\udfff';
-
// Replace any invalid surrogate chars.
- int idxOfFirstSurrogate = chars.IndexOfAnyInRange(HIGH_SURROGATE_START, LOW_SURROGATE_END);
- for (int i = idxOfFirstSurrogate; (uint)i < (uint)chars.Length; i++)
+ int idxOfFirstInvalidSurrogate = Utf16.IndexOfInvalidSubsequence(chars);
+ for (int i = idxOfFirstInvalidSurrogate; (uint)i < (uint)chars.Length; i++)
{
if (char.IsHighSurrogate(chars[i]))
{