Skip to content

Commit 1d065b6

Browse files
authored
Fix string comparison with ordinal casing with Surrogates (#55771)
* Fix string comparison with ordinal casing with Surrogates * Address the feedback
1 parent 04a8f5e commit 1d065b6

3 files changed

Lines changed: 83 additions & 46 deletions

File tree

src/libraries/System.Globalization/tests/CompareInfo/CompareInfoTests.Compare.cs

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,20 @@ public static IEnumerable<object[]> Compare_TestData()
273273
yield return new object[] { new CultureInfo("de-DE_phoneb").CompareInfo, "\u00DC", "UE", CompareOptions.None, useNls ? 0 : -1 };
274274
yield return new object[] { new CultureInfo("es-ES_tradnl").CompareInfo, "llegar", "lugar", CompareOptions.None, useNls ? 1 : -1 };
275275
}
276+
277+
//
278+
// Ordinal comparisons with ignore casing.
279+
//
280+
281+
yield return new object[] { s_invariantCompare, "abcd", "abcd", CompareOptions.OrdinalIgnoreCase, 0};
282+
yield return new object[] { s_invariantCompare, "abcd", "ABCD", CompareOptions.OrdinalIgnoreCase, 0};
283+
yield return new object[] { s_invariantCompare, "Hello\u00F6", "HELLO\u00D6", CompareOptions.OrdinalIgnoreCase, 0};
284+
yield return new object[] { s_invariantCompare, "Hello\uFE6A", "Hello\U0001F601", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
285+
yield return new object[] { s_invariantCompare, "Hello\U0001F601", "Hello\uFE6A", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 1};
286+
yield return new object[] { s_invariantCompare, "\uDBFF", "\uD800\uDC00", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
287+
yield return new object[] { s_invariantCompare, "\uD800\uDC00", "\uDBFF", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 1};
288+
yield return new object[] { s_invariantCompare, "abcdefg\uDBFF", "abcdefg\uD800\uDC00", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
289+
yield return new object[] { s_invariantCompare, "\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 0};
276290
}
277291

278292
// There is a regression in Windows 190xx version with the Kana comparison. Avoid running this test there.

src/libraries/System.Private.CoreLib/src/System/Globalization/InvariantModeCasing.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,14 +161,15 @@ internal static void ToLower(ReadOnlySpan<char> source, Span<char> destination)
161161
}
162162

163163
[MethodImpl(MethodImplOptions.AggressiveInlining)]
164-
private static (uint, int) GetScalar(ref char charA, int index, int length)
164+
private static (uint, int) GetScalar(ref char source, int index, int length)
165165
{
166+
char charA = source;
166167
if (!char.IsHighSurrogate(charA) || index >= length - 1)
167168
{
168169
return ((uint)charA, 1);
169170
}
170171

171-
ref char charB = ref Unsafe.Add(ref charA, 1);
172+
char charB = Unsafe.Add(ref source, 1);
172173
if (!char.IsLowSurrogate(charB))
173174
{
174175
return ((uint)charA, 1);

src/libraries/System.Private.CoreLib/src/System/Globalization/OrdinalCasing.Icu.cs

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// Licensed to the .NET Foundation under one or more agreements.
22
// The .NET Foundation licenses this file to you under the MIT license.
33

4+
using System.Text;
45
using System.Diagnostics;
56
using System.Threading;
67
using System.Runtime.InteropServices;
@@ -196,72 +197,93 @@ internal static int CompareStringIgnoreCase(ref char strA, int lengthA, ref char
196197
ref char charA = ref strA;
197198
ref char charB = ref strB;
198199

199-
while (length != 0)
200+
int index = 0;
201+
202+
while (index < length)
200203
{
201-
// optimize for Ascii cases
202-
if (charA <= '\u00FF' || length == 1 || !char.IsHighSurrogate(charA) || !char.IsHighSurrogate(charB))
204+
char a = charA;
205+
char b = charB;
206+
char lowSurrogateA = '\0';
207+
208+
if (!char.IsHighSurrogate(a) || index >= lengthA - 1 || !char.IsLowSurrogate(lowSurrogateA = Unsafe.Add(ref charA, 1)))
203209
{
204-
if (charA == charB)
210+
if (!char.IsHighSurrogate(b) || index >= lengthB - 1 || !char.IsLowSurrogate(Unsafe.Add(ref charB, 1)))
205211
{
206-
length--;
207-
charA = ref Unsafe.Add(ref charA, 1);
208-
charB = ref Unsafe.Add(ref charB, 1);
209-
continue;
210-
}
212+
//
213+
// Neither A or B are surrogates
214+
//
211215

212-
char aUpper = OrdinalCasing.ToUpper(charA);
213-
char bUpper = OrdinalCasing.ToUpper(charB);
216+
if (b == a)
217+
{
218+
index++;
219+
charA = ref Unsafe.Add(ref charA, 1);
220+
charB = ref Unsafe.Add(ref charB, 1);
221+
continue;
222+
}
214223

215-
if (aUpper == bUpper)
216-
{
217-
length--;
218-
charA = ref Unsafe.Add(ref charA, 1);
219-
charB = ref Unsafe.Add(ref charB, 1);
220-
continue;
224+
char aUpper = OrdinalCasing.ToUpper(a);
225+
char bUpper = OrdinalCasing.ToUpper(b);
226+
227+
if (aUpper == bUpper)
228+
{
229+
index++;
230+
charA = ref Unsafe.Add(ref charA, 1);
231+
charB = ref Unsafe.Add(ref charB, 1);
232+
continue;
233+
}
234+
235+
return a - b;
221236
}
222237

223-
return aUpper - bUpper;
238+
//
239+
// charA is not surrogate and charB is valid surrogate
240+
//
241+
242+
return -1;
224243
}
225244

226-
// We come here only of we have valid high surrogates and length > 1
245+
//
246+
// A is Surrogate
247+
//
227248

228-
char a = charA;
229-
char b = charB;
230-
231-
length--;
232-
charA = ref Unsafe.Add(ref charA, 1);
233-
charB = ref Unsafe.Add(ref charB, 1);
249+
char lowSurrogateB = '\0';
234250

235-
if (!char.IsLowSurrogate(charA) || !char.IsLowSurrogate(charB))
251+
if (!char.IsHighSurrogate(b) || index >= lengthB - 1 || !char.IsLowSurrogate(lowSurrogateB = Unsafe.Add(ref charB, 1)))
236252
{
237-
// malformed Surrogates - should be rare cases
238-
if (a != b)
239-
{
240-
return a - b;
241-
}
253+
//
254+
// charB is not surrogate and charA is surrogate
255+
//
242256

243-
// Should be pointing to the right characters in the string to resume at.
244-
// Just in case we could be pointing at high surrogate now.
245-
continue;
257+
return 1;
246258
}
247259

248-
// we come here only if we have valid full surrogates
249-
SurrogateCasing.ToUpper(a, charA, out char h1, out char l1);
250-
SurrogateCasing.ToUpper(b, charB, out char h2, out char l2);
260+
//
261+
// charA and charB are surrogates
262+
//
263+
264+
Debug.Assert(lowSurrogateA != '\0');
265+
Debug.Assert(lowSurrogateB != '\0');
251266

252-
if (h1 != h2)
267+
if (a == b && lowSurrogateA == lowSurrogateB)
253268
{
254-
return (int)h1 - (int)h2;
269+
index += 2;
270+
charA = ref Unsafe.Add(ref charA, 2);
271+
charB = ref Unsafe.Add(ref charB, 2);
272+
continue;
255273
}
256274

257-
if (l1 != l2)
275+
uint upperSurrogateA = CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(a, lowSurrogateA));
276+
uint upperSurrogateB = CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(b, lowSurrogateB));
277+
278+
if (upperSurrogateA == upperSurrogateB)
258279
{
259-
return (int)l1 - (int)l2;
280+
index += 2;
281+
charA = ref Unsafe.Add(ref charA, 2);
282+
charB = ref Unsafe.Add(ref charB, 2);
283+
continue;
260284
}
261285

262-
length--;
263-
charA = ref Unsafe.Add(ref charA, 1);
264-
charB = ref Unsafe.Add(ref charB, 1);
286+
return (int)upperSurrogateA - (int)upperSurrogateB;
265287
}
266288

267289
return lengthA - lengthB;

0 commit comments

Comments
 (0)