Fix errors around strings with lone high or low surrogates (#1673)

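This fixes errors in the treatment of strings that contain lone (unpaired)
high or low surrogates. A high surrogate is now combined with the next char
only when that char is a low surrogate; otherwise a lone surrogate counts as
a single character.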
This commit is contained in:
Daniel Chao
2026-06-10 08:35:22 -07:00
committed by GitHub
parent 1a1e1cfea9
commit 27fe06c796
7 changed files with 52 additions and 13 deletions
@@ -1,5 +1,5 @@
/*
* Copyright © 2024-2025 Apple Inc. and the Pkl project authors. All rights reserved.
* Copyright © 2024-2026 Apple Inc. and the Pkl project authors. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -44,7 +44,9 @@ public abstract class SubscriptNode extends BinaryExpressionNode {
.build();
}
if (Character.isHighSurrogate(receiver.charAt(charIndex))) {
if (Character.isHighSurrogate(receiver.charAt(charIndex))
&& charIndex < receiver.length() - 1
&& Character.isLowSurrogate(receiver.charAt(charIndex + 1))) {
return receiver.substring(charIndex, charIndex + 2);
}
return receiver.substring(charIndex, charIndex + 1);
@@ -675,13 +675,14 @@ public final class VmUtils {
var charOffset = startIndex;
while (charOffset < length && codePointOffset > 0) {
if (Character.isHighSurrogate(string.charAt(charOffset++))
var ch = string.charAt(charOffset);
charOffset++;
if (Character.isHighSurrogate(ch)
&& charOffset < length
&& !Character.isLowSurrogate(string.charAt(charOffset++))) {
codePointOffset -= 2;
} else {
codePointOffset -= 1;
&& Character.isLowSurrogate(string.charAt(charOffset))) {
charOffset++;
}
codePointOffset--;
}
return codePointOffset != 0 ? -1 : charOffset;
@@ -692,13 +693,14 @@ public final class VmUtils {
var charOffset = string.length();
while (charOffset > 0 && codePointOffset > 0) {
if (Character.isLowSurrogate(string.charAt(--charOffset))
charOffset--;
char ch = string.charAt(charOffset);
if (Character.isLowSurrogate(ch)
&& charOffset > 0
&& !Character.isHighSurrogate(string.charAt(--charOffset))) {
codePointOffset -= 2;
} else {
codePointOffset -= 1;
&& Character.isHighSurrogate(string.charAt(charOffset - 1))) {
charOffset--;
}
codePointOffset--;
}
return codePointOffset != 0 ? -1 : charOffset;
@@ -220,7 +220,9 @@ public final class StringNodes {
var charIndex = VmUtils.codePointOffsetToCharOffset(self, index);
if (charIndex == -1 || charIndex == self.length()) return VmNull.withoutDefault();
if (Character.isHighSurrogate(self.charAt(charIndex))) {
if (Character.isHighSurrogate(self.charAt(charIndex))
&& charIndex < self.length() - 1
&& Character.isLowSurrogate(self.charAt(charIndex + 1))) {
return self.substring(charIndex, charIndex + 2);
}
return self.substring(charIndex, charIndex + 1);
@@ -415,6 +415,7 @@ examples {
str1.substring(2, 3)
str1.substring(2, 4)
str1.substring(0, 7)
"\u{D800}hello".substring(0, 2)
module.catch(() -> str1.substring(-1, 4))
module.catch(() -> str1.substring(1, 8))
module.catch(() -> str1.substring(3, 2))
@@ -436,6 +437,15 @@ examples {
str1.getOrNull(3)
str1.getOrNull(6)
str1.getOrNull(7)
"🏀".getOrNull(0)
"🏀".getOrNull(1)
"\u{D800}".getOrNull(0)
"\u{D800}".getOrNull(1)
"\u{D800}h".getOrNull(0)
"\u{D800}h".getOrNull(1)
"\u{D800}h".getOrNull(2)
"h\u{D800}".getOrNull(0)
"h\u{D800}".getOrNull(1)
}
["toCodePoints()"] {
@@ -59,6 +59,13 @@ examples {
str3[2]
module.catch(() -> str3[-1])
module.catch(() -> str3[4])
"🏀"[0]
"\u{D800}"[0]
"\u{D800}h"[0]
"\u{D800}h"[1]
"h\u{D800}"[0]
"h\u{D800}"[1]
}
["dollar sign has no special meaning"] {
@@ -353,6 +353,7 @@ examples {
"c"
"cd"
"abcdefg"
"?h"
"Character index `-1` is out of range `0`..`7`. String: \"abcdefg\""
"Character index `8` is out of range `1`..`7`. String: \"abcdefg\""
"Character index `2` is out of range `3`..`7`. String: \"abcdefg\""
@@ -372,6 +373,15 @@ examples {
"d"
"g"
null
"🏀"
null
"?"
null
"?"
"h"
null
"h"
"?"
}
["toCodePoints()"] {
List(97, 98, 99, 100, 101, 102, 103)
@@ -51,6 +51,12 @@ examples {
"i"
"Character index `-1` is out of range `0`..`3`. String: \"this\""
"Character index `4` is out of range `0`..`3`. String: \"this\""
"🏀"
"?"
"?"
"h"
"h"
"?"
}
["dollar sign has no special meaning"] {
"123$"