mirror of
https://github.com/apple/pkl.git
synced 2026-03-21 16:49:13 +01:00
Fix Lexer EOF sentinel collision with valid Unicode code points (#1251)
Fixes an issue where the lexer's EOF sentinel value (U+7FFF), when it occurred literally in the source, could cause premature termination of parsing, leading to potential EOF injection attacks. --------- Co-authored-by: Dan Chao <dan.chao@apple.com>
This commit is contained in:
committed by
GitHub
parent
a8f76d6209
commit
fdb2bd8c75
@@ -30,7 +30,7 @@ public class Lexer {
|
||||
private int sLine = 1;
|
||||
private int col = 1;
|
||||
private int sCol = 1;
|
||||
private char lookahead;
|
||||
private int lookahead;
|
||||
private State state = State.DEFAULT;
|
||||
private final Deque<InterpolationScope> interpolationStack = new ArrayDeque<>();
|
||||
private boolean stringEnded = false;
|
||||
@@ -38,7 +38,7 @@ public class Lexer {
|
||||
// how many newlines exist between two subsequent tokens
|
||||
protected int newLinesBetween = 0;
|
||||
|
||||
private static final char EOF = Short.MAX_VALUE;
|
||||
private static final int EOF = -1;
|
||||
|
||||
public Lexer(String input) {
|
||||
source = input.toCharArray();
|
||||
@@ -248,7 +248,7 @@ public class Lexer {
|
||||
yield lexNumber(ch);
|
||||
} else if (isIdentifierStart(ch)) {
|
||||
yield lexIdentifier();
|
||||
} else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1);
|
||||
} else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1);
|
||||
}
|
||||
};
|
||||
}
|
||||
@@ -450,7 +450,7 @@ public class Lexer {
|
||||
case 'u' -> lexUnicodeEscape();
|
||||
default ->
|
||||
throw lexError(
|
||||
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"),
|
||||
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) ch, "\\"),
|
||||
cursor - 2,
|
||||
2);
|
||||
};
|
||||
@@ -513,7 +513,7 @@ public class Lexer {
|
||||
}
|
||||
}
|
||||
|
||||
private Token lexNumber(char start) {
|
||||
private Token lexNumber(int start) {
|
||||
if (start == '0') {
|
||||
if (lookahead == 'x' || lookahead == 'X') {
|
||||
nextChar();
|
||||
@@ -626,9 +626,9 @@ public class Lexer {
|
||||
if (lookahead == '_') {
|
||||
throw lexError("invalidSeparatorPosition");
|
||||
}
|
||||
var ch = (int) lookahead;
|
||||
var ch = lookahead;
|
||||
if (!(ch >= 48 && ch <= 55)) {
|
||||
throw unexpectedChar((char) ch, "octal number");
|
||||
throw unexpectedChar(ch, "octal number");
|
||||
}
|
||||
while ((ch >= 48 && ch <= 55) || ch == '_') {
|
||||
nextChar();
|
||||
@@ -671,20 +671,19 @@ public class Lexer {
|
||||
return Token.SHEBANG;
|
||||
}
|
||||
|
||||
private boolean isHex(char ch) {
|
||||
var code = (int) ch;
|
||||
private boolean isHex(int code) {
|
||||
return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70);
|
||||
}
|
||||
|
||||
private static boolean isIdentifierStart(char c) {
|
||||
private static boolean isIdentifierStart(int c) {
|
||||
return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c);
|
||||
}
|
||||
|
||||
private static boolean isIdentifierPart(char c) {
|
||||
private static boolean isIdentifierPart(int c) {
|
||||
return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c));
|
||||
}
|
||||
|
||||
private char nextChar() {
|
||||
private int nextChar() {
|
||||
var tmp = lookahead;
|
||||
cursor++;
|
||||
if (cursor >= size) {
|
||||
@@ -726,11 +725,11 @@ public class Lexer {
|
||||
return new ParserError(msg, span);
|
||||
}
|
||||
|
||||
private ParserError unexpectedChar(char got, String didYouMean) {
|
||||
private ParserError unexpectedChar(int got, String didYouMean) {
|
||||
if (got == EOF) {
|
||||
return unexpectedChar("EOF", didYouMean);
|
||||
}
|
||||
return lexError("unexpectedCharacter", got, didYouMean);
|
||||
return lexError("unexpectedCharacter", (char) got, didYouMean);
|
||||
}
|
||||
|
||||
private ParserError unexpectedChar(String got, String didYouMean) {
|
||||
|
||||
@@ -53,4 +53,70 @@ class LexerTest {
|
||||
val thrown = assertThrows<ParserError> { Lexer("`").next() }
|
||||
assertThat(thrown).hasMessageContaining("Unexpected character `EOF`")
|
||||
}
|
||||
|
||||
@Test
|
||||
fun rejectsSentinelBetweenTokens() {
|
||||
val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass \uFFFF Bar")
|
||||
assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
|
||||
assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
|
||||
val thrown = assertThrows<ParserError> { lexerFFFF.next() }
|
||||
assertThat(thrown).hasMessageContaining("Invalid identifier")
|
||||
}
|
||||
|
||||
@Test
|
||||
fun acceptsAllUnicodeCodepointsInComments() {
|
||||
// Test valid Unicode codepoints can appear literally
|
||||
// without being misinterpreted as EOF.
|
||||
|
||||
// Test the previously problematic U+7FFF (Short.MAX_VALUE)
|
||||
val lexer7FFF = Lexer("// Comment with \u7FFF character\nclass Foo")
|
||||
assertThat(lexer7FFF.next()).isEqualTo(Token.LINE_COMMENT)
|
||||
assertThat(lexer7FFF.next()).isEqualTo(Token.CLASS)
|
||||
assertThat(lexer7FFF.next()).isEqualTo(Token.IDENTIFIER)
|
||||
assertThat(lexer7FFF.next()).isEqualTo(Token.EOF)
|
||||
|
||||
// Test U+FFFF (Character.MAX_VALUE)
|
||||
val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass Bar")
|
||||
assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
|
||||
assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
|
||||
assertThat(lexerFFFF.next()).isEqualTo(Token.IDENTIFIER)
|
||||
assertThat(lexerFFFF.next()).isEqualTo(Token.EOF)
|
||||
|
||||
// Test a range of codepoints including edge cases
|
||||
val testCodepoints =
|
||||
listOf(
|
||||
0x0000, // NULL
|
||||
0x0001, // Start of heading
|
||||
0x007F, // DELETE
|
||||
0x0080, // First non-ASCII
|
||||
0x7FFE, // One before the old problematic value
|
||||
0x7FFF, // Old EOF sentinel (Short.MAX_VALUE)
|
||||
0x8000, // One after the old problematic value
|
||||
0xFFFE, // One before Character.MAX_VALUE
|
||||
0xFFFF, // Character.MAX_VALUE (noncharacter)
|
||||
)
|
||||
|
||||
for (codepoint in testCodepoints) {
|
||||
val char = codepoint.toChar()
|
||||
// Put the test character in a comment, followed by actual code tokens
|
||||
val input = "// Test $char\nmodule Test"
|
||||
val lexer = Lexer(input)
|
||||
assertThat(lexer.next())
|
||||
.withFailMessage("Codepoint U+%04X should be accepted in comment", codepoint)
|
||||
.isEqualTo(Token.LINE_COMMENT)
|
||||
assertThat(lexer.next())
|
||||
.withFailMessage(
|
||||
"Codepoint U+%04X should not terminate input early (expecting MODULE)",
|
||||
codepoint,
|
||||
)
|
||||
.isEqualTo(Token.MODULE)
|
||||
assertThat(lexer.next())
|
||||
.withFailMessage(
|
||||
"Codepoint U+%04X should not terminate input early (expecting IDENTIFIER)",
|
||||
codepoint,
|
||||
)
|
||||
.isEqualTo(Token.IDENTIFIER)
|
||||
assertThat(lexer.next()).isEqualTo(Token.EOF)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user