From fdb2bd8c75a690f6d26d18e1cb967302672d225f Mon Sep 17 00:00:00 2001 From: Spencer Phillip Young Date: Sun, 26 Oct 2025 15:53:48 -0700 Subject: [PATCH] Fix Lexer EOF sentinel collision with valid Unicode code points (#1251) Fixes an issue where the EOF sentinel value (U+7FFF) occurring literally in the source could cause premature termination of parsing, leading to potential EOF injection attacks. --------- Co-authored-by: Dan Chao --- .../src/main/java/org/pkl/parser/Lexer.java | 27 ++++---- .../test/kotlin/org/pkl/parser/LexerTest.kt | 66 +++++++++++++++++++ 2 files changed, 79 insertions(+), 14 deletions(-) diff --git a/pkl-parser/src/main/java/org/pkl/parser/Lexer.java b/pkl-parser/src/main/java/org/pkl/parser/Lexer.java index a127286e..c79f811c 100644 --- a/pkl-parser/src/main/java/org/pkl/parser/Lexer.java +++ b/pkl-parser/src/main/java/org/pkl/parser/Lexer.java @@ -30,7 +30,7 @@ public class Lexer { private int sLine = 1; private int col = 1; private int sCol = 1; - private char lookahead; + private int lookahead; private State state = State.DEFAULT; private final Deque interpolationStack = new ArrayDeque<>(); private boolean stringEnded = false; @@ -38,7 +38,7 @@ public class Lexer { // how many newlines exist between two subsequent tokens protected int newLinesBetween = 0; - private static final char EOF = Short.MAX_VALUE; + private static final int EOF = -1; public Lexer(String input) { source = input.toCharArray(); @@ -248,7 +248,7 @@ public class Lexer { yield lexNumber(ch); } else if (isIdentifierStart(ch)) { yield lexIdentifier(); - } else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1); + } else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1); } }; } @@ -450,7 +450,7 @@ public class Lexer { case 'u' -> lexUnicodeEscape(); default -> throw lexError( - ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"), + ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) 
ch, "\\"), cursor - 2, 2); }; @@ -513,7 +513,7 @@ public class Lexer { } } - private Token lexNumber(char start) { + private Token lexNumber(int start) { if (start == '0') { if (lookahead == 'x' || lookahead == 'X') { nextChar(); @@ -626,9 +626,9 @@ public class Lexer { if (lookahead == '_') { throw lexError("invalidSeparatorPosition"); } - var ch = (int) lookahead; + var ch = lookahead; if (!(ch >= 48 && ch <= 55)) { - throw unexpectedChar((char) ch, "octal number"); + throw unexpectedChar(ch, "octal number"); } while ((ch >= 48 && ch <= 55) || ch == '_') { nextChar(); @@ -671,20 +671,19 @@ public class Lexer { return Token.SHEBANG; } - private boolean isHex(char ch) { - var code = (int) ch; + private boolean isHex(int code) { return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70); } - private static boolean isIdentifierStart(char c) { + private static boolean isIdentifierStart(int c) { return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c); } - private static boolean isIdentifierPart(char c) { + private static boolean isIdentifierPart(int c) { return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c)); } - private char nextChar() { + private int nextChar() { var tmp = lookahead; cursor++; if (cursor >= size) { @@ -726,11 +725,11 @@ public class Lexer { return new ParserError(msg, span); } - private ParserError unexpectedChar(char got, String didYouMean) { + private ParserError unexpectedChar(int got, String didYouMean) { if (got == EOF) { return unexpectedChar("EOF", didYouMean); } - return lexError("unexpectedCharacter", got, didYouMean); + return lexError("unexpectedCharacter", (char) got, didYouMean); } private ParserError unexpectedChar(String got, String didYouMean) { diff --git a/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt b/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt index e76eebab..c9c19841 100644 --- a/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt +++ 
b/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt @@ -53,4 +53,70 @@ class LexerTest { val thrown = assertThrows { Lexer("`").next() } assertThat(thrown).hasMessageContaining("Unexpected character `EOF`") } + + @Test + fun rejectsSentinelBetweenTokens() { + val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass \uFFFF Bar") + assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT) + assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS) + val thrown = assertThrows { lexerFFFF.next() } + assertThat(thrown).hasMessageContaining("Invalid identifier") + } + + @Test + fun acceptsAllUnicodeCodepointsInComments() { + // Test valid Unicode codepoints can appear literally + // without being misinterpreted as EOF. + + // Test the previously problematic U+7FFF (Short.MAX_VALUE) + val lexer7FFF = Lexer("// Comment with \u7FFF character\nclass Foo") + assertThat(lexer7FFF.next()).isEqualTo(Token.LINE_COMMENT) + assertThat(lexer7FFF.next()).isEqualTo(Token.CLASS) + assertThat(lexer7FFF.next()).isEqualTo(Token.IDENTIFIER) + assertThat(lexer7FFF.next()).isEqualTo(Token.EOF) + + // Test U+FFFF (Character.MAX_VALUE) + val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass Bar") + assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT) + assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS) + assertThat(lexerFFFF.next()).isEqualTo(Token.IDENTIFIER) + assertThat(lexerFFFF.next()).isEqualTo(Token.EOF) + + // Test a range of codepoints including edge cases + val testCodepoints = + listOf( + 0x0000, // NULL + 0x0001, // Start of heading + 0x007F, // DELETE + 0x0080, // First non-ASCII + 0x7FFE, // One before the old problematic value + 0x7FFF, // Old EOF sentinel (Short.MAX_VALUE) + 0x8000, // One after the old problematic value + 0xFFFE, // One before Character.MAX_VALUE + 0xFFFF, // Character.MAX_VALUE (noncharacter) + ) + + for (codepoint in testCodepoints) { + val char = codepoint.toChar() + // Put the test character in a comment, followed by actual code 
tokens + val input = "// Test $char\nmodule Test" + val lexer = Lexer(input) + assertThat(lexer.next()) + .withFailMessage("Codepoint U+%04X should be accepted in comment", codepoint) + .isEqualTo(Token.LINE_COMMENT) + assertThat(lexer.next()) + .withFailMessage( + "Codepoint U+%04X should not terminate input early (expecting MODULE)", + codepoint, + ) + .isEqualTo(Token.MODULE) + assertThat(lexer.next()) + .withFailMessage( + "Codepoint U+%04X should not terminate input early (expecting IDENTIFIER)", + codepoint, + ) + .isEqualTo(Token.IDENTIFIER) + assertThat(lexer.next()).isEqualTo(Token.EOF) + } + } }