From fdb2bd8c75a690f6d26d18e1cb967302672d225f Mon Sep 17 00:00:00 2001
From: Spencer Phillip Young <spencer.young@spyoung.com>
Date: Sun, 26 Oct 2025 15:53:48 -0700
Subject: [PATCH] Fix Lexer EOF sentinel collision with valid Unicode code
 points (#1251)

Fixes an issue where the lexer's EOF sentinel value (U+7FFF), when it occurred literally in the source, could cause parsing to terminate prematurely, opening the door to EOF injection attacks.

---------

Co-authored-by: Dan Chao <dan.chao@apple.com>
---
 .../src/main/java/org/pkl/parser/Lexer.java   | 27 ++++----
 .../test/kotlin/org/pkl/parser/LexerTest.kt   | 66 +++++++++++++++++++
 2 files changed, 79 insertions(+), 14 deletions(-)

diff --git a/pkl-parser/src/main/java/org/pkl/parser/Lexer.java b/pkl-parser/src/main/java/org/pkl/parser/Lexer.java
index a127286e..c79f811c 100644
--- a/pkl-parser/src/main/java/org/pkl/parser/Lexer.java
+++ b/pkl-parser/src/main/java/org/pkl/parser/Lexer.java
@@ -30,7 +30,7 @@ public class Lexer {
   private int sLine = 1;
   private int col = 1;
   private int sCol = 1;
-  private char lookahead;
+  private int lookahead;
   private State state = State.DEFAULT;
   private final Deque<InterpolationScope> interpolationStack = new ArrayDeque<>();
   private boolean stringEnded = false;
@@ -38,7 +38,7 @@ public class Lexer {
   // how many newlines exist between two subsequent tokens
   protected int newLinesBetween = 0;
 
-  private static final char EOF = Short.MAX_VALUE;
+  private static final int EOF = -1;
 
   public Lexer(String input) {
     source = input.toCharArray();
@@ -248,7 +248,7 @@ public class Lexer {
           yield lexNumber(ch);
         } else if (isIdentifierStart(ch)) {
           yield lexIdentifier();
-        } else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1);
+        } else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1);
       }
     };
   }
@@ -450,7 +450,7 @@ public class Lexer {
       case 'u' -> lexUnicodeEscape();
       default ->
           throw lexError(
-              ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"),
+              ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) ch, "\\"),
               cursor - 2,
               2);
     };
@@ -513,7 +513,7 @@ public class Lexer {
     }
   }
 
-  private Token lexNumber(char start) {
+  private Token lexNumber(int start) {
     if (start == '0') {
       if (lookahead == 'x' || lookahead == 'X') {
         nextChar();
@@ -626,9 +626,9 @@ public class Lexer {
     if (lookahead == '_') {
       throw lexError("invalidSeparatorPosition");
     }
-    var ch = (int) lookahead;
+    var ch = lookahead;
     if (!(ch >= 48 && ch <= 55)) {
-      throw unexpectedChar((char) ch, "octal number");
+      throw unexpectedChar(ch, "octal number");
     }
     while ((ch >= 48 && ch <= 55) || ch == '_') {
       nextChar();
@@ -671,20 +671,19 @@ public class Lexer {
     return Token.SHEBANG;
   }
 
-  private boolean isHex(char ch) {
-    var code = (int) ch;
+  private boolean isHex(int code) {
     return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70);
   }
 
-  private static boolean isIdentifierStart(char c) {
+  private static boolean isIdentifierStart(int c) {
     return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c);
   }
 
-  private static boolean isIdentifierPart(char c) {
+  private static boolean isIdentifierPart(int c) {
     return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c));
   }
 
-  private char nextChar() {
+  private int nextChar() {
     var tmp = lookahead;
     cursor++;
     if (cursor >= size) {
@@ -726,11 +725,11 @@ public class Lexer {
     return new ParserError(msg, span);
   }
 
-  private ParserError unexpectedChar(char got, String didYouMean) {
+  private ParserError unexpectedChar(int got, String didYouMean) {
     if (got == EOF) {
       return unexpectedChar("EOF", didYouMean);
     }
-    return lexError("unexpectedCharacter", got, didYouMean);
+    return lexError("unexpectedCharacter", (char) got, didYouMean);
   }
 
   private ParserError unexpectedChar(String got, String didYouMean) {
diff --git a/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt b/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt
index e76eebab..c9c19841 100644
--- a/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt
+++ b/pkl-parser/src/test/kotlin/org/pkl/parser/LexerTest.kt
@@ -53,4 +53,70 @@ class LexerTest {
     val thrown = assertThrows<ParserError> { Lexer("`").next() }
     assertThat(thrown).hasMessageContaining("Unexpected character `EOF`")
   }
+
+  @Test
+  fun rejectsSentinelBetweenTokens() {
+    val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass \uFFFF Bar")
+    assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT) // U+FFFF in the comment is fine
+    assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
+    val thrown = assertThrows<ParserError> { lexerFFFF.next() } // bare U+FFFF after `class`
+    assertThat(thrown).hasMessageContaining("Invalid identifier")
+  }
+
+  @Test
+  fun acceptsAllUnicodeCodepointsInComments() {
+    // Each character below appears literally inside a line comment; the tokens that
+    // follow must still be lexed, i.e. the character must not be mistaken for EOF.
+
+    // U+7FFF equals Short.MAX_VALUE, the value the lexer previously used as its EOF sentinel.
+    val lexer7FFF = Lexer("// Comment with \u7FFF character\nclass Foo")
+    assertThat(lexer7FFF.next()).isEqualTo(Token.LINE_COMMENT)
+    assertThat(lexer7FFF.next()).isEqualTo(Token.CLASS)
+    assertThat(lexer7FFF.next()).isEqualTo(Token.IDENTIFIER)
+    assertThat(lexer7FFF.next()).isEqualTo(Token.EOF)
+
+    // U+FFFF equals Character.MAX_VALUE, the largest value a single char can hold.
+    val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass Bar")
+    assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
+    assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
+    assertThat(lexerFFFF.next()).isEqualTo(Token.IDENTIFIER)
+    assertThat(lexerFFFF.next()).isEqualTo(Token.EOF)
+
+    // Boundary values: both ends of the char range plus the neighbours of the old sentinel.
+    val testCodepoints =
+      listOf(
+        0x0000, // NULL
+        0x0001, // Start of heading
+        0x007F, // DELETE
+        0x0080, // First non-ASCII
+        0x7FFE, // One before the old problematic value
+        0x7FFF, // Old EOF sentinel (Short.MAX_VALUE)
+        0x8000, // One after the old problematic value
+        0xFFFE, // One before Character.MAX_VALUE
+        0xFFFF, // Character.MAX_VALUE (noncharacter)
+      )
+
+    for (codepoint in testCodepoints) {
+      val char = codepoint.toChar()
+      // Embed the character in a line comment, then follow it with real tokens on the next line.
+      val input = "// Test $char\nmodule Test"
+      val lexer = Lexer(input)
+      assertThat(lexer.next())
+        .withFailMessage("Codepoint U+%04X should be accepted in comment", codepoint)
+        .isEqualTo(Token.LINE_COMMENT)
+      assertThat(lexer.next())
+        .withFailMessage(
+          "Codepoint U+%04X should not terminate input early (expecting MODULE)",
+          codepoint,
+        )
+        .isEqualTo(Token.MODULE)
+      assertThat(lexer.next())
+        .withFailMessage(
+          "Codepoint U+%04X should not terminate input early (expecting IDENTIFIER)",
+          codepoint,
+        )
+        .isEqualTo(Token.IDENTIFIER)
+      assertThat(lexer.next()).isEqualTo(Token.EOF)
+    }
+  }
 }