Fix Lexer EOF sentinel collision with valid Unicode code points (#1251)

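Fixes an issue where the lexer's EOF sentinel value (U+7FFF, i.e. `Short.MAX_VALUE` as a `char`), when it occurred literally in the source, could terminate parsing prematurely, opening the door to EOF injection attacks. The sentinel is now the out-of-range `int` value -1, which cannot collide with any character in the input.

---------

Co-authored-by: Dan Chao <dan.chao@apple.com>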
2026-04-14 12:39:44 +02:00 · 2025-10-26 15:53:48 -07:00
parent a8f76d6209
commit fdb2bd8c75
2 changed files with 79 additions and 14 deletions
--- a/pkl-parser/src/main/java/org/pkl/parser/Lexer.java
+++ b/pkl-parser/src/main/java/org/pkl/parser/Lexer.java
@@ -30,7 +30,7 @@ public class Lexer {
  private int sLine = 1;
  private int col = 1;
  private int sCol = 1;
-  private char lookahead;
+  private int lookahead;
  private State state = State.DEFAULT;
  private final Deque<InterpolationScope> interpolationStack = new ArrayDeque<>();
  private boolean stringEnded = false;
@@ -38,7 +38,7 @@ public class Lexer {
  // how many newlines exist between two subsequent tokens
  protected int newLinesBetween = 0;

-  private static final char EOF = Short.MAX_VALUE;
+  private static final int EOF = -1;

  public Lexer(String input) {
    source = input.toCharArray();
@@ -248,7 +248,7 @@ public class Lexer {
          yield lexNumber(ch);
        } else if (isIdentifierStart(ch)) {
          yield lexIdentifier();
-        } else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1);
+        } else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1);
      }
    };
  }
@@ -450,7 +450,7 @@ public class Lexer {
      case 'u' -> lexUnicodeEscape();
      default ->
          throw lexError(
-              ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"),
+              ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) ch, "\\"),
              cursor - 2,
              2);
    };
@@ -513,7 +513,7 @@ public class Lexer {
    }
  }

-  private Token lexNumber(char start) {
+  private Token lexNumber(int start) {
    if (start == '0') {
      if (lookahead == 'x' || lookahead == 'X') {
        nextChar();
@@ -626,9 +626,9 @@ public class Lexer {
    if (lookahead == '_') {
      throw lexError("invalidSeparatorPosition");
    }
-    var ch = (int) lookahead;
+    var ch = lookahead;
    if (!(ch >= 48 && ch <= 55)) {
-      throw unexpectedChar((char) ch, "octal number");
+      throw unexpectedChar(ch, "octal number");
    }
    while ((ch >= 48 && ch <= 55) || ch == '_') {
      nextChar();
@@ -671,20 +671,19 @@ public class Lexer {
    return Token.SHEBANG;
  }

-  private boolean isHex(char ch) {
-    var code = (int) ch;
+  private boolean isHex(int code) {
    return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70);
  }

-  private static boolean isIdentifierStart(char c) {
+  private static boolean isIdentifierStart(int c) {
    return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c);
  }

-  private static boolean isIdentifierPart(char c) {
+  private static boolean isIdentifierPart(int c) {
    return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c));
  }

-  private char nextChar() {
+  private int nextChar() {
    var tmp = lookahead;
    cursor++;
    if (cursor >= size) {
@@ -726,11 +725,11 @@ public class Lexer {
    return new ParserError(msg, span);
  }

-  private ParserError unexpectedChar(char got, String didYouMean) {
+  private ParserError unexpectedChar(int got, String didYouMean) {
    if (got == EOF) {
      return unexpectedChar("EOF", didYouMean);
    }
-    return lexError("unexpectedCharacter", got, didYouMean);
+    return lexError("unexpectedCharacter", (char) got, didYouMean);
  }

  private ParserError unexpectedChar(String got, String didYouMean) {