Fix Lexer EOF sentinel collision with valid Unicode code points (#1251)

Fixes an issue where the sentinel value (U+7FFF), when occurring literally in the source, could cause premature termination of parsing, leading to potential EOF-injection attacks.

---------

Co-authored-by: Dan Chao <dan.chao@apple.com>
This commit is contained in:
Spencer Phillip Young
2025-10-26 15:53:48 -07:00
committed by GitHub
parent a8f76d6209
commit fdb2bd8c75
2 changed files with 79 additions and 14 deletions

View File

@@ -30,7 +30,7 @@ public class Lexer {
private int sLine = 1;
private int col = 1;
private int sCol = 1;
private char lookahead;
private int lookahead;
private State state = State.DEFAULT;
private final Deque<InterpolationScope> interpolationStack = new ArrayDeque<>();
private boolean stringEnded = false;
@@ -38,7 +38,7 @@ public class Lexer {
// how many newlines exist between two subsequent tokens
protected int newLinesBetween = 0;
private static final char EOF = Short.MAX_VALUE;
private static final int EOF = -1;
public Lexer(String input) {
source = input.toCharArray();
@@ -248,7 +248,7 @@ public class Lexer {
yield lexNumber(ch);
} else if (isIdentifierStart(ch)) {
yield lexIdentifier();
} else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1);
} else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1);
}
};
}
@@ -450,7 +450,7 @@ public class Lexer {
case 'u' -> lexUnicodeEscape();
default ->
throw lexError(
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"),
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) ch, "\\"),
cursor - 2,
2);
};
@@ -513,7 +513,7 @@ public class Lexer {
}
}
private Token lexNumber(char start) {
private Token lexNumber(int start) {
if (start == '0') {
if (lookahead == 'x' || lookahead == 'X') {
nextChar();
@@ -626,9 +626,9 @@ public class Lexer {
if (lookahead == '_') {
throw lexError("invalidSeparatorPosition");
}
var ch = (int) lookahead;
var ch = lookahead;
if (!(ch >= 48 && ch <= 55)) {
throw unexpectedChar((char) ch, "octal number");
throw unexpectedChar(ch, "octal number");
}
while ((ch >= 48 && ch <= 55) || ch == '_') {
nextChar();
@@ -671,20 +671,19 @@ public class Lexer {
return Token.SHEBANG;
}
private boolean isHex(char ch) {
var code = (int) ch;
private boolean isHex(int code) {
return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70);
}
private static boolean isIdentifierStart(char c) {
private static boolean isIdentifierStart(int c) {
return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c);
}
private static boolean isIdentifierPart(char c) {
private static boolean isIdentifierPart(int c) {
return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c));
}
private char nextChar() {
private int nextChar() {
var tmp = lookahead;
cursor++;
if (cursor >= size) {
@@ -726,11 +725,11 @@ public class Lexer {
return new ParserError(msg, span);
}
private ParserError unexpectedChar(char got, String didYouMean) {
private ParserError unexpectedChar(int got, String didYouMean) {
if (got == EOF) {
return unexpectedChar("EOF", didYouMean);
}
return lexError("unexpectedCharacter", got, didYouMean);
return lexError("unexpectedCharacter", (char) got, didYouMean);
}
private ParserError unexpectedChar(String got, String didYouMean) {

View File

@@ -53,4 +53,70 @@ class LexerTest {
val thrown = assertThrows<ParserError> { Lexer("`").next() }
assertThat(thrown).hasMessageContaining("Unexpected character `EOF`")
}
@Test
fun rejectsSentinelBetweenTokens() {
val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass \uFFFF Bar")
assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
val thrown = assertThrows<ParserError> { lexerFFFF.next() }
assertThat(thrown).hasMessageContaining("Invalid identifier")
}
@Test
fun acceptsAllUnicodeCodepointsInComments() {
// Test valid Unicode codepoints can appear literally
// without being misinterpreted as EOF.
// Test the previously problematic U+7FFF (Short.MAX_VALUE)
val lexer7FFF = Lexer("// Comment with \u7FFF character\nclass Foo")
assertThat(lexer7FFF.next()).isEqualTo(Token.LINE_COMMENT)
assertThat(lexer7FFF.next()).isEqualTo(Token.CLASS)
assertThat(lexer7FFF.next()).isEqualTo(Token.IDENTIFIER)
assertThat(lexer7FFF.next()).isEqualTo(Token.EOF)
// Test U+FFFF (Character.MAX_VALUE)
val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass Bar")
assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
assertThat(lexerFFFF.next()).isEqualTo(Token.IDENTIFIER)
assertThat(lexerFFFF.next()).isEqualTo(Token.EOF)
// Test a range of codepoints including edge cases
val testCodepoints =
listOf(
0x0000, // NULL
0x0001, // Start of heading
0x007F, // DELETE
0x0080, // First non-ASCII
0x7FFE, // One before the old problematic value
0x7FFF, // Old EOF sentinel (Short.MAX_VALUE)
0x8000, // One after the old problematic value
0xFFFE, // One before Character.MAX_VALUE
0xFFFF, // Character.MAX_VALUE (noncharacter)
)
for (codepoint in testCodepoints) {
val char = codepoint.toChar()
// Put the test character in a comment, followed by actual code tokens
val input = "// Test $char\nmodule Test"
val lexer = Lexer(input)
assertThat(lexer.next())
.withFailMessage("Codepoint U+%04X should be accepted in comment", codepoint)
.isEqualTo(Token.LINE_COMMENT)
assertThat(lexer.next())
.withFailMessage(
"Codepoint U+%04X should not terminate input early (expecting MODULE)",
codepoint,
)
.isEqualTo(Token.MODULE)
assertThat(lexer.next())
.withFailMessage(
"Codepoint U+%04X should not terminate input early (expecting IDENTIFIER)",
codepoint,
)
.isEqualTo(Token.IDENTIFIER)
assertThat(lexer.next()).isEqualTo(Token.EOF)
}
}
}