Fix Lexer EOF sentinel collision with valid Unicode code points (#1251)

Fixes an issue where the sentinel value (U+7FFF), when occurring literally in the source, could cause premature termination of parsing, leading to potential EOF-injection attacks.

---------

Co-authored-by: Dan Chao <dan.chao@apple.com>
This commit is contained in:
Spencer Phillip Young
2025-10-26 15:53:48 -07:00
committed by GitHub
parent a8f76d6209
commit fdb2bd8c75
2 changed files with 79 additions and 14 deletions

View File

@@ -30,7 +30,7 @@ public class Lexer {
private int sLine = 1;
private int col = 1;
private int sCol = 1;
private char lookahead;
private int lookahead;
private State state = State.DEFAULT;
private final Deque<InterpolationScope> interpolationStack = new ArrayDeque<>();
private boolean stringEnded = false;
@@ -38,7 +38,7 @@ public class Lexer {
// how many newlines exist between two subsequent tokens
protected int newLinesBetween = 0;
private static final char EOF = Short.MAX_VALUE;
private static final int EOF = -1;
public Lexer(String input) {
source = input.toCharArray();
@@ -248,7 +248,7 @@ public class Lexer {
yield lexNumber(ch);
} else if (isIdentifierStart(ch)) {
yield lexIdentifier();
} else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1);
} else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1);
}
};
}
@@ -450,7 +450,7 @@ public class Lexer {
case 'u' -> lexUnicodeEscape();
default ->
throw lexError(
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"),
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) ch, "\\"),
cursor - 2,
2);
};
@@ -513,7 +513,7 @@ public class Lexer {
}
}
private Token lexNumber(char start) {
private Token lexNumber(int start) {
if (start == '0') {
if (lookahead == 'x' || lookahead == 'X') {
nextChar();
@@ -626,9 +626,9 @@ public class Lexer {
if (lookahead == '_') {
throw lexError("invalidSeparatorPosition");
}
var ch = (int) lookahead;
var ch = lookahead;
if (!(ch >= 48 && ch <= 55)) {
throw unexpectedChar((char) ch, "octal number");
throw unexpectedChar(ch, "octal number");
}
while ((ch >= 48 && ch <= 55) || ch == '_') {
nextChar();
@@ -671,20 +671,19 @@ public class Lexer {
return Token.SHEBANG;
}
private boolean isHex(char ch) {
var code = (int) ch;
private boolean isHex(int code) {
return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70);
}
private static boolean isIdentifierStart(char c) {
private static boolean isIdentifierStart(int c) {
return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c);
}
private static boolean isIdentifierPart(char c) {
private static boolean isIdentifierPart(int c) {
return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c));
}
private char nextChar() {
private int nextChar() {
var tmp = lookahead;
cursor++;
if (cursor >= size) {
@@ -726,11 +725,11 @@ public class Lexer {
return new ParserError(msg, span);
}
private ParserError unexpectedChar(char got, String didYouMean) {
private ParserError unexpectedChar(int got, String didYouMean) {
if (got == EOF) {
return unexpectedChar("EOF", didYouMean);
}
return lexError("unexpectedCharacter", got, didYouMean);
return lexError("unexpectedCharacter", (char) got, didYouMean);
}
private ParserError unexpectedChar(String got, String didYouMean) {

View File

@@ -53,4 +53,70 @@ class LexerTest {
val thrown = assertThrows<ParserError> { Lexer("`").next() }
assertThat(thrown).hasMessageContaining("Unexpected character `EOF`")
}
@Test
fun rejectsSentinelBetweenTokens() {
val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass \uFFFF Bar")
assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
val thrown = assertThrows<ParserError> { lexerFFFF.next() }
assertThat(thrown).hasMessageContaining("Invalid identifier")
}
@Test
fun acceptsAllUnicodeCodepointsInComments() {
// Test valid Unicode codepoints can appear literally
// without being misinterpreted as EOF.
// Test the previously problematic U+7FFF (Short.MAX_VALUE)
val lexer7FFF = Lexer("// Comment with \u7FFF character\nclass Foo")
assertThat(lexer7FFF.next()).isEqualTo(Token.LINE_COMMENT)
assertThat(lexer7FFF.next()).isEqualTo(Token.CLASS)
assertThat(lexer7FFF.next()).isEqualTo(Token.IDENTIFIER)
assertThat(lexer7FFF.next()).isEqualTo(Token.EOF)
// Test U+FFFF (Character.MAX_VALUE)
val lexerFFFF = Lexer("// Comment with \uFFFF character\nclass Bar")
assertThat(lexerFFFF.next()).isEqualTo(Token.LINE_COMMENT)
assertThat(lexerFFFF.next()).isEqualTo(Token.CLASS)
assertThat(lexerFFFF.next()).isEqualTo(Token.IDENTIFIER)
assertThat(lexerFFFF.next()).isEqualTo(Token.EOF)
// Test a range of codepoints including edge cases
val testCodepoints =
listOf(
0x0000, // NULL
0x0001, // Start of heading
0x007F, // DELETE
0x0080, // First non-ASCII
0x7FFE, // One before the old problematic value
0x7FFF, // Old EOF sentinel (Short.MAX_VALUE)
0x8000, // One after the old problematic value
0xFFFE, // One before Character.MAX_VALUE
0xFFFF, // Character.MAX_VALUE (noncharacter)
)
for (codepoint in testCodepoints) {
val char = codepoint.toChar()
// Put the test character in a comment, followed by actual code tokens
val input = "// Test $char\nmodule Test"
val lexer = Lexer(input)
assertThat(lexer.next())
.withFailMessage("Codepoint U+%04X should be accepted in comment", codepoint)
.isEqualTo(Token.LINE_COMMENT)
assertThat(lexer.next())
.withFailMessage(
"Codepoint U+%04X should not terminate input early (expecting MODULE)",
codepoint,
)
.isEqualTo(Token.MODULE)
assertThat(lexer.next())
.withFailMessage(
"Codepoint U+%04X should not terminate input early (expecting IDENTIFIER)",
codepoint,
)
.isEqualTo(Token.IDENTIFIER)
assertThat(lexer.next()).isEqualTo(Token.EOF)
}
}
}