Fix Lexer EOF sentinel collision with valid Unicode code points (#1251)

Fixes an issue where the lexer's EOF sentinel value (U+7FFF), when it occurred literally in the source, could cause premature termination of parsing, leading to potential EOF injection attacks.

---------

Co-authored-by: Dan Chao <dan.chao@apple.com>
This commit is contained in:
Spencer Phillip Young
2025-10-26 15:53:48 -07:00
committed by GitHub
parent a8f76d6209
commit fdb2bd8c75
2 changed files with 79 additions and 14 deletions

View File

@@ -30,7 +30,7 @@ public class Lexer {
private int sLine = 1;
private int col = 1;
private int sCol = 1;
private char lookahead;
private int lookahead;
private State state = State.DEFAULT;
private final Deque<InterpolationScope> interpolationStack = new ArrayDeque<>();
private boolean stringEnded = false;
@@ -38,7 +38,7 @@ public class Lexer {
// how many newlines exist between two subsequent tokens
protected int newLinesBetween = 0;
private static final char EOF = Short.MAX_VALUE;
private static final int EOF = -1;
public Lexer(String input) {
source = input.toCharArray();
@@ -248,7 +248,7 @@ public class Lexer {
yield lexNumber(ch);
} else if (isIdentifierStart(ch)) {
yield lexIdentifier();
} else throw lexError(ErrorMessages.create("invalidCharacter", ch), cursor - 1, 1);
} else throw lexError(ErrorMessages.create("invalidCharacter", (char) ch), cursor - 1, 1);
}
};
}
@@ -450,7 +450,7 @@ public class Lexer {
case 'u' -> lexUnicodeEscape();
default ->
throw lexError(
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + ch, "\\"),
ErrorMessages.create("invalidCharacterEscapeSequence", "\\" + (char) ch, "\\"),
cursor - 2,
2);
};
@@ -513,7 +513,7 @@ public class Lexer {
}
}
private Token lexNumber(char start) {
private Token lexNumber(int start) {
if (start == '0') {
if (lookahead == 'x' || lookahead == 'X') {
nextChar();
@@ -626,9 +626,9 @@ public class Lexer {
if (lookahead == '_') {
throw lexError("invalidSeparatorPosition");
}
var ch = (int) lookahead;
var ch = lookahead;
if (!(ch >= 48 && ch <= 55)) {
throw unexpectedChar((char) ch, "octal number");
throw unexpectedChar(ch, "octal number");
}
while ((ch >= 48 && ch <= 55) || ch == '_') {
nextChar();
@@ -671,20 +671,19 @@ public class Lexer {
return Token.SHEBANG;
}
private boolean isHex(char ch) {
var code = (int) ch;
private boolean isHex(int code) {
return (code >= 48 && code <= 57) || (code >= 97 && code <= 102) || (code >= 65 && code <= 70);
}
private static boolean isIdentifierStart(char c) {
private static boolean isIdentifierStart(int c) {
return c == '_' || c == '$' || Character.isUnicodeIdentifierStart(c);
}
private static boolean isIdentifierPart(char c) {
private static boolean isIdentifierPart(int c) {
return c != EOF && (c == '$' || Character.isUnicodeIdentifierPart(c));
}
private char nextChar() {
private int nextChar() {
var tmp = lookahead;
cursor++;
if (cursor >= size) {
@@ -726,11 +725,11 @@ public class Lexer {
return new ParserError(msg, span);
}
private ParserError unexpectedChar(char got, String didYouMean) {
private ParserError unexpectedChar(int got, String didYouMean) {
if (got == EOF) {
return unexpectedChar("EOF", didYouMean);
}
return lexError("unexpectedCharacter", got, didYouMean);
return lexError("unexpectedCharacter", (char) got, didYouMean);
}
private ParserError unexpectedChar(String got, String didYouMean) {