Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/main/java/org/perlonjava/lexer/Lexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ public LexerToken consumeWhitespace() {
while (position < length
&& input.charAt(position) != '\n'
&& input.charAt(position) != '\r'
&& (input.charAt(position) == ' ' || Character.isWhitespace(input.charAt(position)))) {
&& (input.charAt(position) == ' ' || input.charAt(position) == '\t' || input.charAt(position) == '\f')) {
position++;
}
return new LexerToken(LexerTokenType.WHITESPACE, input.substring(start, position));
Expand Down
48 changes: 42 additions & 6 deletions src/main/java/org/perlonjava/parser/IdentifierParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,28 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
LexerToken token = parser.tokens.get(parser.tokenIndex);
LexerToken nextToken = parser.tokens.get(parser.tokenIndex + 1);

// In `no utf8` mode (or `evalbytes`), Perl still allows many non-ASCII bytes as length-1 variables,
// but it must reject whitespace-like bytes and format/control bytes. Additionally, for length-2+
// identifiers, non-ASCII bytes are not allowed.
boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !parser.ctx.compilerOptions.isEvalbytes;

if (!utf8Enabled && token.type == LexerTokenType.IDENTIFIER) {
// The Lexer may have greedily consumed non-ASCII identifier parts into a single IDENTIFIER token.
// Under `no utf8` / `evalbytes`, those are not allowed for length-2+ variables.
String id = token.text;
if (id.length() > 1) {
for (int i = 0; i < id.length(); ) {
int cp = id.codePointAt(i);
if (cp > 127) {
String hex = "\\x{" + Integer.toHexString(cp) + "}";
throw new PerlCompilerException("Unrecognized character " + hex + ";");
}
i += Character.charCount(cp);
}
}
}

if (skippedWhitespace) {
// Perl allows "$ a" (whitespace before an identifier). But if whitespace is followed by
// something that cannot start an identifier (e.g. "$\t = 4"), Perl reports a syntax error.
Expand Down Expand Up @@ -209,8 +231,6 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr

// Under 'no utf8', Perl allows many non-ASCII bytes as length-1 variables.
// Only enforce XID_START there for multi-character identifiers.
boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !parser.ctx.compilerOptions.isEvalbytes;
boolean hasMoreIdentifierContent = insideBraces
&& (nextToken.type == LexerTokenType.IDENTIFIER || nextToken.type == LexerTokenType.NUMBER);
boolean mustValidateStart = utf8Enabled || id.length() > 1 || hasMoreIdentifierContent;
Expand All @@ -220,19 +240,37 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
// Reject control characters (0x00-0x1F, 0x7F) and other non-graphic bytes as identifier starts,
// because Perl treats them as invalid variable names.
// In particular, C1 controls (0x80-0x9F) must always be rejected.
// Under `no utf8` / `evalbytes`, reject whitespace-like and format/control characters even
// for length-1 variables.
boolean rejectEvenAsLengthOne = !utf8Enabled
&& id.length() == 1
&& (UCharacter.hasBinaryProperty(cp, UProperty.WHITE_SPACE)
|| UCharacter.getType(cp) == UCharacter.FORMAT
|| UCharacter.getType(cp) == UCharacter.CONTROL);

if (cp == 0xFFFD
|| cp < 32
|| cp == 127
|| (cp >= 0x80 && cp <= 0x9F)
|| rejectEvenAsLengthOne
|| (mustValidateStart && !valid)) {
String hex;
// Special case: if we got the Unicode replacement character (0xFFFD),
// it likely means the original was an invalid UTF-8 byte sequence.
// For Perl compatibility, we should report a representative invalid byte.
if (cp == 0xFFFD) {
hex = "\\xB6";
hex = utf8Enabled ? "\\x{fffd}" : "\\xB6";
} else {
if (cp <= 255) {
if (cp < 32 || cp == 127) {
// Perl formats control bytes differently depending on the syntactic form.
// In ${...} contexts it commonly uses \xNN, while for bare length-1 identifiers
// (e.g. \x{0}) it uses \x{n}.
if (insideBraces) {
hex = String.format("\\x%02x", cp);
} else {
hex = "\\x{" + Integer.toHexString(cp) + "}";
}
} else if (cp <= 255) {
// Perl tends to report non-ASCII bytes as \x{..} in these contexts
hex = "\\x{" + Integer.toHexString(cp) + "}";
} else {
Expand All @@ -253,8 +291,6 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
int cp = id.codePointAt(0);
boolean valid = cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_START);

boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
&& !parser.ctx.compilerOptions.isEvalbytes;
boolean mustValidateStart = utf8Enabled || id.length() > 1;

if (mustValidateStart && !valid) {
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/perlonjava/parser/ParseInfix.java
Original file line number Diff line number Diff line change
Expand Up @@ -295,7 +295,7 @@ public static Node parseInfixOperation(Parser parser, Node left, int precedence)
parser.tokenIndex--;
return left;
}
throw new PerlCompilerException(parser.tokenIndex, "Unexpected infix operator: " + token, parser.ctx.errorUtil);
throw new PerlCompilerException(parser.tokenIndex, "syntax error", parser.ctx.errorUtil);
}
}

Expand Down