From 1cab0238fabea1b193f27f502ada4c14d014f4cb Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Thu, 29 Jan 2026 16:38:06 +0100 Subject: [PATCH] Fix uni/variables.t diagnostics under no-utf8/evalbytes --- src/main/java/org/perlonjava/lexer/Lexer.java | 2 +- .../perlonjava/parser/IdentifierParser.java | 48 ++++++++++++++++--- .../org/perlonjava/parser/ParseInfix.java | 2 +- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/perlonjava/lexer/Lexer.java b/src/main/java/org/perlonjava/lexer/Lexer.java index 96fd6b8f..2b907560 100644 --- a/src/main/java/org/perlonjava/lexer/Lexer.java +++ b/src/main/java/org/perlonjava/lexer/Lexer.java @@ -174,7 +174,7 @@ public LexerToken consumeWhitespace() { while (position < length && input.charAt(position) != '\n' && input.charAt(position) != '\r' - && (input.charAt(position) == ' ' || Character.isWhitespace(input.charAt(position)))) { + && (input.charAt(position) == ' ' || input.charAt(position) == '\t' || input.charAt(position) == '\f')) { position++; } return new LexerToken(LexerTokenType.WHITESPACE, input.substring(start, position)); diff --git a/src/main/java/org/perlonjava/parser/IdentifierParser.java b/src/main/java/org/perlonjava/parser/IdentifierParser.java index 7c7b7baa..4ee5df98 100644 --- a/src/main/java/org/perlonjava/parser/IdentifierParser.java +++ b/src/main/java/org/perlonjava/parser/IdentifierParser.java @@ -155,6 +155,28 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr LexerToken token = parser.tokens.get(parser.tokenIndex); LexerToken nextToken = parser.tokens.get(parser.tokenIndex + 1); + // In `no utf8` mode (or `evalbytes`), Perl still allows many non-ASCII bytes as length-1 variables, + // but it must reject whitespace-like bytes and format/control bytes. Additionally, for length-2+ + // identifiers, non-ASCII bytes are not allowed. + boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) + && !parser.ctx.compilerOptions.isEvalbytes; + + if (!utf8Enabled && token.type == LexerTokenType.IDENTIFIER) { + // The Lexer may have greedily consumed non-ASCII identifier parts into a single IDENTIFIER token. + // Under `no utf8` / `evalbytes`, those are not allowed for length-2+ variables. + String id = token.text; + if (id.length() > 1) { + for (int i = 0; i < id.length(); ) { + int cp = id.codePointAt(i); + if (cp > 127) { + String hex = "\\x{" + Integer.toHexString(cp) + "}"; + throw new PerlCompilerException("Unrecognized character " + hex + ";"); + } + i += Character.charCount(cp); + } + } + } + if (skippedWhitespace) { // Perl allows "$ a" (whitespace before an identifier). But if whitespace is followed by // something that cannot start an identifier (e.g. "$\t = 4"), Perl reports a syntax error. @@ -209,8 +231,6 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr // Under 'no utf8', Perl allows many non-ASCII bytes as length-1 variables. // Only enforce XID_START there for multi-character identifiers. - boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) - && !parser.ctx.compilerOptions.isEvalbytes; boolean hasMoreIdentifierContent = insideBraces && (nextToken.type == LexerTokenType.IDENTIFIER || nextToken.type == LexerTokenType.NUMBER); boolean mustValidateStart = utf8Enabled || id.length() > 1 || hasMoreIdentifierContent; @@ -220,19 +240,37 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr // Also reject control characters (0x00-0x1F, 0x7F) as identifier starts. // Reject control characters and other non-graphic bytes that Perl treats as invalid variable names. // In particular, C1 controls (0x80-0x9F) must always be rejected. + // Under `no utf8` / `evalbytes`, reject whitespace-like and format/control characters even + // for length-1 variables. + boolean rejectEvenAsLengthOne = !utf8Enabled + && id.length() == 1 + && (UCharacter.hasBinaryProperty(cp, UProperty.WHITE_SPACE) + || UCharacter.getType(cp) == UCharacter.FORMAT + || UCharacter.getType(cp) == UCharacter.CONTROL); + if (cp == 0xFFFD || cp < 32 || cp == 127 || (cp >= 0x80 && cp <= 0x9F) + || rejectEvenAsLengthOne || (mustValidateStart && !valid)) { String hex; // Special case: if we got the Unicode replacement character (0xFFFD), // it likely means the original was an invalid UTF-8 byte sequence. // For Perl compatibility, we should report a representative invalid byte. if (cp == 0xFFFD) { - hex = "\\xB6"; + hex = utf8Enabled ? "\\x{fffd}" : "\\xB6"; } else { - if (cp <= 255) { + if (cp < 32 || cp == 127) { + // Perl formats control bytes differently depending on the syntactic form. + // In ${...} contexts it commonly uses \xNN, while for bare length-1 identifiers + // (e.g. \x{0}) it uses \x{n}. + if (insideBraces) { + hex = String.format("\\x%02x", cp); + } else { + hex = "\\x{" + Integer.toHexString(cp) + "}"; + } + } else if (cp <= 255) { // Perl tends to report non-ASCII bytes as \x{..} in these contexts hex = "\\x{" + Integer.toHexString(cp) + "}"; } else { @@ -253,8 +291,6 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr int cp = id.codePointAt(0); boolean valid = cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_START); - boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8) - && !parser.ctx.compilerOptions.isEvalbytes; boolean mustValidateStart = utf8Enabled || id.length() > 1; if (mustValidateStart && !valid) { diff --git a/src/main/java/org/perlonjava/parser/ParseInfix.java b/src/main/java/org/perlonjava/parser/ParseInfix.java index 23d5dd3c..eca75b33 100644 --- a/src/main/java/org/perlonjava/parser/ParseInfix.java +++ b/src/main/java/org/perlonjava/parser/ParseInfix.java @@ -295,7 +295,7 @@ public static Node parseInfixOperation(Parser parser, Node left, int precedence) parser.tokenIndex--; return left; } - throw new PerlCompilerException(parser.tokenIndex, "Unexpected infix operator: " + token, parser.ctx.errorUtil); + throw new PerlCompilerException(parser.tokenIndex, "syntax error", parser.ctx.errorUtil); } }