From 1cab0238fabea1b193f27f502ada4c14d014f4cb Mon Sep 17 00:00:00 2001
From: Flavio Soibelmann Glock <fglock@gmail.com>
Date: Thu, 29 Jan 2026 16:38:06 +0100
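Subject: [PATCH] Fix uni/variables.t diagnostics under no-utf8/evalbytes

Lexer: consumeWhitespace() now consumes only space, tab and form feed
instead of every character accepted by Character.isWhitespace().

IdentifierParser: compute utf8Enabled once at the top of
parseComplexIdentifierInner(). Under `no utf8` / `evalbytes`, reject
non-ASCII code points in multi-character IDENTIFIER tokens with
"Unrecognized character", and reject whitespace, format and control
characters even as length-1 variable names. Adjust the hex formatting
used in the diagnostic for U+FFFD and for control bytes.

ParseInfix: report an unexpected infix operator as "syntax error".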

---
 src/main/java/org/perlonjava/lexer/Lexer.java |  2 +-
 .../perlonjava/parser/IdentifierParser.java   | 48 ++++++++++++++++---
 .../org/perlonjava/parser/ParseInfix.java     |  2 +-
 3 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/src/main/java/org/perlonjava/lexer/Lexer.java b/src/main/java/org/perlonjava/lexer/Lexer.java
index 96fd6b8f..2b907560 100644
--- a/src/main/java/org/perlonjava/lexer/Lexer.java
+++ b/src/main/java/org/perlonjava/lexer/Lexer.java
@@ -174,7 +174,7 @@ public LexerToken consumeWhitespace() {
         while (position < length
                 && input.charAt(position) != '\n'
                 && input.charAt(position) != '\r'
-                && (input.charAt(position) == ' ' || Character.isWhitespace(input.charAt(position)))) {
+                && (input.charAt(position) == ' ' || input.charAt(position) == '\t' || input.charAt(position) == '\f')) {
             position++;
         }
         return new LexerToken(LexerTokenType.WHITESPACE, input.substring(start, position));
diff --git a/src/main/java/org/perlonjava/parser/IdentifierParser.java b/src/main/java/org/perlonjava/parser/IdentifierParser.java
index 7c7b7baa..4ee5df98 100644
--- a/src/main/java/org/perlonjava/parser/IdentifierParser.java
+++ b/src/main/java/org/perlonjava/parser/IdentifierParser.java
@@ -155,6 +155,28 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
         LexerToken token = parser.tokens.get(parser.tokenIndex);
         LexerToken nextToken = parser.tokens.get(parser.tokenIndex + 1);
 
+        // In `no utf8` mode (or `evalbytes`), Perl still allows many non-ASCII bytes as length-1 variables,
+        // but it must reject whitespace-like bytes and format/control bytes. Additionally, for length-2+
+        // identifiers, non-ASCII bytes are not allowed.
+        boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
+                && !parser.ctx.compilerOptions.isEvalbytes;
+
+        if (!utf8Enabled && token.type == LexerTokenType.IDENTIFIER) {
+            // The Lexer may have greedily consumed non-ASCII identifier parts into a single IDENTIFIER token.
+            // Under `no utf8` / `evalbytes`, those are not allowed for length-2+ variables.
+            String id = token.text;
+            if (id.length() > 1) {
+                for (int i = 0; i < id.length(); ) {
+                    int cp = id.codePointAt(i);
+                    if (cp > 127) {
+                        String hex = "\\x{" + Integer.toHexString(cp) + "}";
+                        throw new PerlCompilerException("Unrecognized character " + hex + ";");
+                    }
+                    i += Character.charCount(cp);
+                }
+            }
+        }
+
         if (skippedWhitespace) {
             // Perl allows "$ a" (whitespace before an identifier). But if whitespace is followed by
             // something that cannot start an identifier (e.g. "$\t = 4"), Perl reports a syntax error.
@@ -209,8 +231,6 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
 
             // Under 'no utf8', Perl allows many non-ASCII bytes as length-1 variables.
             // Only enforce XID_START there for multi-character identifiers.
-            boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
-                    && !parser.ctx.compilerOptions.isEvalbytes;
             boolean hasMoreIdentifierContent = insideBraces
                     && (nextToken.type == LexerTokenType.IDENTIFIER || nextToken.type == LexerTokenType.NUMBER);
             boolean mustValidateStart = utf8Enabled || id.length() > 1 || hasMoreIdentifierContent;
@@ -220,19 +240,37 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
             // Also reject control characters (0x00-0x1F, 0x7F) as identifier starts.
             // Reject control characters and other non-graphic bytes that Perl treats as invalid variable names.
             // In particular, C1 controls (0x80-0x9F) must always be rejected.
+            // Under `no utf8` / `evalbytes`, reject whitespace-like and format/control characters even
+            // for length-1 variables.
+            boolean rejectEvenAsLengthOne = !utf8Enabled
+                    && id.length() == 1
+                    && (UCharacter.hasBinaryProperty(cp, UProperty.WHITE_SPACE)
+                    || UCharacter.getType(cp) == UCharacter.FORMAT
+                    || UCharacter.getType(cp) == UCharacter.CONTROL);
+
             if (cp == 0xFFFD
                     || cp < 32
                     || cp == 127
                     || (cp >= 0x80 && cp <= 0x9F)
+                    || rejectEvenAsLengthOne
                     || (mustValidateStart && !valid)) {
                 String hex;
                 // Special case: if we got the Unicode replacement character (0xFFFD),
                 // it likely means the original was an invalid UTF-8 byte sequence.
                 // For Perl compatibility, we should report a representative invalid byte.
                 if (cp == 0xFFFD) {
-                    hex = "\\xB6";
+                    hex = utf8Enabled ? "\\x{fffd}" : "\\xB6";
                 } else {
-                    if (cp <= 255) {
+                    if (cp < 32 || cp == 127) {
+                        // Perl formats control bytes differently depending on the syntactic form:
+                        // inside ${...} it commonly reports them as \xNN (e.g. \x00), while for bare
+                        // length-1 identifiers it uses the braced form \x{n} (e.g. \x{0}).
+                        if (insideBraces) {
+                            hex = String.format("\\x%02x", cp);
+                        } else {
+                            hex = "\\x{" + Integer.toHexString(cp) + "}";
+                        }
+                    } else if (cp <= 255) {
                         // Perl tends to report non-ASCII bytes as \x{..} in these contexts
                         hex = "\\x{" + Integer.toHexString(cp) + "}";
                     } else {
@@ -253,8 +291,6 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
                 int cp = id.codePointAt(0);
                 boolean valid = cp == '_' || UCharacter.hasBinaryProperty(cp, UProperty.XID_START);
 
-                boolean utf8Enabled = parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)
-                        && !parser.ctx.compilerOptions.isEvalbytes;
                 boolean mustValidateStart = utf8Enabled || id.length() > 1;
 
                 if (mustValidateStart && !valid) {
diff --git a/src/main/java/org/perlonjava/parser/ParseInfix.java b/src/main/java/org/perlonjava/parser/ParseInfix.java
index 23d5dd3c..eca75b33 100644
--- a/src/main/java/org/perlonjava/parser/ParseInfix.java
+++ b/src/main/java/org/perlonjava/parser/ParseInfix.java
@@ -295,7 +295,7 @@ public static Node parseInfixOperation(Parser parser, Node left, int precedence)
                     parser.tokenIndex--;
                     return left;
                 }
-                throw new PerlCompilerException(parser.tokenIndex, "Unexpected infix operator: " + token, parser.ctx.errorUtil);
+                throw new PerlCompilerException(parser.tokenIndex, "syntax error", parser.ctx.errorUtil);
         }
     }