From 32c8ac3ba230931d01c590dcc925295806a5af0f Mon Sep 17 00:00:00 2001
From: Flavio Soibelmann Glock <fglock@gmail.com>
Date: Thu, 29 Jan 2026 13:57:47 +0100
Subject: [PATCH] Fix Unicode identifiers and related strict-vars/regex
 regressions

---
 .../org/perlonjava/codegen/EmitVariable.java  |  18 +-
 src/main/java/org/perlonjava/lexer/Lexer.java | 212 ++++++++++--------
 .../org/perlonjava/operators/Operator.java    |   8 +-
 .../perlonjava/parser/IdentifierParser.java   |  80 +++++--
 .../perlonjava/parser/StatementParser.java    |  10 +
 .../java/org/perlonjava/parser/Variable.java  |   2 +-
 .../runtime/RuntimeSubstrLvalue.java          |   9 +-
 7 files changed, 216 insertions(+), 123 deletions(-)

diff --git a/src/main/java/org/perlonjava/codegen/EmitVariable.java b/src/main/java/org/perlonjava/codegen/EmitVariable.java
index 04bf87ba4..f17793dd2 100644
--- a/src/main/java/org/perlonjava/codegen/EmitVariable.java
+++ b/src/main/java/org/perlonjava/codegen/EmitVariable.java
@@ -327,10 +327,12 @@ static void handleVariableOperator(EmitterVisitor emitterVisitor, OperatorNode n
                 // ===== STRICT VARS LOGIC =====
                 // Determine if this variable should be allowed under 'use strict "vars"'
                 
-                // Special case: $a and $b in main:: package are exempt from strict
+                // Special case: $a and $b are exempt from strict
                 // (they're used by sort() without declaration)
                 String normalizedName = NameNormalizer.normalizeVariableName(name, emitterVisitor.ctx.symbolTable.getCurrentPackage());
-                boolean isSpecialSortVar = sigil.equals("$") && ("main::a".equals(normalizedName) || "main::b".equals(normalizedName));
+                boolean isSpecialSortVar = sigil.equals("$")
+                        && !name.contains("::")
+                        && (name.equals("a") || name.equals("b"));
 
                 boolean allowIfAlreadyExists = false;
                 if (emitterVisitor.ctx.symbolTable.isStrictOptionEnabled(HINT_STRICT_VARS)) {
@@ -341,6 +343,18 @@ static void handleVariableOperator(EmitterVisitor emitterVisitor, OperatorNode n
                     } else if (sigil.equals("%") && !normalizedName.endsWith("::")) {
                         allowIfAlreadyExists = GlobalVariable.existsGlobalHash(normalizedName);
                     }
+
+                    // Perl's strict 'vars' requires declaration for unqualified globals like $A
+                    // even if they were previously created under 'no strict'.
+                    // Keep this narrow to avoid changing behavior for other globals.
+                    if (sigil.equals("$")
+                            && name != null
+                            && name.length() == 1
+                            && Character.isLetter(name.charAt(0))
+                            && !name.contains("::")
+                            && !isSpecialSortVar) {
+                        allowIfAlreadyExists = false;
+                    }
                 }
 
                 // Compute createIfNotExists flag - determines if variable can be auto-vivified
diff --git a/src/main/java/org/perlonjava/lexer/Lexer.java b/src/main/java/org/perlonjava/lexer/Lexer.java
index ca3e68ef1..eac727e8f 100644
--- a/src/main/java/org/perlonjava/lexer/Lexer.java
+++ b/src/main/java/org/perlonjava/lexer/Lexer.java
@@ -53,7 +53,7 @@ public class Lexer {
     }
 
     // Input characters to be tokenized
-    public char[] input;
+    public String input;
     // Current position in the input
     public int position;
     // Length of the input
@@ -61,11 +61,29 @@ public class Lexer {
 
     // Constructor to initialize the Lexer with input string
     public Lexer(String input) {
-        this.input = input.toCharArray();
-        this.length = this.input.length;
+        this.input = input;
+        this.length = this.input.length();
         this.position = 0;
     }
 
+    private int getCurrentCodePoint() {
+        if (position >= length) {
+            return -1;
+        }
+        char c1 = input.charAt(position);
+        if (Character.isHighSurrogate(c1) && position + 1 < length) {
+            char c2 = input.charAt(position + 1);
+            if (Character.isLowSurrogate(c2)) {
+                return Character.toCodePoint(c1, c2);
+            }
+        }
+        return c1;
+    }
+
+    private void advanceCodePoint(int codePoint) {
+        position += Character.charCount(codePoint);
+    }
+
     // Main method for testing the Lexer
     public static void main(String[] args) {
         // Sample code to be tokenized
@@ -116,7 +134,8 @@ public LexerToken nextToken() {
             return null;
         }
 
-        char current = input[position];
+        char current = input.charAt(position);
+        int currentCp = getCurrentCodePoint();
 
         if (isAsciiWhitespace(current)) {
             if (current == '\n') {
@@ -131,315 +150,318 @@ public LexerToken nextToken() {
             }
         } else if (current >= '0' && current <= '9') {
             return consumeNumber();
-        } else if (current == '_' || UCharacter.hasBinaryProperty(current, UProperty.XID_START)) {
+        } else if (currentCp == '_' || Character.isUnicodeIdentifierStart(currentCp)) {
             return consumeIdentifier();
         } else if (current < 128 && isOperator[current]) {
             return consumeOperator();
         } else {
-            position++;
-            return new LexerToken(LexerTokenType.STRING, String.valueOf(current));
+            int start = position;
+            advanceCodePoint(currentCp);
+            return new LexerToken(LexerTokenType.STRING, input.substring(start, position));
         }
     }
 
     public LexerToken consumeWhitespace() {
         int start = position;
         while (position < length
-                && input[position] != '\n'
-                && input[position] != '\r'
-                && (input[position] == ' ' || Character.isWhitespace(input[position]))) {
+                && input.charAt(position) != '\n'
+                && input.charAt(position) != '\r'
+                && (input.charAt(position) == ' ' || Character.isWhitespace(input.charAt(position)))) {
             position++;
         }
-        return new LexerToken(LexerTokenType.WHITESPACE, new String(input, start, position - start));
+        return new LexerToken(LexerTokenType.WHITESPACE, input.substring(start, position));
     }
 
     public LexerToken consumeNumber() {
         int start = position;
-        while (position < length && ((input[position] >= '0' && input[position] <= '9') || input[position] == '_')) {
+        while (position < length && ((input.charAt(position) >= '0' && input.charAt(position) <= '9') || input.charAt(position) == '_')) {
             position++;
         }
-        return new LexerToken(LexerTokenType.NUMBER, new String(input, start, position - start));
+        return new LexerToken(LexerTokenType.NUMBER, input.substring(start, position));
     }
 
     public LexerToken consumeIdentifier() {
         int start = position;
-        position++; // Move past the initial character we already validated
+        int cp = getCurrentCodePoint();
+        advanceCodePoint(cp); // Move past the initial character we already validated
 
         while (position < length) {
-            int current = input[position];
-            if (current == '_' || UCharacter.hasBinaryProperty(current, UProperty.XID_CONTINUE)) {
-                position++;
+            int curCp = getCurrentCodePoint();
+            if (curCp == '_' || Character.isUnicodeIdentifierPart(curCp)) {
+                advanceCodePoint(curCp);
             } else {
                 break;
             }
         }
-        return new LexerToken(LexerTokenType.IDENTIFIER, new String(input, start, position - start));
+        // Build token text using substring to preserve surrogate pairs correctly
+        return new LexerToken(LexerTokenType.IDENTIFIER, input.substring(start, position));
     }
 
     public LexerToken consumeOperator() {
         int start = position;
-        char current = input[position];
+        char current = input.charAt(position);
         if (position < length && (current < 128 && isOperator[current])) {
             switch (current) {
                 case '!':
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "!=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '~') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '~') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "!~");
                     }
                     break;
                 case '$':
-                    if (position + 2 <= input.length && input[position + 1] == '#') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '#') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "$#");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '*') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '*') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "$*");
                     }
                     break;
                 case '@':
-                    if (position + 2 <= input.length && input[position + 1] == '*') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '*') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "@*");
                     }
                     break;
                 case '%':
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "%=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '*') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '*') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "%*");
                     }
                     break;
                 case '&':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '&'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '&'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "&&=");
                     }
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '.'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '.'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "&.=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '&') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '&') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "&&");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '*') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '*') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "&*");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "&=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '.') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '.') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "&.");
                     }
                     break;
                 case '*':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '*'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '*'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "**=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '*') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '*') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "**");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "*=");
                     }
                     break;
                 case '+':
-                    if (position + 2 <= input.length && input[position + 1] == '+') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '+') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "++");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "+=");
                     }
                     break;
                 case '-':
-                    if (position + 2 <= input.length && input[position + 1] == '-') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '-') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "--");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "-=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '>') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '>') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "->");
                     }
                     break;
                 case '.':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '.'
-                            && input[position + 2] == '.') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '.'
+                            && input.charAt(position + 2) == '.') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "...");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '.') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '.') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "..");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, ".=");
                     }
                     break;
                 case '/':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '/'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '/'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "//=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '/') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '/') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "//");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "/=");
                     }
                     break;
                 case ':':
-                    if (position + 2 <= input.length && input[position + 1] == ':') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == ':') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "::");
                     }
                     break;
                 case '<':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '<'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '<'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "<<=");
                     }
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '='
-                            && input[position + 2] == '>') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '='
+                            && input.charAt(position + 2) == '>') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "<=>");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '<') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '<') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "<<");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "<=");
                     }
                     break;
                 case '=':
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "==");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '>') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '>') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "=>");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '~') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '~') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "=~");
                     }
                     break;
                 case '>':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '>'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '>'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, ">>=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, ">=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '>') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '>') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, ">>");
                     }
                     break;
                 case '^':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '^'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '^'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "^^=");
                     }
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '.'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '.'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "^.=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "^=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '^') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '^') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "^^");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '.') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '.') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "^.");
                     }
                     break;
                 case 'x':
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "x=");
                     }
                     break;
                 case '|':
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '.'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '.'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "|.=");
                     }
-                    if (position + 3 <= input.length
-                            && input[position + 1] == '|'
-                            && input[position + 2] == '=') {
+                    if (position + 3 <= input.length()
+                            && input.charAt(position + 1) == '|'
+                            && input.charAt(position + 2) == '=') {
                         position += 3;
                         return new LexerToken(LexerTokenType.OPERATOR, "||=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '=') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '=') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "|=");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '|') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '|') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "||");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '.') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '.') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "|.");
                     }
                     break;
                 case '~':
-                    if (position + 2 <= input.length && input[position + 1] == '~') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '~') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "~~");
                     }
-                    if (position + 2 <= input.length && input[position + 1] == '.') {
+                    if (position + 2 <= input.length() && input.charAt(position + 1) == '.') {
                         position += 2;
                         return new LexerToken(LexerTokenType.OPERATOR, "~.");
                     }
@@ -448,7 +470,7 @@ public LexerToken consumeOperator() {
         }
 
         position++;
-        return new LexerToken(LexerTokenType.OPERATOR, new String(input, start, 1));
+        return new LexerToken(LexerTokenType.OPERATOR, input.substring(start, start + 1));
     }
 }
 
diff --git a/src/main/java/org/perlonjava/operators/Operator.java b/src/main/java/org/perlonjava/operators/Operator.java
index be188e460..f54e12aba 100644
--- a/src/main/java/org/perlonjava/operators/Operator.java
+++ b/src/main/java/org/perlonjava/operators/Operator.java
@@ -247,7 +247,7 @@ public static RuntimeList split(RuntimeScalar quotedRegex, RuntimeList args, int
      */
     public static RuntimeScalar substr(int ctx, RuntimeBase... args) {
         String str = args[0].toString();
-        int strLength = str.length();
+        int strLength = str.codePointCount(0, str.length());
 
         int size = args.length;
         int offset = ((RuntimeScalar) args[1]).getInt();
@@ -275,8 +275,10 @@ public static RuntimeScalar substr(int ctx, RuntimeBase... args) {
         // Ensure length is non-negative and within bounds
         length = Math.max(0, Math.min(length, strLength - offset));
 
-        // Extract the substring
-        String result = str.substring(offset, offset + length);
+        // Extract the substring (offset/length are in Unicode code points)
+        int startIndex = str.offsetByCodePoints(0, offset);
+        int endIndex = str.offsetByCodePoints(startIndex, length);
+        String result = str.substring(startIndex, endIndex);
 
         // Return an LValue "RuntimeSubstrLvalue" that can be used to assign to the original string
         // This allows for in-place modification of the original string if needed
diff --git a/src/main/java/org/perlonjava/parser/IdentifierParser.java b/src/main/java/org/perlonjava/parser/IdentifierParser.java
index 9cbe52383..a352f3242 100644
--- a/src/main/java/org/perlonjava/parser/IdentifierParser.java
+++ b/src/main/java/org/perlonjava/parser/IdentifierParser.java
@@ -7,12 +7,24 @@
 import org.perlonjava.perlmodule.Strict;
 import org.perlonjava.runtime.PerlCompilerException;
 
+import java.nio.charset.StandardCharsets;
+
 /**
  * The IdentifierParser class is responsible for parsing complex Perl identifiers
  * from a list of tokens, excluding the sigil (e.g., $, @, %).
  */
 public class IdentifierParser {
 
+    private static boolean isIdentifierTooLong(StringBuilder variableName, boolean isTypeglob) {
+        // perl5_t/t/comp/parser.t builds boundary cases using UTF-8 byte length.
+        // With 4-byte UTF-8 identifier characters, the boundary is 255 * 4 = 1020 bytes.
+        // Perl has a slightly different boundary for typeglob identifiers:
+        //   - $ / @ / % / & / $# contexts: 1020 bytes is already too long
+        //   - * (typeglob) context: 1020 bytes is allowed; only > 1020 is too long
+        int byteLen = variableName.toString().getBytes(StandardCharsets.UTF_8).length;
+        return isTypeglob ? byteLen > 1020 : byteLen >= 1020;
+    }
+
     /**
      * Parses a complex Perl identifier from the list of tokens, excluding the sigil.
      * This method handles identifiers that may be enclosed in braces.
@@ -21,6 +33,10 @@ public class IdentifierParser {
      * @return The parsed identifier as a String, or null if there is no valid identifier.
      */
     public static String parseComplexIdentifier(Parser parser) {
+        return parseComplexIdentifier(parser, false);
+    }
+
+    public static String parseComplexIdentifier(Parser parser, boolean isTypeglob) {
         // Save the current token index to allow backtracking if needed
         int saveIndex = parser.tokenIndex;
 
@@ -52,7 +68,7 @@ public static String parseComplexIdentifier(Parser parser) {
         }
 
         // Parse the identifier using the inner method
-        String identifier = parseComplexIdentifierInner(parser, insideBraces);
+        String identifier = parseComplexIdentifierInner(parser, insideBraces, isTypeglob);
 
         // If an identifier was found, and it was inside braces, ensure the braces are properly closed
         if (identifier != null && insideBraces) {
@@ -109,12 +125,21 @@ private static boolean isSingleQuotePackageSeparator(Parser parser, StringBuilde
      * @return The parsed identifier as a String, or null if there is no valid identifier.
      */
     public static String parseComplexIdentifierInner(Parser parser, boolean insideBraces) {
+        return parseComplexIdentifierInner(parser, insideBraces, false);
+    }
+
+    public static String parseComplexIdentifierInner(Parser parser, boolean insideBraces, boolean isTypeglob) {
+        // Perl allows whitespace between the sigil and the variable name (e.g. "$ a" parses as "$a").
+        // But if whitespace is skipped and the next token is not a valid identifier start (e.g. "$\t = 4"),
+        // the variable name is missing and we should trigger a plain "syntax error".
+        int wsStart = parser.tokenIndex;
         // Skip horizontal whitespace to find the start of the identifier.
         // Do not skip NEWLINE here: "$\n" is not a valid variable name.
         while (parser.tokenIndex < parser.tokens.size()
                 && parser.tokens.get(parser.tokenIndex).type == LexerTokenType.WHITESPACE) {
             parser.tokenIndex++;
         }
+        boolean skippedWhitespace = parser.tokenIndex != wsStart;
 
         boolean isFirstToken = true;
         StringBuilder variableName = new StringBuilder();
@@ -122,6 +147,17 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
         LexerToken token = parser.tokens.get(parser.tokenIndex);
         LexerToken nextToken = parser.tokens.get(parser.tokenIndex + 1);
 
+        if (skippedWhitespace) {
+            // Perl allows "$ a" (whitespace before an identifier). But if whitespace is followed by
+            // something that cannot start an identifier (e.g. "$\t = 4"), Perl reports a syntax error.
+            // Signal "missing variable name" to the caller by returning the empty string.
+            if (token.type != LexerTokenType.IDENTIFIER
+                    && token.type != LexerTokenType.NUMBER
+                    && token.type != LexerTokenType.STRING) {
+                return "";
+            }
+        }
+
         // Special case: Handle ellipsis inside braces - ${...} should be parsed as a block, not as ${.}
         if (insideBraces && token.type == LexerTokenType.OPERATOR && token.text.equals("...")) {
             // Return null to force fallback to block parsing for ellipsis inside braces
@@ -172,7 +208,8 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
 
             // Always reject the Unicode replacement character: it usually indicates an invalid byte sequence.
             // Perl reports these as unrecognized bytes (e.g. \xB6 in comp/parser_run.t test 66).
-            if (cp == 0xFFFD || (mustValidateStart && !valid)) {
+            // Also reject control characters (0x00-0x1F, 0x7F) as identifier starts.
+            if (cp == 0xFFFD || cp < 32 || cp == 127 || (mustValidateStart && !valid)) {
                 String hex;
                 // Special case: if we got the Unicode replacement character (0xFFFD),
                 // it likely means the original was an invalid UTF-8 byte sequence.
@@ -313,10 +350,23 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
                 }
                 if (!(token.type == LexerTokenType.NUMBER)) {
                     // Not ::, not ', and not a number, so this is the end
+                    // Validate STRING tokens to reject control characters
+                    if (token.type == LexerTokenType.STRING) {
+                        String id = token.text;
+                        if (!id.isEmpty()) {
+                            int cp = id.codePointAt(0);
+                            // Reject control characters (0x00-0x1F, 0x7F) and replacement char
+                            if (cp < 32 || cp == 127 || cp == 0xFFFD) {
+                                String hex = cp <= 255 ? String.format("\\x{%02X}", cp) : "\\x{" + Integer.toHexString(cp) + "}";
+                                throw new PerlCompilerException("Unrecognized character " + hex + ";");
+                            }
+                        }
+                    }
+                    
                     variableName.append(token.text);
 
                     // Check identifier length limit (Perl's limit is around 251 characters)
-                    if (variableName.length() > 251) {
+                    if (isIdentifierTooLong(variableName, isTypeglob)) {
                         parser.throwCleanError("Identifier too long");
                     }
 
@@ -328,7 +378,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
                 variableName.append(token.text);
 
                 // Check identifier length limit (Perl's limit is around 251 characters)
-                if (variableName.length() > 251) {
+                if (isIdentifierTooLong(variableName, isTypeglob)) {
                     parser.throwCleanError("Identifier too long");
                 }
 
@@ -368,7 +418,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
                 variableName.append(token.text);
 
                 // Check identifier length limit (Perl's limit is around 251 characters)
-                if (variableName.length() > 251) {
+                if (isIdentifierTooLong(variableName, isTypeglob)) {
                     parser.throwCleanError("Identifier too long");
                 }
 
@@ -518,21 +568,13 @@ static void validateIdentifier(Parser parser, String varName, int startIndex) {
 
         // Check for non-ASCII characters in variable names under 'no utf8'
         if (!parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)) {
-            // Under 'no utf8', check if this is a multi-character identifier with non-ASCII
-            boolean hasNonAscii = false;
-            for (int i = 0; i < varName.length(); i++) {
-                if (varName.charAt(i) > 127) {
-                    hasNonAscii = true;
-                    break;
-                }
-            }
-
-            if (hasNonAscii && varName.length() > 1) {
-                // Multi-character identifier with non-ASCII under 'no utf8' is an error
-                // Reset parser position and throw error
+            // Under 'no utf8', perl5 still accepts valid Unicode identifiers when the source is
+            // already Unicode (e.g. eval() of a UTF-8 string). What must be rejected are invalid
+            // sequences that decode to U+FFFD (replacement character).
+            if (varName.length() > 1 && varName.indexOf('\uFFFD') >= 0) {
                 parser.tokenIndex = startIndex;
-                parser.throwError("Unrecognized character \\x{" +
-                        Integer.toHexString(varName.charAt(varName.length() - 1)) + "}");
+                int lastCp = varName.codePointBefore(varName.length());
+                parser.throwError("Unrecognized character \\x{" + Integer.toHexString(lastCp) + "}");
             }
         }
     }
diff --git a/src/main/java/org/perlonjava/parser/StatementParser.java b/src/main/java/org/perlonjava/parser/StatementParser.java
index fb09e18c5..8f8c75a35 100644
--- a/src/main/java/org/perlonjava/parser/StatementParser.java
+++ b/src/main/java/org/perlonjava/parser/StatementParser.java
@@ -130,6 +130,16 @@ public static Node parseForStatement(Parser parser, String label) {
             parser.parsingForLoopVariable = false;
         }
 
+        // If we didn't parse a loop variable, Perl expects the '(' of the for(..) header next.
+        // When something else appears (e.g. a bare identifier), perl5 reports:
+        //   Missing $ on loop variable ...
+        if (varNode == null) {
+            LexerToken afterVar = TokenUtils.peek(parser);
+            if (!afterVar.text.equals("(")) {
+                parser.throwCleanError("Missing $ on loop variable " + afterVar.text);
+            }
+        }
+
         TokenUtils.consume(parser, LexerTokenType.OPERATOR, "(");
 
         // Parse the initialization part
diff --git a/src/main/java/org/perlonjava/parser/Variable.java b/src/main/java/org/perlonjava/parser/Variable.java
index 0a988e19a..ed2221cc6 100644
--- a/src/main/java/org/perlonjava/parser/Variable.java
+++ b/src/main/java/org/perlonjava/parser/Variable.java
@@ -137,7 +137,7 @@ public static Node parseVariable(Parser parser, String sigil) {
         // Store the current position before parsing the identifier
         int startIndex = parser.tokenIndex;
 
-        String varName = IdentifierParser.parseComplexIdentifier(parser);
+        String varName = IdentifierParser.parseComplexIdentifier(parser, sigil.equals("*"));
         parser.ctx.logDebug("Parsing variable: " + varName);
 
         if (varName != null) {
diff --git a/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java b/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java
index 5775d92f2..a7902f903 100644
--- a/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java
+++ b/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java
@@ -54,7 +54,7 @@ public RuntimeScalar set(RuntimeScalar value) {
 
         String parentValue = lvalue.toString();
         String newValue = this.toString();
-        int strLength = parentValue.length();
+        int strLength = parentValue.codePointCount(0, parentValue.length());
 
         // Calculate the actual offset, handling negative offsets
         int actualOffset = offset < 0 ? strLength + offset : offset;
@@ -83,14 +83,17 @@ public RuntimeScalar set(RuntimeScalar value) {
 
         StringBuilder updatedValue = new StringBuilder(parentValue);
 
+        // Convert code point offsets to UTF-16 indices for StringBuilder operations
+        int startIndex = parentValue.offsetByCodePoints(0, actualOffset);
+        int endIndex = parentValue.offsetByCodePoints(startIndex, actualLength);
+
         // Handle the case where the offset is beyond the current string length
         if (actualOffset >= strLength) {
             // append the new value
             updatedValue.append(newValue);
         } else {
             // Replace the substring with the new value
-            int endIndex = actualOffset + actualLength;
-            updatedValue.replace(actualOffset, endIndex, newValue);
+            updatedValue.replace(startIndex, endIndex, newValue);
         }
 
         // Update the parent RuntimeScalar with the modified string