Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions src/main/java/org/perlonjava/codegen/EmitVariable.java
Original file line number Diff line number Diff line change
Expand Up @@ -327,10 +327,12 @@ static void handleVariableOperator(EmitterVisitor emitterVisitor, OperatorNode n
// ===== STRICT VARS LOGIC =====
// Determine if this variable should be allowed under 'use strict "vars"'

// Special case: $a and $b in main:: package are exempt from strict
// Special case: $a and $b are exempt from strict
// (they're used by sort() without declaration)
String normalizedName = NameNormalizer.normalizeVariableName(name, emitterVisitor.ctx.symbolTable.getCurrentPackage());
boolean isSpecialSortVar = sigil.equals("$") && ("main::a".equals(normalizedName) || "main::b".equals(normalizedName));
boolean isSpecialSortVar = sigil.equals("$")
&& !name.contains("::")
&& (name.equals("a") || name.equals("b"));

boolean allowIfAlreadyExists = false;
if (emitterVisitor.ctx.symbolTable.isStrictOptionEnabled(HINT_STRICT_VARS)) {
Expand All @@ -341,6 +343,18 @@ static void handleVariableOperator(EmitterVisitor emitterVisitor, OperatorNode n
} else if (sigil.equals("%") && !normalizedName.endsWith("::")) {
allowIfAlreadyExists = GlobalVariable.existsGlobalHash(normalizedName);
}

// Perl's strict 'vars' requires declaration for unqualified globals like $A
// even if they were previously created under 'no strict'.
// Keep this narrow to avoid changing behavior for other globals.
if (sigil.equals("$")
&& name != null
&& name.length() == 1
&& Character.isLetter(name.charAt(0))
&& !name.contains("::")
&& !isSpecialSortVar) {
allowIfAlreadyExists = false;
}
}

// Compute createIfNotExists flag - determines if variable can be auto-vivified
Expand Down
212 changes: 117 additions & 95 deletions src/main/java/org/perlonjava/lexer/Lexer.java

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions src/main/java/org/perlonjava/operators/Operator.java
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ public static RuntimeList split(RuntimeScalar quotedRegex, RuntimeList args, int
*/
public static RuntimeScalar substr(int ctx, RuntimeBase... args) {
String str = args[0].toString();
int strLength = str.length();
int strLength = str.codePointCount(0, str.length());

int size = args.length;
int offset = ((RuntimeScalar) args[1]).getInt();
Expand Down Expand Up @@ -275,8 +275,10 @@ public static RuntimeScalar substr(int ctx, RuntimeBase... args) {
// Ensure length is non-negative and within bounds
length = Math.max(0, Math.min(length, strLength - offset));

// Extract the substring
String result = str.substring(offset, offset + length);
// Extract the substring (offset/length are in Unicode code points)
int startIndex = str.offsetByCodePoints(0, offset);
int endIndex = str.offsetByCodePoints(startIndex, length);
String result = str.substring(startIndex, endIndex);

// Return an LValue "RuntimeSubstrLvalue" that can be used to assign to the original string
// This allows for in-place modification of the original string if needed
Expand Down
80 changes: 61 additions & 19 deletions src/main/java/org/perlonjava/parser/IdentifierParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,24 @@
import org.perlonjava.perlmodule.Strict;
import org.perlonjava.runtime.PerlCompilerException;

import java.nio.charset.StandardCharsets;

/**
* The IdentifierParser class is responsible for parsing complex Perl identifiers
* from a list of tokens, excluding the sigil (e.g., $, @, %).
*/
public class IdentifierParser {

private static boolean isIdentifierTooLong(StringBuilder variableName, boolean isTypeglob) {
// perl5_t/t/comp/parser.t builds boundary cases using UTF-8 byte length.
// With 4-byte UTF-8 identifier characters, the boundary is 255 * 4 = 1020 bytes.
// Perl has a slightly different boundary for typeglob identifiers:
// - $ / @ / % / & / $# contexts: 1020 bytes is already too long
// - * (typeglob) context: 1020 bytes is allowed; only > 1020 is too long
int byteLen = variableName.toString().getBytes(StandardCharsets.UTF_8).length;
return isTypeglob ? byteLen > 1020 : byteLen >= 1020;
}

/**
* Parses a complex Perl identifier from the list of tokens, excluding the sigil.
* This method handles identifiers that may be enclosed in braces.
Expand All @@ -21,6 +33,10 @@ public class IdentifierParser {
* @return The parsed identifier as a String, or null if there is no valid identifier.
*/
public static String parseComplexIdentifier(Parser parser) {
return parseComplexIdentifier(parser, false);
}

public static String parseComplexIdentifier(Parser parser, boolean isTypeglob) {
// Save the current token index to allow backtracking if needed
int saveIndex = parser.tokenIndex;

Expand Down Expand Up @@ -52,7 +68,7 @@ public static String parseComplexIdentifier(Parser parser) {
}

// Parse the identifier using the inner method
String identifier = parseComplexIdentifierInner(parser, insideBraces);
String identifier = parseComplexIdentifierInner(parser, insideBraces, isTypeglob);

// If an identifier was found, and it was inside braces, ensure the braces are properly closed
if (identifier != null && insideBraces) {
Expand Down Expand Up @@ -109,19 +125,39 @@ private static boolean isSingleQuotePackageSeparator(Parser parser, StringBuilde
* @return The parsed identifier as a String, or null if there is no valid identifier.
*/
public static String parseComplexIdentifierInner(Parser parser, boolean insideBraces) {
return parseComplexIdentifierInner(parser, insideBraces, false);
}

public static String parseComplexIdentifierInner(Parser parser, boolean insideBraces, boolean isTypeglob) {
// Perl allows whitespace between the sigil and the variable name (e.g. "$ a" parses as "$a").
// But if whitespace is skipped and the next token is not a valid identifier start (e.g. "$\t = 4"),
// the variable name is missing and we should trigger a plain "syntax error".
int wsStart = parser.tokenIndex;
// Skip horizontal whitespace to find the start of the identifier.
// Do not skip NEWLINE here: "$\n" is not a valid variable name.
while (parser.tokenIndex < parser.tokens.size()
&& parser.tokens.get(parser.tokenIndex).type == LexerTokenType.WHITESPACE) {
parser.tokenIndex++;
}
boolean skippedWhitespace = parser.tokenIndex != wsStart;

boolean isFirstToken = true;
StringBuilder variableName = new StringBuilder();

LexerToken token = parser.tokens.get(parser.tokenIndex);
LexerToken nextToken = parser.tokens.get(parser.tokenIndex + 1);

if (skippedWhitespace) {
// Perl allows "$ a" (whitespace before an identifier). But if whitespace is followed by
// something that cannot start an identifier (e.g. "$\t = 4"), Perl reports a syntax error.
// Signal "missing variable name" to the caller by returning the empty string.
if (token.type != LexerTokenType.IDENTIFIER
&& token.type != LexerTokenType.NUMBER
&& token.type != LexerTokenType.STRING) {
return "";
}
}

// Special case: Handle ellipsis inside braces - ${...} should be parsed as a block, not as ${.}
if (insideBraces && token.type == LexerTokenType.OPERATOR && token.text.equals("...")) {
// Return null to force fallback to block parsing for ellipsis inside braces
Expand Down Expand Up @@ -172,7 +208,8 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr

// Always reject the Unicode replacement character: it usually indicates an invalid byte sequence.
// Perl reports these as unrecognized bytes (e.g. \xB6 in comp/parser_run.t test 66).
if (cp == 0xFFFD || (mustValidateStart && !valid)) {
// Also reject control characters (0x00-0x1F, 0x7F) as identifier starts.
if (cp == 0xFFFD || cp < 32 || cp == 127 || (mustValidateStart && !valid)) {
String hex;
// Special case: if we got the Unicode replacement character (0xFFFD),
// it likely means the original was an invalid UTF-8 byte sequence.
Expand Down Expand Up @@ -313,10 +350,23 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
}
if (!(token.type == LexerTokenType.NUMBER)) {
// Not ::, not ', and not a number, so this is the end
// Validate STRING tokens to reject control characters
if (token.type == LexerTokenType.STRING) {
String id = token.text;
if (!id.isEmpty()) {
int cp = id.codePointAt(0);
// Reject control characters (0x00-0x1F, 0x7F) and replacement char
if (cp < 32 || cp == 127 || cp == 0xFFFD) {
String hex = cp <= 255 ? String.format("\\x{%02X}", cp) : "\\x{" + Integer.toHexString(cp) + "}";
throw new PerlCompilerException("Unrecognized character " + hex + ";");
}
}
}

variableName.append(token.text);

// Check identifier length limit (Perl's limit is around 251 characters)
if (variableName.length() > 251) {
if (isIdentifierTooLong(variableName, isTypeglob)) {
parser.throwCleanError("Identifier too long");
}

Expand All @@ -328,7 +378,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
variableName.append(token.text);

// Check identifier length limit (Perl's limit is around 251 characters)
if (variableName.length() > 251) {
if (isIdentifierTooLong(variableName, isTypeglob)) {
parser.throwCleanError("Identifier too long");
}

Expand Down Expand Up @@ -368,7 +418,7 @@ public static String parseComplexIdentifierInner(Parser parser, boolean insideBr
variableName.append(token.text);

// Check identifier length limit (Perl's limit is around 251 characters)
if (variableName.length() > 251) {
if (isIdentifierTooLong(variableName, isTypeglob)) {
parser.throwCleanError("Identifier too long");
}

Expand Down Expand Up @@ -518,21 +568,13 @@ static void validateIdentifier(Parser parser, String varName, int startIndex) {

// Check for non-ASCII characters in variable names under 'no utf8'
if (!parser.ctx.symbolTable.isStrictOptionEnabled(Strict.HINT_UTF8)) {
// Under 'no utf8', check if this is a multi-character identifier with non-ASCII
boolean hasNonAscii = false;
for (int i = 0; i < varName.length(); i++) {
if (varName.charAt(i) > 127) {
hasNonAscii = true;
break;
}
}

if (hasNonAscii && varName.length() > 1) {
// Multi-character identifier with non-ASCII under 'no utf8' is an error
// Reset parser position and throw error
// Under 'no utf8', perl5 still accepts valid Unicode identifiers when the source is
// already Unicode (e.g. eval() of a UTF-8 string). What must be rejected are invalid
// sequences that decode to U+FFFD (replacement character).
if (varName.length() > 1 && varName.indexOf('\uFFFD') >= 0) {
parser.tokenIndex = startIndex;
parser.throwError("Unrecognized character \\x{" +
Integer.toHexString(varName.charAt(varName.length() - 1)) + "}");
int lastCp = varName.codePointBefore(varName.length());
parser.throwError("Unrecognized character \\x{" + Integer.toHexString(lastCp) + "}");
}
}
}
Expand Down
10 changes: 10 additions & 0 deletions src/main/java/org/perlonjava/parser/StatementParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,16 @@ public static Node parseForStatement(Parser parser, String label) {
parser.parsingForLoopVariable = false;
}

// If we didn't parse a loop variable, Perl expects the '(' of the for(..) header next.
// When something else appears (e.g. a bare identifier), perl5 reports:
// Missing $ on loop variable ...
if (varNode == null) {
LexerToken afterVar = TokenUtils.peek(parser);
if (!afterVar.text.equals("(")) {
parser.throwCleanError("Missing $ on loop variable " + afterVar.text);
}
}

TokenUtils.consume(parser, LexerTokenType.OPERATOR, "(");

// Parse the initialization part
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/perlonjava/parser/Variable.java
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ public static Node parseVariable(Parser parser, String sigil) {
// Store the current position before parsing the identifier
int startIndex = parser.tokenIndex;

String varName = IdentifierParser.parseComplexIdentifier(parser);
String varName = IdentifierParser.parseComplexIdentifier(parser, sigil.equals("*"));
parser.ctx.logDebug("Parsing variable: " + varName);

if (varName != null) {
Expand Down
9 changes: 6 additions & 3 deletions src/main/java/org/perlonjava/runtime/RuntimeSubstrLvalue.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ public RuntimeScalar set(RuntimeScalar value) {

String parentValue = lvalue.toString();
String newValue = this.toString();
int strLength = parentValue.length();
int strLength = parentValue.codePointCount(0, parentValue.length());

// Calculate the actual offset, handling negative offsets
int actualOffset = offset < 0 ? strLength + offset : offset;
Expand Down Expand Up @@ -83,14 +83,17 @@ public RuntimeScalar set(RuntimeScalar value) {

StringBuilder updatedValue = new StringBuilder(parentValue);

// Convert code point offsets to UTF-16 indices for StringBuilder operations
int startIndex = parentValue.offsetByCodePoints(0, actualOffset);
int endIndex = parentValue.offsetByCodePoints(startIndex, actualLength);

// Handle the case where the offset is beyond the current string length
if (actualOffset >= strLength) {
// append the new value
updatedValue.append(newValue);
} else {
// Replace the substring with the new value
int endIndex = actualOffset + actualLength;
updatedValue.replace(actualOffset, endIndex, newValue);
updatedValue.replace(startIndex, endIndex, newValue);
}

// Update the parent RuntimeScalar with the modified string
Expand Down