171 lines
6.0 KiB
JavaScript
171 lines
6.0 KiB
JavaScript
// Constant properties for tracking regex syntax context
|
|
export const Context = Object.freeze({
|
|
DEFAULT: 'DEFAULT',
|
|
CHAR_CLASS: 'CHAR_CLASS',
|
|
});
|
|
|
|
/**
|
|
Replaces all unescaped instances of a regex pattern in the given context, using a replacement
|
|
string or callback.
|
|
|
|
Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used
|
|
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.
|
|
@param {string} expression Search target
|
|
@param {string} needle Search as a regex pattern, with flags `su` applied
|
|
@param {string | (match: RegExpExecArray, details: {
|
|
context: 'DEFAULT' | 'CHAR_CLASS';
|
|
negated: boolean;
|
|
}) => string} replacement
|
|
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
|
|
@returns {string} Updated expression
|
|
@example
|
|
const str = '.\\.\\\\.[[\\.].].';
|
|
replaceUnescaped(str, '\\.', '@');
|
|
// → '@\\.\\\\@[[\\.]@]@'
|
|
replaceUnescaped(str, '\\.', '@', Context.DEFAULT);
|
|
// → '@\\.\\\\@[[\\.].]@'
|
|
replaceUnescaped(str, '\\.', '@', Context.CHAR_CLASS);
|
|
// → '.\\.\\\\.[[\\.]@].'
|
|
*/
|
|
export function replaceUnescaped(expression, needle, replacement, context) {
|
|
const re = new RegExp(String.raw`${needle}|(?<$skip>\[\^?|\\?.)`, 'gsu');
|
|
const negated = [false];
|
|
let numCharClassesOpen = 0;
|
|
let result = '';
|
|
for (const match of expression.matchAll(re)) {
|
|
const {0: m, groups: {$skip}} = match;
|
|
if (!$skip && (!context || (context === Context.DEFAULT) === !numCharClassesOpen)) {
|
|
if (replacement instanceof Function) {
|
|
result += replacement(match, {
|
|
context: numCharClassesOpen ? Context.CHAR_CLASS : Context.DEFAULT,
|
|
negated: negated[negated.length - 1],
|
|
});
|
|
} else {
|
|
result += replacement;
|
|
}
|
|
continue;
|
|
}
|
|
if (m[0] === '[') {
|
|
numCharClassesOpen++;
|
|
negated.push(m[1] === '^');
|
|
} else if (m === ']' && numCharClassesOpen) {
|
|
numCharClassesOpen--;
|
|
negated.pop();
|
|
}
|
|
result += m;
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/**
|
|
Runs a callback for each unescaped instance of a regex pattern in the given context.
|
|
|
|
Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used
|
|
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.
|
|
@param {string} expression Search target
|
|
@param {string} needle Search as a regex pattern, with flags `su` applied
|
|
@param {(match: RegExpExecArray, details: {
|
|
context: 'DEFAULT' | 'CHAR_CLASS';
|
|
negated: boolean;
|
|
}) => void} callback
|
|
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
|
|
*/
|
|
export function forEachUnescaped(expression, needle, callback, context) {
|
|
// Do this the easy way
|
|
replaceUnescaped(expression, needle, callback, context);
|
|
}
|
|
|
|
/**
|
|
Returns a match object for the first unescaped instance of a regex pattern in the given context, or
|
|
`null`.
|
|
|
|
Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used
|
|
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.
|
|
@param {string} expression Search target
|
|
@param {string} needle Search as a regex pattern, with flags `su` applied
|
|
@param {number} [pos] Offset to start the search
|
|
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
|
|
@returns {RegExpExecArray | null}
|
|
*/
|
|
export function execUnescaped(expression, needle, pos = 0, context) {
|
|
// Quick partial test; avoid the loop if not needed
|
|
if (!(new RegExp(needle, 'su').test(expression))) {
|
|
return null;
|
|
}
|
|
const re = new RegExp(`${needle}|(?<$skip>\\\\?.)`, 'gsu');
|
|
re.lastIndex = pos;
|
|
let numCharClassesOpen = 0;
|
|
let match;
|
|
while (match = re.exec(expression)) {
|
|
const {0: m, groups: {$skip}} = match;
|
|
if (!$skip && (!context || (context === Context.DEFAULT) === !numCharClassesOpen)) {
|
|
return match;
|
|
}
|
|
if (m === '[') {
|
|
numCharClassesOpen++;
|
|
} else if (m === ']' && numCharClassesOpen) {
|
|
numCharClassesOpen--;
|
|
}
|
|
// Avoid an infinite loop on zero-length matches
|
|
if (re.lastIndex == match.index) {
|
|
re.lastIndex++;
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
/**
|
|
Checks whether an unescaped instance of a regex pattern appears in the given context.
|
|
|
|
Doesn't skip over complete multicharacter tokens (only `\` plus its folowing char) so must be used
|
|
with knowledge of what's safe to do given regex syntax. Assumes UnicodeSets-mode syntax.
|
|
@param {string} expression Search target
|
|
@param {string} needle Search as a regex pattern, with flags `su` applied
|
|
@param {'DEFAULT' | 'CHAR_CLASS'} [context] All contexts if not specified
|
|
@returns {boolean} Whether the pattern was found
|
|
*/
|
|
export function hasUnescaped(expression, needle, context) {
|
|
// Do this the easy way
|
|
return !!execUnescaped(expression, needle, 0, context);
|
|
}
|
|
|
|
/**
|
|
Extracts the full contents of a group (subpattern) from the given expression, accounting for
|
|
escaped characters, nested groups, and character classes. The group is identified by the position
|
|
where its contents start (the string index just after the group's opening delimiter). Returns the
|
|
rest of the string if the group is unclosed.
|
|
|
|
Assumes UnicodeSets-mode syntax.
|
|
@param {string} expression Search target
|
|
@param {number} contentsStartPos
|
|
@returns {string}
|
|
*/
|
|
export function getGroupContents(expression, contentsStartPos) {
|
|
const token = /\\?./gsu;
|
|
token.lastIndex = contentsStartPos;
|
|
let contentsEndPos = expression.length;
|
|
let numCharClassesOpen = 0;
|
|
// Starting search within an open group, after the group's opening
|
|
let numGroupsOpen = 1;
|
|
let match;
|
|
while (match = token.exec(expression)) {
|
|
const [m] = match;
|
|
if (m === '[') {
|
|
numCharClassesOpen++;
|
|
} else if (!numCharClassesOpen) {
|
|
if (m === '(') {
|
|
numGroupsOpen++;
|
|
} else if (m === ')') {
|
|
numGroupsOpen--;
|
|
if (!numGroupsOpen) {
|
|
contentsEndPos = match.index;
|
|
break;
|
|
}
|
|
}
|
|
} else if (m === ']') {
|
|
numCharClassesOpen--;
|
|
}
|
|
}
|
|
return expression.slice(contentsStartPos, contentsEndPos);
|
|
}
|