Improve extraction of tokens from regexes

Fix flawed extraction of tokens from optional sequences, i.e.
when a quantifier's minimum could be zero.
Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/2367

As suggested, ignore look-around sequences when normalizing a
regex into a tokenizable string.
Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/2368

Fix the regex analyzer throwing on a trailing `-` in a character
class sequence.
Related issue:
- https://github.com/AdguardTeam/AdguardFilters/pull/134630
This commit is contained in:
Raymond Hill 2022-11-17 08:24:55 -05:00
parent 560c1fe888
commit d51b7e082b
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
3 changed files with 41 additions and 9 deletions

View File

@ -3009,11 +3009,11 @@ Parser.utils = Parser.prototype.utils = (( ) => {
class regex {
static firstCharCodeClass(s) {
return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0;
return /^[\x01\x03%0-9A-Za-z]/.test(s) ? 1 : 0;
}
static lastCharCodeClass(s) {
return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0;
return /[\x01\x03%0-9A-Za-z]$/.test(s) ? 1 : 0;
}
static tokenizableStrFromNode(node) {
@ -3042,18 +3042,24 @@ Parser.utils = Parser.prototype.utils = (( ) => {
return String.fromCharCode(firstChar, lastChar);
}
case 4: /* T_GROUP, 'Group' */ {
if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; }
if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; }
if (
node.flags.LookAhead === 1 ||
node.flags.NegativeLookAhead === 1 ||
node.flags.LookBehind === 1 ||
node.flags.NegativeLookBehind === 1
) {
return '';
}
return this.tokenizableStrFromNode(node.val);
}
case 16: /* T_QUANTIFIER, 'Quantifier' */ {
const s = this.tokenizableStrFromNode(node.val);
const first = this.firstCharCodeClass(s);
const last = this.lastCharCodeClass(s);
if ( node.flags.min === 0 && first === 0 && last === 0 ) {
return '';
if ( node.flags.min !== 0 ) {
return String.fromCharCode(first, last);
}
return String.fromCharCode(first, last);
return String.fromCharCode(first+2, last+2);
}
case 64: /* T_HEXCHAR, 'HexChar' */ {
return String.fromCharCode(parseInt(node.val.slice(1), 16));
@ -3142,13 +3148,33 @@ Parser.utils = Parser.prototype.utils = (( ) => {
static toTokenizableStr(reStr) {
if ( regexAnalyzer === null ) { return ''; }
let s = '';
try {
return this.tokenizableStrFromNode(
s = this.tokenizableStrFromNode(
regexAnalyzer(reStr, false).tree()
);
} catch(ex) {
}
return '';
// Process optional sequences
const reOptional = /[\x02\x03]+/g;
for (;;) {
const match = reOptional.exec(s);
if ( match === null ) { break; }
const left = s.slice(0, match.index);
const middle = match[0];
const right = s.slice(match.index + middle.length);
s = left;
s += this.firstCharCodeClass(right) === 1 ||
this.firstCharCodeClass(middle) === 1
? '\x01'
: '\x00';
s += this.lastCharCodeClass(left) === 1 ||
this.lastCharCodeClass(middle) === 1
? '\x01'
: '\x00';
s += right;
}
return s;
}
}

View File

View File

@ -1019,6 +1019,12 @@ var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); },
}
}
if ( isRange && escaped === false && ']' === ch )
{
isRange = false;
chars.push('-');
}
if ( isRange )
{
if ( chars.length )