Improve extraction of tokens from regexes

Fixed flawed extraction of tokens from optional sequences, i.e.
when a quantifier's minimum count could be zero.
Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/2367

Ignore look-around sequences, as suggested, when normalizing a
regex into a tokenizable string.
Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/2368

Fix regex analyzer throwing when a character class ends with a
trailing `-`.
Related issue:
- https://github.com/AdguardTeam/AdguardFilters/pull/134630
This commit is contained in:
Raymond Hill 2022-11-17 08:24:55 -05:00
parent 560c1fe888
commit d51b7e082b
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
3 changed files with 41 additions and 9 deletions

View File

@ -3009,11 +3009,11 @@ Parser.utils = Parser.prototype.utils = (( ) => {
class regex { class regex {
static firstCharCodeClass(s) { static firstCharCodeClass(s) {
return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0; return /^[\x01\x03%0-9A-Za-z]/.test(s) ? 1 : 0;
} }
static lastCharCodeClass(s) { static lastCharCodeClass(s) {
return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0; return /[\x01\x03%0-9A-Za-z]$/.test(s) ? 1 : 0;
} }
static tokenizableStrFromNode(node) { static tokenizableStrFromNode(node) {
@ -3042,18 +3042,24 @@ Parser.utils = Parser.prototype.utils = (( ) => {
return String.fromCharCode(firstChar, lastChar); return String.fromCharCode(firstChar, lastChar);
} }
case 4: /* T_GROUP, 'Group' */ { case 4: /* T_GROUP, 'Group' */ {
if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; } if (
if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; } node.flags.LookAhead === 1 ||
node.flags.NegativeLookAhead === 1 ||
node.flags.LookBehind === 1 ||
node.flags.NegativeLookBehind === 1
) {
return '';
}
return this.tokenizableStrFromNode(node.val); return this.tokenizableStrFromNode(node.val);
} }
case 16: /* T_QUANTIFIER, 'Quantifier' */ { case 16: /* T_QUANTIFIER, 'Quantifier' */ {
const s = this.tokenizableStrFromNode(node.val); const s = this.tokenizableStrFromNode(node.val);
const first = this.firstCharCodeClass(s); const first = this.firstCharCodeClass(s);
const last = this.lastCharCodeClass(s); const last = this.lastCharCodeClass(s);
if ( node.flags.min === 0 && first === 0 && last === 0 ) { if ( node.flags.min !== 0 ) {
return ''; return String.fromCharCode(first, last);
} }
return String.fromCharCode(first, last); return String.fromCharCode(first+2, last+2);
} }
case 64: /* T_HEXCHAR, 'HexChar' */ { case 64: /* T_HEXCHAR, 'HexChar' */ {
return String.fromCharCode(parseInt(node.val.slice(1), 16)); return String.fromCharCode(parseInt(node.val.slice(1), 16));
@ -3142,13 +3148,33 @@ Parser.utils = Parser.prototype.utils = (( ) => {
static toTokenizableStr(reStr) { static toTokenizableStr(reStr) {
if ( regexAnalyzer === null ) { return ''; } if ( regexAnalyzer === null ) { return ''; }
let s = '';
try { try {
return this.tokenizableStrFromNode( s = this.tokenizableStrFromNode(
regexAnalyzer(reStr, false).tree() regexAnalyzer(reStr, false).tree()
); );
} catch(ex) { } catch(ex) {
} }
return ''; // Process optional sequences
const reOptional = /[\x02\x03]+/g;
for (;;) {
const match = reOptional.exec(s);
if ( match === null ) { break; }
const left = s.slice(0, match.index);
const middle = match[0];
const right = s.slice(match.index + middle.length);
s = left;
s += this.firstCharCodeClass(right) === 1 ||
this.firstCharCodeClass(middle) === 1
? '\x01'
: '\x00';
s += this.lastCharCodeClass(left) === 1 ||
this.lastCharCodeClass(middle) === 1
? '\x01'
: '\x00';
s += right;
}
return s;
} }
} }

View File

View File

@ -1019,6 +1019,12 @@ var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); },
} }
} }
if ( isRange && escaped === false && ']' === ch )
{
isRange = false;
chars.push('-');
}
if ( isRange ) if ( isRange )
{ {
if ( chars.length ) if ( chars.length )