From d51b7e082b73db745a1f8321e65b086902d80d80 Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Thu, 17 Nov 2022 08:24:55 -0500 Subject: [PATCH] Improve extraction of tokens from regexes Fixed flawed extraction of tokens with optional sequences, i.e. when quantifier could be zero. Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/2367 Ignore look-around sequences as suggested when normalizing into tokenizable string. Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/2368 Fix regex analyzer throwing with trailing `-` in character class sequence. Related issue: - https://github.com/AdguardTeam/AdguardFilters/pull/134630 --- src/js/static-filtering-parser.js | 44 ++++++++++++++++++++++++------- src/lib/regexanalyzer/CHANGES.md | 0 src/lib/regexanalyzer/regex.js | 6 +++++ 3 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 src/lib/regexanalyzer/CHANGES.md diff --git a/src/js/static-filtering-parser.js b/src/js/static-filtering-parser.js index e62d198d8..56bc7e21b 100644 --- a/src/js/static-filtering-parser.js +++ b/src/js/static-filtering-parser.js @@ -3009,11 +3009,11 @@ Parser.utils = Parser.prototype.utils = (( ) => { class regex { static firstCharCodeClass(s) { - return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0; + return /^[\x01\x03%0-9A-Za-z]/.test(s) ? 1 : 0; } static lastCharCodeClass(s) { - return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0; + return /[\x01\x03%0-9A-Za-z]$/.test(s) ? 1 : 0; } static tokenizableStrFromNode(node) { @@ -3042,18 +3042,24 @@ Parser.utils = Parser.prototype.utils = (( ) => { return String.fromCharCode(firstChar, lastChar); } case 4: /* T_GROUP, 'Group' */ { - if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; } - if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; } + if ( + node.flags.LookAhead === 1 || + node.flags.NegativeLookAhead === 1 || + node.flags.LookBehind === 1 || + node.flags.NegativeLookBehind === 1 + ) { + return ''; + } return this.tokenizableStrFromNode(node.val); } case 16: /* T_QUANTIFIER, 'Quantifier' */ { const s = this.tokenizableStrFromNode(node.val); const first = this.firstCharCodeClass(s); const last = this.lastCharCodeClass(s); - if ( node.flags.min === 0 && first === 0 && last === 0 ) { - return ''; + if ( node.flags.min !== 0 ) { + return String.fromCharCode(first, last); } - return String.fromCharCode(first, last); + return String.fromCharCode(first+2, last+2); } case 64: /* T_HEXCHAR, 'HexChar' */ { return String.fromCharCode(parseInt(node.val.slice(1), 16)); @@ -3142,13 +3148,33 @@ Parser.utils = Parser.prototype.utils = (( ) => { static toTokenizableStr(reStr) { if ( regexAnalyzer === null ) { return ''; } + let s = ''; try { - return this.tokenizableStrFromNode( + s = this.tokenizableStrFromNode( regexAnalyzer(reStr, false).tree() ); } catch(ex) { } - return ''; + // Process optional sequences + const reOptional = /[\x02\x03]+/g; + for (;;) { + const match = reOptional.exec(s); + if ( match === null ) { break; } + const left = s.slice(0, match.index); + const middle = match[0]; + const right = s.slice(match.index + middle.length); + s = left; + s += this.firstCharCodeClass(right) === 1 || + this.firstCharCodeClass(middle) === 1 + ? '\x01' + : '\x00'; + s += this.lastCharCodeClass(left) === 1 || + this.lastCharCodeClass(middle) === 1 + ? '\x01' + : '\x00'; + s += right; + } + return s; } } diff --git a/src/lib/regexanalyzer/CHANGES.md b/src/lib/regexanalyzer/CHANGES.md new file mode 100644 index 000000000..e69de29bb diff --git a/src/lib/regexanalyzer/regex.js b/src/lib/regexanalyzer/regex.js index 0d206f2c7..5931d20fd 100644 --- a/src/lib/regexanalyzer/regex.js +++ b/src/lib/regexanalyzer/regex.js @@ -1019,6 +1019,12 @@ var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); }, } } + if ( isRange && escaped === false && ']' === ch ) + { + isRange = false; + chars.push('-'); + } + if ( isRange ) { if ( chars.length )