Improve extraction of tokens from regexes

Fixed flawed extraction of tokens with optional sequences, i.e. when quantifier could be zero. Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/2367 Ignore look-around sequences as suggested when normalizing into tokenizable string. Related issue: - https://github.com/uBlockOrigin/uBlock-issues/issues/2368 Fix regex analyzer throwing with trailing `-` in character class sequence. Related issue: - https://github.com/AdguardTeam/AdguardFilters/pull/134630
2022-11-17 08:24:55 -05:00 · 2022-11-17 08:24:55 -05:00 · d51b7e082b
parent 560c1fe888
commit d51b7e082b
3 changed files with 41 additions and 9 deletions
--- a/src/js/static-filtering-parser.js
+++ b/src/js/static-filtering-parser.js
@ -3009,11 +3009,11 @@ Parser.utils = Parser.prototype.utils = (( ) => {
    class regex {
        static firstCharCodeClass(s) {
-            return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0;
+            return /^[\x01\x03%0-9A-Za-z]/.test(s) ? 1 : 0;
        }
        static lastCharCodeClass(s) {
-            return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0;
+            return /[\x01\x03%0-9A-Za-z]$/.test(s) ? 1 : 0;
        }
        static tokenizableStrFromNode(node) {
@ -3042,18 +3042,24 @@ Parser.utils = Parser.prototype.utils = (( ) => {
                return String.fromCharCode(firstChar, lastChar);
            }
            case 4: /* T_GROUP, 'Group' */ {
-                if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; }
+                if (
-                if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; }
+                    node.flags.LookAhead === 1 ||
                    node.flags.NegativeLookAhead === 1 ||
                    node.flags.LookBehind === 1 ||
                    node.flags.NegativeLookBehind === 1
                ) {
                    return '';
                }
                return this.tokenizableStrFromNode(node.val);
            }
            case 16: /* T_QUANTIFIER, 'Quantifier' */ {
                const s = this.tokenizableStrFromNode(node.val);
                const first = this.firstCharCodeClass(s);
                const last = this.lastCharCodeClass(s);
-                if ( node.flags.min === 0 && first === 0 && last === 0 ) {
+                if ( node.flags.min !== 0 ) {
-                    return '';
+                    return String.fromCharCode(first, last);
                }
-                return String.fromCharCode(first, last);
+                return String.fromCharCode(first+2, last+2);
            }
            case 64: /* T_HEXCHAR, 'HexChar' */ {
                return String.fromCharCode(parseInt(node.val.slice(1), 16));
@ -3142,13 +3148,33 @@ Parser.utils = Parser.prototype.utils = (( ) => {
        static toTokenizableStr(reStr) {
            if ( regexAnalyzer === null ) { return ''; }
            let s = '';
            try {
-                return this.tokenizableStrFromNode(
+                s = this.tokenizableStrFromNode(
                    regexAnalyzer(reStr, false).tree()
                );
            } catch(ex) {
            }
-            return '';
+            // Process optional sequences
            const reOptional = /[\x02\x03]+/g;
            for (;;) {
                const match = reOptional.exec(s);
                if ( match === null ) { break; }
                const left = s.slice(0, match.index);
                const middle = match[0];
                const right = s.slice(match.index + middle.length);
                s = left;
                s += this.firstCharCodeClass(right) === 1 ||
                        this.firstCharCodeClass(middle) === 1
                    ? '\x01'
                    : '\x00';
                s += this.lastCharCodeClass(left) === 1 ||
                        this.lastCharCodeClass(middle) === 1
                    ? '\x01'
                    : '\x00';
                s += right;
            }
            return s;
        }
    }
--- a/src/lib/regexanalyzer/CHANGES.md
+++ b/src/lib/regexanalyzer/CHANGES.md
--- a/src/lib/regexanalyzer/regex.js
+++ b/src/lib/regexanalyzer/regex.js
@ -1019,6 +1019,12 @@ var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); },
                }
            }
            if ( isRange && escaped === false && ']' === ch )
            {
                isRange = false;
                chars.push('-');
            }
            if ( isRange )
            {
                if ( chars.length )