From b1de8d3fe48755da58268ba86dccd5d76940b613 Mon Sep 17 00:00:00 2001 From: Raymond Hill Date: Mon, 30 Jan 2023 17:00:26 -0500 Subject: [PATCH] Add support for regex-based values for `domain=`/`from=`/`to=` options Related discussion: - https://github.com/uBlockOrigin/uBlock-issues/discussions/2234 Example of usage: @@*$ghide,domain=/img[a-z]{3,5}\.buzz/ Regex-based domain values can be negated just like plain or entity-based values: *$domain=~/regex.../ This new syntax does not apply to static extended filters. --- src/js/background.js | 4 +- src/js/static-filtering-parser.js | 49 +++++++++--- src/js/static-net-filtering.js | 124 ++++++++++++++++++++++++++---- src/js/storage.js | 9 ++- 4 files changed, 158 insertions(+), 28 deletions(-) diff --git a/src/js/background.js b/src/js/background.js index aa6c43f07..77e042462 100644 --- a/src/js/background.js +++ b/src/js/background.js @@ -176,8 +176,8 @@ const µBlock = { // jshint ignore:line // Read-only systemSettings: { - compiledMagic: 54, // Increase when compiled format changes - selfieMagic: 54, // Increase when selfie format changes + compiledMagic: 55, // Increase when compiled format changes + selfieMagic: 55, // Increase when selfie format changes }, // https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501 diff --git a/src/js/static-filtering-parser.js b/src/js/static-filtering-parser.js index 9581e6ac9..63bd112b2 100644 --- a/src/js/static-filtering-parser.js +++ b/src/js/static-filtering-parser.js @@ -1620,11 +1620,11 @@ export class AstFilterParser { ); switch ( nodeOptionType ) { case NODE_TYPE_NET_OPTION_NAME_DENYALLOW: - this.linkDown(next, this.parseDomainList(next, '|'), 0b0000); + this.linkDown(next, this.parseDomainList(next, '|'), 0b00000); break; case NODE_TYPE_NET_OPTION_NAME_FROM: case NODE_TYPE_NET_OPTION_NAME_TO: - this.linkDown(next, this.parseDomainList(next, '|', 0b1010)); + this.linkDown(next, this.parseDomainList(next, '|', 0b11010)); break; default: break; @@ 
-1642,7 +1642,7 @@ export class AstFilterParser { return this.getNodeTransform(valueNode); } - parseDomainList(parent, separator, mode = 0b0000) { + parseDomainList(parent, separator, mode = 0b00000) { const parentBeg = this.nodes[parent+NODE_BEG_INDEX]; const parentEnd = this.nodes[parent+NODE_END_INDEX]; const containerNode = this.allocTypedNode( @@ -1668,9 +1668,7 @@ export class AstFilterParser { end = s.indexOf(separator, beg); } else { end = s.indexOf('/', beg+1); - end = end !== -1 - ? s.indexOf(separator, end+1) - : s.indexOf(separator, beg); + end = s.indexOf(separator, end !== -1 ? end+1 : beg); } if ( end === -1 ) { end = listEnd; } if ( end !== beg ) { @@ -1683,8 +1681,9 @@ export class AstFilterParser { prev = this.linkRight(prev, domainNode); } else { domainNode = 0; - if ( this.interactive && separatorNode !== 0 ) { + if ( separatorNode !== 0 ) { this.addNodeFlags(separatorNode, NODE_FLAG_ERROR); + this.addFlags(AST_FLAG_HAS_ERROR); } } if ( s.charCodeAt(end) === separatorCode ) { @@ -1696,14 +1695,20 @@ export class AstFilterParser { parentBeg + end ); prev = this.linkRight(prev, separatorNode); - if ( this.interactive && domainNode === 0 ) { + if ( domainNode === 0 ) { this.addNodeFlags(separatorNode, NODE_FLAG_ERROR); + this.addFlags(AST_FLAG_HAS_ERROR); } } else { separatorNode = 0; } beg = end; } + // Dangling separator node + if ( separatorNode !== 0 ) { + this.addNodeFlags(separatorNode, NODE_FLAG_ERROR); + this.addFlags(AST_FLAG_HAS_ERROR); + } this.linkDown(containerNode, this.throwHeadNode(listNode)); return containerNode; } @@ -1724,12 +1729,13 @@ export class AstFilterParser { } if ( beg !== parentEnd ) { next = this.allocTypedNode(NODE_TYPE_OPTION_VALUE_DOMAIN, beg, parentEnd); - const hn = this.normalizeHostnameValue(this.getNodeString(next), mode); + const hn = this.normalizeDomainValue(this.getNodeString(next), mode); if ( hn !== undefined ) { if ( hn !== '' ) { this.setNodeTransform(next, hn); } else { this.addNodeFlags(parent, 
NODE_FLAG_ERROR); + this.addFlags(AST_FLAG_HAS_ERROR); } } if ( head === 0 ) { @@ -1737,10 +1743,32 @@ export class AstFilterParser { } else { this.linkRight(head, next); } + } else { + this.addNodeFlags(parent, NODE_FLAG_ERROR); + this.addFlags(AST_FLAG_HAS_ERROR); } return head; } + // mode bits: + // 0b00001: can use wildcard at any position + // 0b00010: can use entity-based hostnames + // 0b00100: can use single wildcard + // 0b01000: can be negated + // 0b10000: can be a regex + normalizeDomainValue(s, modeBits) { + if ( (modeBits & 0b10000) === 0 || + s.length <= 2 || + s.charCodeAt(0) !== 0x2F /* / */ || + exCharCodeAt(s, -1) !== 0x2F /* / */ + ) { + return this.normalizeHostnameValue(s, modeBits); + } + const source = this.normalizeRegexPattern(s); + if ( source === '' ) { return ''; } + return `/${source}/`; + } + parseExt(parent, anchorBeg, anchorLen) { const parentBeg = this.nodes[parent+NODE_BEG_INDEX]; const parentEnd = this.nodes[parent+NODE_END_INDEX]; @@ -1756,7 +1784,7 @@ export class AstFilterParser { ); this.addFlags(AST_FLAG_HAS_OPTIONS); this.addNodeToRegister(NODE_TYPE_EXT_OPTIONS, next); - this.linkDown(next, this.parseDomainList(next, ',', 0b1110)); + this.linkDown(next, this.parseDomainList(next, ',', 0b01110)); prev = this.linkRight(prev, next); } next = this.allocTypedNode( @@ -2276,7 +2304,6 @@ export class AstFilterParser { // 0b00010: can use entity-based hostnames // 0b00100: can use single wildcard // 0b01000: can be negated - // 0b10000: can be a regex // // returns: // undefined: no normalization needed, use original hostname diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 4c3763f75..fb2794237 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -1492,21 +1492,22 @@ const compileDomainOpt = (ctors, iterable, prepend, units) => { const hostnameMisses = []; const entityHits = []; const entityMisses = []; + const regexHits = []; + const regexMisses = []; for ( const s 
of iterable ) { const len = s.length; const beg = len > 1 && s.charCodeAt(0) === 0x7E /* '~' */ ? 1 : 0; if ( len <= beg ) { continue; } - if ( s.endsWith('.*') === false ) { - if ( beg === 0 ) { - hostnameHits.push(s); - } else { - hostnameMisses.push(s.slice(1)); - } - } else if ( beg === 0 ) { - entityHits.push(s); - } else { - entityMisses.push(s.slice(1)); + if ( s.charCodeAt(beg) === 0x2F /* / */ ) { + if ( beg === 0 ) { regexHits.push(s); continue; } + regexMisses.push(s.slice(1)); continue; } + if ( s.endsWith('.*') === false ) { + if ( beg === 0 ) { hostnameHits.push(s); continue; } + hostnameMisses.push(s.slice(1)); continue; + } + if ( beg === 0 ) { entityHits.push(s); continue; } + entityMisses.push(s.slice(1)); continue; } const toTrie = []; let trieWhich = 0b00; @@ -1532,6 +1533,9 @@ const compileDomainOpt = (ctors, iterable, prepend, units) => { for ( const hn of entityHits ) { compiledHit.push(ctors[1].compile(hn)); } + for ( const hn of regexHits ) { + compiledHit.push(ctors[3].compile(hn)); + } if ( compiledHit.length > 1 ) { compiledHit[0] = FilterDomainHitAny.compile(compiledHit.slice()); compiledHit.length = 1; @@ -1550,14 +1554,17 @@ const compileDomainOpt = (ctors, iterable, prepend, units) => { const compiledMiss = []; if ( toTrie.length !== 0 ) { compiledMiss.push( - ctors[5].compile(toTrie.sort(), trieWhich) + ctors[6].compile(toTrie.sort(), trieWhich) ); } for ( const hn of hostnameMisses ) { - compiledMiss.push(ctors[3].compile(hn)); + compiledMiss.push(ctors[4].compile(hn)); } for ( const hn of entityMisses ) { - compiledMiss.push(ctors[4].compile(hn)); + compiledMiss.push(ctors[5].compile(hn)); + } + for ( const hn of regexMisses ) { + compiledMiss.push(ctors[7].compile(hn)); } if ( prepend ) { if ( compiledHit.length !== 0 ) { @@ -1749,6 +1756,47 @@ class FilterDomainHitSet { /******************************************************************************/ +class FilterDomainRegexHit { + static getDomainOpt(idata) { + const ref = 
filterRefs[filterData[idata+1]]; + return ref.restr; + } + + static match(idata) { + const ref = filterRefs[filterData[idata+1]]; + if ( ref.$re === null ) { + ref.$re = new RegExp(ref.restr.slice(1,-1)); + } + return ref.$re.test(this.getMatchTarget()); + } + + static compile(restr) { + return [ this.fid, restr ]; + } + + static fromCompiled(args) { + const idata = filterDataAllocLen(2); + filterData[idata+0] = args[0]; // fid + filterData[idata+1] = filterRefAdd({ restr: args[1], $re: null }); + return idata; + } + + static dnrFromCompiled(args, rule) { + rule.condition = rule.condition || {}; + const prop = this.dnrConditionName; + if ( rule.condition[prop] === undefined ) { + rule.condition[prop] = []; + } + rule.condition[prop].push(args[1]); + } + + static dumpInfo(idata) { + return this.getDomainOpt(idata); + } +} + +/******************************************************************************/ + // Implement the following filter option: // - domain= // - from= @@ -1845,20 +1893,44 @@ class FilterFromDomainMissSet extends FilterFromDomainHitSet { } } +class FilterFromRegexHit extends FilterDomainRegexHit { + static getMatchTarget() { + return $docHostname; + } + + static logData(idata, details) { + details.fromDomains.push(`${this.getDomainOpt(idata)}`); + } +} + +class FilterFromRegexMiss extends FilterFromRegexHit { + static match(idata) { + return super.match(idata) === false; + } + + static logData(idata, details) { + details.fromDomains.push(`~${this.getDomainOpt(idata)}`); + } +} + registerFilterClass(FilterFromDomainHit); registerFilterClass(FilterFromDomainMiss); registerFilterClass(FilterFromEntityHit); registerFilterClass(FilterFromEntityMiss); registerFilterClass(FilterFromDomainHitSet); registerFilterClass(FilterFromDomainMissSet); +registerFilterClass(FilterFromRegexHit); +registerFilterClass(FilterFromRegexMiss); const fromOptClasses = [ FilterFromDomainHit, FilterFromEntityHit, FilterFromDomainHitSet, + FilterFromRegexHit, 
FilterFromDomainMiss, FilterFromEntityMiss, FilterFromDomainMissSet, + FilterFromRegexMiss, ]; const compileFromDomainOpt = (...args) => { @@ -1946,20 +2018,44 @@ class FilterToDomainMissSet extends FilterToDomainHitSet { } } +class FilterToRegexHit extends FilterDomainRegexHit { + static getMatchTarget() { + return $requestHostname; + } + + static logData(idata, details) { + details.toDomains.push(`${this.getDomainOpt(idata)}`); + } +} + +class FilterToRegexMiss extends FilterToRegexHit { + static match(idata) { + return super.match(idata) === false; + } + + static logData(idata, details) { + details.toDomains.push(`~${this.getDomainOpt(idata)}`); + } +} + registerFilterClass(FilterToDomainHit); registerFilterClass(FilterToDomainMiss); registerFilterClass(FilterToEntityHit); registerFilterClass(FilterToEntityMiss); registerFilterClass(FilterToDomainHitSet); registerFilterClass(FilterToDomainMissSet); +registerFilterClass(FilterToRegexHit); +registerFilterClass(FilterToRegexMiss); const toOptClasses = [ FilterToDomainHit, FilterToEntityHit, FilterToDomainHitSet, + FilterToRegexHit, FilterToDomainMiss, FilterToEntityMiss, FilterToDomainMissSet, + FilterToRegexMiss, ]; const compileToDomainOpt = (...args) => { @@ -3678,7 +3774,7 @@ class FilterCompiler { isJustOrigin() { if ( this.optionUnitBits !== this.FROM_BIT ) { return false; } if ( this.isRegex ) { return false; } - if ( this.fromDomainOpt.includes('~') ) { return false; } + if ( /[\/~]/.test(this.fromDomainOpt) ) { return false; } if ( this.pattern === '*' ) { return true; } if ( this.anchor !== 0b010 ) { return false; } if ( /^(?:http[s*]?:(?:\/\/)?)$/.test(this.pattern) ) { return true; } diff --git a/src/js/storage.js b/src/js/storage.js index 6753dcfab..4304972cd 100644 --- a/src/js/storage.js +++ b/src/js/storage.js @@ -1030,7 +1030,14 @@ self.addEventListener('hiddenSettingsChanged', ( ) => { parser.parse(line); if ( parser.isFilter() === false ) { continue; } - if ( parser.hasError() ) { continue; } + 
if ( parser.hasError() ) { + logger.writeOne({ + realm: 'message', + type: 'error', + text: `Invalid filter: ${parser.raw}` + }); + continue; + } if ( parser.isExtendedFilter() ) { staticExtFilteringEngine.compile(parser, writer);