diff --git a/src/js/background.js b/src/js/background.js index 1531a2bb8..7125119ac 100644 --- a/src/js/background.js +++ b/src/js/background.js @@ -137,8 +137,8 @@ const µBlock = (function() { // jshint ignore:line // Read-only systemSettings: { - compiledMagic: 8, // Increase when compiled format changes - selfieMagic: 9 // Increase when selfie format changes + compiledMagic: 10, // Increase when compiled format changes + selfieMagic: 10 // Increase when selfie format changes }, restoreBackupSettings: { diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js index 3b775dda7..f9d24c6ff 100644 --- a/src/js/static-net-filtering.js +++ b/src/js/static-net-filtering.js @@ -101,16 +101,8 @@ const typeValueToTypeName = { 20: 'unsupported' }; -const BlockAnyTypeAnyParty = BlockAction | AnyType | AnyParty; -const BlockAnyType = BlockAction | AnyType; -const BlockAnyParty = BlockAction | AnyParty; - -const AllowAnyTypeAnyParty = AllowAction | AnyType | AnyParty; -const AllowAnyType = AllowAction | AnyType; -const AllowAnyParty = AllowAction | AnyParty; - -const genericHideException = AllowAction | AnyParty | typeNameToTypeValue.generichide, - genericHideImportant = BlockAction | AnyParty | typeNameToTypeValue.generichide | Important; +const AnyTypeAnyParty = AnyType | AnyParty; +const BlockImportant = BlockAction | Important; // ABP filters: https://adblockplus.org/en/filters // regex tester: http://regex101.com/ @@ -701,8 +693,19 @@ registerFilterClass(FilterRegex); // The optimal "class" is picked according to the content of the // `domain=` filter option. -const filterOrigin = { - compile: function(details, wrapped) { +const filterOrigin = new (class { + constructor() { + let trieDetails; + try { + trieDetails = JSON.parse( + vAPI.localStorage.getItem('FilterOrigin.trieDetails') + ); + } catch(ex) { + } + this.trieContainer = new HNTrieContainer(trieDetails); + } + + compile(details, wrapped) { const domainOpt = details.domainOpt; // One hostname if ( domainOpt.indexOf('|') === -1 ) { @@ -723,38 +726,32 @@ const filterOrigin = { } // Must be in one set, but not in the other. return FilterOriginMixedSet.compile(domainOpt, wrapped); - }, - logData: function(f, arg1, arg2) { + } + + logData(f, arg1, arg2) { const out = f.wrapped.logData(); out.compiled = [ f.fid, arg1, out.compiled ]; if ( out.opts !== undefined ) { out.opts += ','; } out.opts = `domain=${arg2 || arg1}`; return out; - }, - trieContainer: (function() { - let trieDetails; - try { - trieDetails = JSON.parse( - vAPI.localStorage.getItem('FilterOrigin.trieDetails') - ); - } catch(ex) { - } - return new HNTrieContainer(trieDetails); - })(), - readyToUse: function() { + } + + readyToUse() { return this.trieContainer.readyToUse(); - }, - reset: function() { + } + + reset() { return this.trieContainer.reset(); - }, - optimize: function() { + } + + optimize() { const trieDetails = this.trieContainer.optimize(); vAPI.localStorage.setItem( 'FilterOrigin.trieDetails', JSON.stringify(trieDetails) ); - }, -}; + } +})(); /******************************************************************************/ @@ -850,11 +847,13 @@ registerFilterClass(FilterOriginMiss); /******************************************************************************/ const FilterOriginHitSet = class { - constructor(domainOpt, wrapped) { + constructor(domainOpt, oneOf, wrapped) { this.domainOpt = domainOpt.length < 128 ? domainOpt : µb.stringDeduplicater.lookup(domainOpt); - this.oneOf = null; + this.oneOf = oneOf !== null + ? filterOrigin.trieContainer.createOne(oneOf) + : null; this.wrapped = wrapped; } @@ -873,17 +872,25 @@ const FilterOriginHitSet = class { } compile() { - return [ this.fid, this.domainOpt, this.wrapped.compile() ]; + return [ + this.fid, + this.domainOpt, + this.oneOf !== null + ? filterOrigin.trieContainer.compileOne(this.oneOf) + : null, + this.wrapped.compile() + ]; } static compile(domainOpt, wrapped) { - return [ FilterOriginHitSet.fid, domainOpt, wrapped ]; + return [ FilterOriginHitSet.fid, domainOpt, null, wrapped ]; } static load(args) { return new FilterOriginHitSet( args[1], - filterFromCompiledData(args[2]) + args[2], + filterFromCompiledData(args[3]) ); } }; @@ -893,11 +900,13 @@ registerFilterClass(FilterOriginHitSet); /******************************************************************************/ const FilterOriginMissSet = class { - constructor(domainOpt, wrapped) { + constructor(domainOpt, noneOf, wrapped) { this.domainOpt = domainOpt.length < 128 ? domainOpt : µb.stringDeduplicater.lookup(domainOpt); - this.noneOf = null; + this.noneOf = noneOf !== null + ? filterOrigin.trieContainer.createOne(noneOf) + : null; this.wrapped = wrapped; } @@ -916,17 +925,25 @@ const FilterOriginMissSet = class { } compile() { - return [ this.fid, this.domainOpt, this.wrapped.compile() ]; + return [ + this.fid, + this.domainOpt, + this.noneOf !== null + ? filterOrigin.trieContainer.compileOne(this.noneOf) + : null, + this.wrapped.compile() + ]; } static compile(domainOpt, wrapped) { - return [ FilterOriginMissSet.fid, domainOpt, wrapped ]; + return [ FilterOriginMissSet.fid, domainOpt, null, wrapped ]; } static load(args) { return new FilterOriginMissSet( args[1], - filterFromCompiledData(args[2]) + args[2], + filterFromCompiledData(args[3]) ); } }; @@ -936,12 +953,16 @@ registerFilterClass(FilterOriginMissSet); /******************************************************************************/ const FilterOriginMixedSet = class { - constructor(domainOpt, wrapped) { + constructor(domainOpt, oneOf, noneOf, wrapped) { this.domainOpt = domainOpt.length < 128 ? domainOpt : µb.stringDeduplicater.lookup(domainOpt); - this.oneOf = null; - this.noneOf = null; + this.oneOf = oneOf !== null + ? filterOrigin.trieContainer.createOne(oneOf) + : null; + this.noneOf = noneOf !== null + ? filterOrigin.trieContainer.createOne(noneOf) + : null; this.wrapped = wrapped; } @@ -971,17 +992,29 @@ const FilterOriginMixedSet = class { } compile() { - return [ this.fid, this.domainOpt, this.wrapped.compile() ]; + return [ + this.fid, + this.domainOpt, + this.oneOf !== null + ? filterOrigin.trieContainer.compileOne(this.oneOf) + : null, + this.noneOf !== null + ? filterOrigin.trieContainer.compileOne(this.noneOf) + : null, + this.wrapped.compile() + ]; } static compile(domainOpt, wrapped) { - return [ FilterOriginMixedSet.fid, domainOpt, wrapped ]; + return [ FilterOriginMixedSet.fid, domainOpt, null, null, wrapped ]; } static load(args) { return new FilterOriginMixedSet( args[1], - filterFromCompiledData(args[2]) + args[2], + args[3], + filterFromCompiledData(args[4]) ); } }; @@ -1129,6 +1162,95 @@ registerFilterClass(FilterHostnameDict); /******************************************************************************/ +// Dictionary of hostnames for filters which only purpose is to match +// the document origin. + +const FilterJustOrigin = class { + constructor(args) { + this.h = ''; // short-lived register + this.dict = filterOrigin.trieContainer.createOne(args); + } + + get size() { + return this.dict.size; + } + + add(hn) { + return this.dict.add(hn); + } + + match() { + const pos = this.dict.matches(pageHostnameRegister); + if ( pos === -1 ) { return false; } + this.h = pageHostnameRegister.slice(pos); + return true; + } + + logData() { + return { + raw: '*', + regex: '^', + compiled: this.h + }; + } + + compile() { + return [ this.fid, filterOrigin.trieContainer.compileOne(this.dict) ]; + } + + static load(args) { + return new FilterJustOrigin(args[1]); + } +}; + +registerFilterClass(FilterJustOrigin); + +/******************************************************************************/ + +const FilterHTTPSJustOrigin = class extends FilterJustOrigin { + match(url) { + return url.startsWith('https://') && super.match(); + } + + logData() { + return { + raw: '|https://', + regex: '^https://', + compiled: this.h + }; + } + + static load(args) { + return new FilterHTTPSJustOrigin(args[1]); + } +}; + +registerFilterClass(FilterHTTPSJustOrigin); + +/******************************************************************************/ + +const FilterHTTPJustOrigin = class extends FilterJustOrigin { + match(url) { + return url.startsWith('http://') && super.match(); + } + + logData() { + return { + raw: '|http://', + regex: '^http://', + compiled: this.h + }; + } + + static load(args) { + return new FilterHTTPJustOrigin(args[1]); + } +}; + +registerFilterClass(FilterHTTPJustOrigin); + +/******************************************************************************/ + const FilterPair = class { constructor(a, b) { this.f1 = a; @@ -1409,7 +1531,7 @@ const FilterParser = function() { this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/; this.reIsWildcarded = /[\^\*]/; this.domainOpt = ''; - this.noTokenHash = µb.urlTokenizer.tokenHashFromString('*'); + this.noTokenHash = µb.urlTokenizer.noTokenHash; this.unsupportedTypeBit = this.bitFromType('unsupported'); // All network request types to bitmap // bring origin to 0 (from 4 -- see typeNameToTypeValue) @@ -1819,14 +1941,14 @@ FilterParser.prototype.parse = function(raw) { // These "bad tokens" are collated manually. // Hostname-anchored with no wildcard always have a token index of 0. -var reHostnameToken = /^[0-9a-z]+/; -var reGoodToken = /[%0-9a-z]{2,}/g; -var reRegexToken = /[%0-9A-Za-z]{2,}/g; -var reRegexTokenAbort = /[([]/; -var reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/; -var reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/; +const reHostnameToken = /^[0-9a-z]+/; +const reGoodToken = /[%0-9a-z]{2,}/g; +const reRegexToken = /[%0-9A-Za-z]{2,}/g; +const reRegexTokenAbort = /[([]/; +const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/; +const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/; -var badTokens = new Set([ +const badTokens = new Set([ 'com', 'google', 'http', @@ -1842,13 +1964,13 @@ var badTokens = new Set([ FilterParser.prototype.findFirstGoodToken = function() { reGoodToken.lastIndex = 0; - var s = this.f, - matches, lpos, - badTokenMatch = null; + const s = this.f; + let matches; + let badTokenMatch = null; while ( (matches = reGoodToken.exec(s)) !== null ) { // https://github.com/gorhill/uBlock/issues/997 // Ignore token if preceded by wildcard. - lpos = matches.index; + const lpos = matches.index; if ( lpos !== 0 && s.charCodeAt(lpos - 1) === 0x2A /* '*' */ ) { continue; } @@ -1868,10 +1990,10 @@ FilterParser.prototype.findFirstGoodToken = function() { FilterParser.prototype.extractTokenFromRegex = function() { reRegexToken.lastIndex = 0; - var s = this.f, - matches, prefix; + const s = this.f; + let matches; while ( (matches = reRegexToken.exec(s)) !== null ) { - prefix = s.slice(0, matches.index); + const prefix = s.slice(0, matches.index); if ( reRegexTokenAbort.test(prefix) ) { return; } if ( reRegexBadPrefix.test(prefix) || @@ -1916,14 +2038,27 @@ FilterParser.prototype.makeToken = function() { } }; +/******************************************************************************/ + +FilterParser.prototype.isJustOrigin = function() { + return this.datatype === undefined && + this.redirect === false && + this.domainOpt !== '' && + /^(?:\*|https?:(?:\/\/)?)$/.test(this.f) && + this.domainOpt.indexOf('~') === -1; +}; + /******************************************************************************/ /******************************************************************************/ const FilterContainer = function() { this.filterParser = new FilterParser(); this.urlTokenizer = µb.urlTokenizer; - this.noTokenHash = this.urlTokenizer.tokenHashFromString('*'); - this.dotTokenHash = this.urlTokenizer.tokenHashFromString('.'); + this.noTokenHash = this.urlTokenizer.noTokenHash; + this.dotTokenHash = this.urlTokenizer.dotTokenHash; + this.anyTokenHash = this.urlTokenizer.anyTokenHash; + this.anyHTTPSTokenHash = this.urlTokenizer.anyHTTPSTokenHash; + this.anyHTTPTokenHash = this.urlTokenizer.anyHTTPTokenHash; this.reset(); }; @@ -1945,15 +2080,16 @@ FilterContainer.prototype.reset = function() { this.dataFilters = new Map(); this.filterParser.reset(); - // This will invalidate all hn tries throughout uBO: - filterOrigin.reset(); + // This will invalidate all tries FilterHostnameDict.reset(); + filterOrigin.reset(); FilterBucket.reset(); // Runtime registers - this.cbRegister = undefined; - this.thRegister = undefined; - this.fRegister = null; + this.urlRegister = ''; + this.catbitsRegister = 0; + this.tokenRegister = 0; + this.filterRegister = null; }; /******************************************************************************/ @@ -2014,6 +2150,33 @@ FilterContainer.prototype.freeze = function() { continue; } + if ( tokenHash === this.anyTokenHash ) { + if ( entry === undefined ) { + entry = new FilterJustOrigin(); + bucket.set(this.anyTokenHash, entry); + } + entry.add(fdata); + continue; + } + + if ( tokenHash === this.anyHTTPSTokenHash ) { + if ( entry === undefined ) { + entry = new FilterHTTPSJustOrigin(); + bucket.set(this.anyHTTPSTokenHash, entry); + } + entry.add(fdata); + continue; + } + + if ( tokenHash === this.anyHTTPTokenHash ) { + if ( entry === undefined ) { + entry = new FilterHTTPJustOrigin(); + bucket.set(this.anyHTTPTokenHash, entry); + } + entry.add(fdata); + continue; + } + if ( entry === undefined ) { bucket.set(tokenHash, filterFromCompiledData(fdata)); continue; @@ -2036,8 +2199,8 @@ FilterContainer.prototype.freeze = function() { } this.filterParser.reset(); - this.goodFilters = new Set(); - filterOrigin.optimize(); + this.badFilters.clear(); + this.goodFilters.clear(); FilterHostnameDict.optimize(); FilterBucket.optimize(); this.frozen = true; @@ -2078,11 +2241,17 @@ FilterContainer.prototype.toSelfie = function(path) { return selfie; }; + filterOrigin.optimize(); + return Promise.all([ µBlock.assets.put( `${path}/FilterHostnameDict.trieContainer`, FilterHostnameDict.trieContainer.serialize(µBlock.base128) ), + µBlock.assets.put( + `${path}/FilterOrigin.trieContainer`, + filterOrigin.trieContainer.serialize(µBlock.base128) + ), µBlock.assets.put( `${path}/FilterBucket.trieContainer`, FilterBucket.trieContainer.serialize(µBlock.base128) @@ -2114,6 +2283,13 @@ FilterContainer.prototype.fromSelfie = function(path) { ); return true; }), + µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details => { + filterOrigin.trieContainer.unserialize( + details.content, + µBlock.base128 + ); + return true; + }), µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details => { FilterBucket.trieContainer.unserialize( details.content, @@ -2205,9 +2381,15 @@ FilterContainer.prototype.compile = function(raw, writer) { } else if ( parsed.hostnamePure ) { fdata = FilterPlainHostname.compile(parsed); } else if ( parsed.f === '*' ) { + if ( parsed.isJustOrigin() ) { + parsed.tokenHash = this.anyTokenHash; + for ( const hn of parsed.domainOpt.split('|') ) { + this.compileToAtomicFilter(parsed, hn, writer); + } + return true; + } fdata = FilterTrue.compile(); } else if ( parsed.anchor === 0x5 ) { - // https://github.com/gorhill/uBlock/issues/1669 fdata = FilterGenericHnAndRightAnchored.compile(parsed); } else if ( parsed.anchor === 0x4 ) { if ( @@ -2222,6 +2404,22 @@ FilterContainer.prototype.compile = function(raw, writer) { } else if ( parsed.wildcarded || parsed.tokenHash === parsed.noTokenHash ) { fdata = FilterGeneric.compile(parsed); } else if ( parsed.anchor === 0x2 ) { + if ( parsed.isJustOrigin() ) { + if ( parsed.f === 'https://' ) { + parsed.tokenHash = this.anyHTTPSTokenHash; + for ( const hn of parsed.domainOpt.split('|') ) { + this.compileToAtomicFilter(parsed, hn, writer); + } + return true; + } + if ( parsed.f === 'http://' ) { + parsed.tokenHash = this.anyHTTPTokenHash; + for ( const hn of parsed.domainOpt.split('|') ) { + this.compileToAtomicFilter(parsed, hn, writer); + } + return true; + } + } fdata = FilterPlainLeftAnchored.compile(parsed); } else if ( parsed.anchor === 0x1 ) { fdata = FilterPlainRightAnchored.compile(parsed); @@ -2310,15 +2508,9 @@ FilterContainer.prototype.fromCompiledContent = function(reader) { } // 1 = network filters: bad filter directives - // Since we are going to keep bad filter fingerprints around, we ensure - // they are "detached" from the parent string from which they are sliced. - // We keep bad filter fingerprints around to use them when user - // incrementally add filters (through "Block element" for example). reader.select(1); while ( reader.next() ) { - if ( this.badFilters.has(reader.line) === false ) { - this.badFilters.add(µb.orphanizeString(reader.line)); - } + this.badFilters.add(reader.line); } }; @@ -2415,18 +2607,16 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out /******************************************************************************/ -// bucket: Map -// url: string - -FilterContainer.prototype.matchTokens = function(bucket, url) { +FilterContainer.prototype.matchTokens = function(bucket) { // Hostname-only filters let f = bucket.get(this.dotTokenHash); if ( f !== undefined && f.match() === true ) { - this.thRegister = this.dotTokenHash; - this.fRegister = f; + this.tokenRegister = this.dotTokenHash; + this.filterRegister = f; return true; } + const url = this.urlRegister; const tokenHashes = this.urlTokenizer.getTokens(); let i = 0; for (;;) { @@ -2434,27 +2624,91 @@ FilterContainer.prototype.matchTokens = function(bucket, url) { if ( tokenHash === 0 ) { break; } f = bucket.get(tokenHash); if ( f !== undefined && f.match(url, tokenHashes[i+1]) === true ) { - this.thRegister = tokenHash; - this.fRegister = f; + this.tokenRegister = tokenHash; + this.filterRegister = f; return true; } i += 2; } - // Untokenizable filters - f = bucket.get(this.noTokenHash); - if ( f !== undefined && f.match(url, 0) === true ) { - this.thRegister = this.noTokenHash; - this.fRegister = f; - return true; - } - return false; }; /******************************************************************************/ -// Specialized handlers +FilterContainer.prototype.realmMatchStringExactType = function( + realmBits, + typeBits, + partyBits +) { + let bucket; + let catBits = realmBits | typeBits; + if ( (bucket = this.categories.get(catBits)) ) { + if ( this.matchTokens(bucket) ) { + this.catbitsRegister = catBits; + return true; + } + } + if ( partyBits !== 0 ) { + catBits = realmBits | typeBits | partyBits; + if ( (bucket = this.categories.get(catBits)) ) { + if ( this.matchTokens(bucket) ) { + this.catbitsRegister = catBits; + return true; + } + } + } + return false; +}; + +/******************************************************************************/ + +FilterContainer.prototype.realmMatchString = function( + realmBits, + typeBits, + partyBits +) { + let bucket; + let catBits = realmBits | AnyTypeAnyParty; + if ( (bucket = this.categories.get(catBits)) ) { + if ( this.matchTokens(bucket) ) { + this.catbitsRegister = catBits; + return true; + } + } + if ( partyBits !== 0 ) { + catBits = realmBits | AnyType | partyBits; + if ( (bucket = this.categories.get(catBits)) ) { + if ( this.matchTokens(bucket) ) { + this.catbitsRegister = catBits; + return true; + } + } + } + if ( typeBits !== 0 ) { + catBits = realmBits | AnyParty | typeBits; + if ( (bucket = this.categories.get(catBits)) ) { + if ( this.matchTokens(bucket) ) { + this.catbitsRegister = catBits; + return true; + } + } + } + if ( typeBits !== 0 && partyBits !== 0 ) { + catBits = realmBits | typeBits | partyBits; + if ( (bucket = this.categories.get(catBits)) ) { + if ( this.matchTokens(bucket) ) { + this.catbitsRegister = catBits; + return true; + } + } + } + return false; +}; + +/******************************************************************************/ + +// Specialized handler // https://github.com/gorhill/uBlock/issues/1477 // Special case: blocking-generichide filter ALWAYS exists, it is implicit -- @@ -2464,28 +2718,26 @@ FilterContainer.prototype.matchTokens = function(bucket, url) { // User may want to override `generichide` exception filters. FilterContainer.prototype.matchStringGenericHide = function(requestURL) { - let url = this.urlTokenizer.setURL(requestURL); + const typeBits = typeNameToTypeValue['generichide']; - // https://github.com/gorhill/uBlock/issues/2225 - // Important: - // - `pageHostnameRegister` is used by FilterOrigin?.match(). - // - `requestHostnameRegister` is used by FilterHostnameDict.match(). - pageHostnameRegister = requestHostnameRegister = µb.URI.hostnameFromURI(url); + // Prime tokenizer: we get a normalized URL in return. + this.urlRegister = this.urlTokenizer.setURL(requestURL); + this.filterRegister = null; - let bucket = this.categories.get(genericHideException); - if ( !bucket || this.matchTokens(bucket, url) === false ) { - this.fRegister = null; - return 0; + // These registers will be used by various filters + pageHostnameRegister = requestHostnameRegister = + µb.URI.hostnameFromURI(requestURL); + + // Exception filters + if ( this.realmMatchStringExactType(AllowAction, typeBits, FirstParty) ) { + // Important block filters. + if ( this.realmMatchStringExactType(BlockImportant, typeBits, FirstParty) ) { + return 1; + } + return 2; } + return 0; - bucket = this.categories.get(genericHideImportant); - if ( bucket && this.matchTokens(bucket, url) ) { - this.cbRegister = genericHideImportant; - return 1; - } - - this.cbRegister = genericHideException; - return 2; }; /******************************************************************************/ @@ -2495,244 +2747,84 @@ FilterContainer.prototype.matchStringGenericHide = function(requestURL) { // not the generic handling. FilterContainer.prototype.matchStringExactType = function(fctxt, requestType) { - // Special cases. - if ( requestType === 'generichide' ) { - return this.matchStringGenericHide(fctxt.url); - } - let type = typeNameToTypeValue[requestType]; - if ( type === undefined ) { return 0; } + const typeBits = typeNameToTypeValue[requestType]; + if ( typeBits === undefined ) { return 0; } + const partyBits = fctxt.is3rdPartyToDoc() ? ThirdParty : FirstParty; // Prime tokenizer: we get a normalized URL in return. - let url = this.urlTokenizer.setURL(fctxt.url); + this.urlRegister = this.urlTokenizer.setURL(fctxt.url); + this.filterRegister = null; // These registers will be used by various filters pageHostnameRegister = fctxt.getDocHostname(); requestHostnameRegister = fctxt.getHostname(); - let party = fctxt.is3rdPartyToDoc() ? ThirdParty : FirstParty; - let categories = this.categories, - catBits, bucket; - - this.fRegister = null; - - // https://github.com/chrisaljoudi/uBlock/issues/139 - // Test against important block filters - catBits = BlockAnyParty | Important | type; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 1; - } + // Important block filters. + if ( this.realmMatchStringExactType(BlockImportant, typeBits, partyBits) ) { + return 1; } - catBits = BlockAction | Important | type | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 1; - } - } - - // Test against block filters - catBits = BlockAnyParty | type; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - } - } - if ( this.fRegister === null ) { - catBits = BlockAction | type | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - } - } - } - - // If there is no block filter, no need to test against allow filters - if ( this.fRegister === null ) { - return 0; - } - - // Test against allow filters - catBits = AllowAnyParty | type; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; + // Block filters + if ( this.realmMatchString(BlockAction, typeBits, partyBits) ) { + // Exception filters + if ( this.realmMatchStringExactType(AllowAction, typeBits, partyBits) ) { return 2; } + return 1; } - catBits = AllowAction | type | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 2; - } - } - - return 1; + return 0; }; /******************************************************************************/ FilterContainer.prototype.matchString = function(fctxt) { // https://github.com/chrisaljoudi/uBlock/issues/519 - // Use exact type match for anything beyond `other` - // Also, be prepared to support unknown types - let type = typeNameToTypeValue[fctxt.type]; - if ( type === undefined ) { - type = otherTypeBitValue; - } else if ( type === 0 || type > otherTypeBitValue ) { + // Use exact type match for anything beyond `other` + // Also, be prepared to support unknown types + let typeBits = typeNameToTypeValue[fctxt.type]; + if ( typeBits === undefined ) { + typeBits = otherTypeBitValue; + } else if ( typeBits === 0 || typeBits > otherTypeBitValue ) { return this.matchStringExactType(fctxt, fctxt.type); } - - // The logic here is simple: - // - // block = !whitelisted && blacklisted - // or equivalent - // allow = whitelisted || !blacklisted - - // Statistically, hits on a URL in order of likelihood: - // 1. No hit - // 2. Hit on a block filter - // 3. Hit on an allow filter - // - // High likelihood of "no hit" means to optimize we need to reduce as much - // as possible the number of filters to test. - // - // Then, because of the order of probabilities, we should test only - // block filters first, and test allow filters if and only if there is a - // hit on a block filter. Since there is a high likelihood of no hit, - // testing allow filter by default is likely wasted work, hence allow - // filters are tested *only* if there is a (unlikely) hit on a block - // filter. + const partyBits = fctxt.is3rdPartyToDoc() ? ThirdParty : FirstParty; // Prime tokenizer: we get a normalized URL in return. - const url = this.urlTokenizer.setURL(fctxt.url); + this.urlRegister = this.urlTokenizer.setURL(fctxt.url); + this.filterRegister = null; // These registers will be used by various filters pageHostnameRegister = fctxt.getDocHostname(); requestHostnameRegister = fctxt.getHostname(); - this.fRegister = null; - - const party = fctxt.is3rdPartyToDoc() - ? ThirdParty - : FirstParty; - const categories = this.categories; - let catBits, bucket; - - // https://github.com/chrisaljoudi/uBlock/issues/139 - // Test against important block filters. - // The purpose of the `important` option is to reverse the order of - // evaluation. Normally, it is "evaluate block then evaluate allow", with - // the `important` property it is "evaluate allow then evaluate block". - catBits = BlockAnyTypeAnyParty | Important; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 1; - } + // Important block filters. + if ( this.realmMatchString(BlockImportant, typeBits, partyBits) ) { + return 1; } - catBits = BlockAnyType | Important | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 1; - } - } - catBits = BlockAnyParty | Important | type; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 1; - } - } - catBits = BlockAction | Important | type | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 1; - } - } - - // Test against block filters - catBits = BlockAnyTypeAnyParty; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - } - } - if ( this.fRegister === null ) { - catBits = BlockAnyType | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - } - } - if ( this.fRegister === null ) { - catBits = BlockAnyParty | type; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - } - } - if ( this.fRegister === null ) { - catBits = BlockAction | type | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - } - } - } - } - } - - // If there is no block filter, no need to test against allow filters - if ( this.fRegister === null ) { - return 0; - } - - // Test against allow filters - catBits = AllowAnyTypeAnyParty; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; + // Block filters + if ( this.realmMatchString(BlockAction, typeBits, partyBits) ) { + // Exception filters + if ( this.realmMatchString(AllowAction, typeBits, partyBits) ) { return 2; } + return 1; } - catBits = AllowAnyType | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 2; - } - } - catBits = AllowAnyParty | type; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 2; - } - } - catBits = AllowAction | type | party; - if ( (bucket = categories.get(catBits)) ) { - if ( this.matchTokens(bucket, url) ) { - this.cbRegister = catBits; - return 2; - } - } - - return 1; + return 0; }; /******************************************************************************/ FilterContainer.prototype.toLogData = function() { - if ( this.fRegister === null ) { return; } - const logData = toLogDataInternal(this.cbRegister, this.thRegister, this.fRegister); + if ( this.filterRegister === null ) { return; } + const logData = toLogDataInternal( + this.catbitsRegister, + this.tokenRegister, + this.filterRegister + ); logData.source = 'static'; - logData.tokenHash = this.thRegister; - logData.result = this.fRegister === null ? 0 : (this.cbRegister & 1 ? 2 : 1); + logData.tokenHash = this.tokenRegister; + logData.result = this.filterRegister === null + ? 0 + : (this.catbitsRegister & 1 ? 2 : 1); return logData; }; @@ -2775,7 +2867,9 @@ FilterContainer.prototype.benchmark = function(action) { const r = this.matchString(fctxt); if ( recorded !== undefined ) { recorded.push(r); } if ( expected !== undefined && r !== expected[i] ) { - throw 'Mismatch with reference results'; + console.log('Mismatch with reference results:'); + console.log(`\tExpected ${expected[i]}, got ${r}:`); + console.log(`\turl=${fctxt.url} docOrigin=${fctxt.getDocOrigin()}`); } } const t1 = self.performance.now(); @@ -2796,15 +2890,60 @@ FilterContainer.prototype.benchmark = function(action) { return 'ok'; }; -/******************************************************************************/ +/******************************************************************************- + + With default filter lists: + + As of 2019-04-18: + + {bits: "0", token: "ad", size: 926, f: FilterBucket} + {bits: "0", token: "ads", size: 636, f: FilterBucket} + {bits: "41", token: "phncdn", size: 253, f: FilterBucket} + {bits: "0", token: "analytic", size: 174, f: FilterBucket} + {bits: "0", token: "tracking", size: 155, f: FilterBucket} + {bits: "48", token: "http", size: 146, f: FilterBucket} + {bits: "48", token: "https", size: 139, f: FilterBucket} + {bits: "58", token: "http", size: 122, f: FilterBucket} + {bits: "0", token: "adv", size: 121, f: FilterBucket} + {bits: "58", token: "https", size: 118, f: FilterBucket} + {bits: "0", token: "advertis", size: 102, f: FilterBucket} + {bits: "8", token: "doublecl", size: 96, f: FilterBucket} + {bits: "41", token: "imasdk", size: 90, f: FilterBucket} + {bits: "0", token: "cdn", size: 89, f: FilterBucket} + {bits: "0", token: "track", size: 87, f: FilterBucket} + {bits: "0", token: "stats", size: 82, f: FilterBucket} + {bits: "0", token: "banner", size: 74, f: FilterBucket} + {bits: "0", token: "log", size: 72, f: FilterBucket} + {bits: "0", token: "ga", size: 71, f: FilterBucket} + {bits: "0", token: "gif", size: 67, f: FilterBucket} + {bits: "0", token: "cloudfro", size: 64, f: FilterBucket} + {bits: "0", token: "amazonaw", size: 61, f: FilterBucket} + {bits: "41", token: "ajax", size: 58, f: FilterBucket} + {bits: "0", token: "tracker", size: 56, f: FilterBucket} + {bits: "40", token: "pagead2", size: 53, f: FilterBucket} + {bits: "0", token: "affiliat", size: 53, f: FilterBucket} + +*/ FilterContainer.prototype.bucketHistogram = function() { const results = []; for ( const [ bits, category ] of this.categories ) { for ( const [ th, f ] of category ) { - if ( f instanceof FilterBucket === false ) { continue; } - const token = µBlock.urlTokenizer.stringFromTokenHash(th); - results.push({ bits, token, size: f.size, f }); + if ( f instanceof FilterBucket ) { + const token = µBlock.urlTokenizer.stringFromTokenHash(th); + results.push({ bits: bits.toString(16), token, size: f.size, f }); + continue; + } + if ( f instanceof FilterHostnameDict ) { + const token = µBlock.urlTokenizer.stringFromTokenHash(th); + results.push({ bits: bits.toString(16), token, size: f.size, f }); + continue; + } + if ( f instanceof FilterJustOrigin ) { + const token = µBlock.urlTokenizer.stringFromTokenHash(th); + results.push({ bits: bits.toString(16), token, size: f.size, f }); + continue; + } } } results.sort((a, b) => { @@ -2815,9 +2954,9 @@ FilterContainer.prototype.bucketHistogram = function() { /******************************************************************************* - As of 2019-04-13: + With default filter lists: - Filter classes histogram with default filter lists: + As of 2019-04-13: {"FilterPlainHnAnchored" => 12619} {"FilterPlainPrefix1" => 8743} diff --git a/src/js/strie.js b/src/js/strie.js index 046793ea3..3ecbda22d 100644 --- a/src/js/strie.js +++ b/src/js/strie.js @@ -46,29 +46,29 @@ const STRIE_CHAR1_SLOT = STRIE_TRIE0_SLOT + 3; // 67 / 268 const STRIE_TRIE0_START = STRIE_TRIE0_SLOT + 4 << 2; // 272 -const STrieContainer = function(details) { - if ( details instanceof Object === false ) { details = {}; } - const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1); - this.buf = new Uint8Array(Math.max(len, 131072)); - this.buf32 = new Uint32Array(this.buf.buffer); - this.buf32[STRIE_TRIE0_SLOT] = STRIE_TRIE0_START; - this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT]; - this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536; - this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT]; -}; +const STrieContainer = class { -STrieContainer.prototype = { + constructor(details) { + if ( details instanceof Object === false ) { details = {}; } + const len = (details.byteLength || 0) + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1); + this.buf = new Uint8Array(Math.max(len, 131072)); + this.buf32 = new Uint32Array(this.buf.buffer); + this.buf32[STRIE_TRIE0_SLOT] = STRIE_TRIE0_START; + this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT]; + this.buf32[STRIE_CHAR0_SLOT] = details.char0 || 65536; + this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT]; + } //-------------------------------------------------------------------------- // Public methods //-------------------------------------------------------------------------- - reset: function() { + reset() { this.buf32[STRIE_TRIE1_SLOT] = this.buf32[STRIE_TRIE0_SLOT]; this.buf32[STRIE_CHAR1_SLOT] = this.buf32[STRIE_CHAR0_SLOT]; - }, + } - matches: function(iroot, a, al) { + matches(iroot, a, al) { const ar = a.length; const char0 = this.buf32[STRIE_CHAR0_SLOT]; let icell = iroot; @@ -102,9 +102,9 @@ STrieContainer.prototype = { if ( icell === 0 || this.buf32[icell+2] === 0 ) { return al; } if ( al === ar ) { return -1; } } - }, + } - createOne: function(args) { + createOne(args) { if ( Array.isArray(args) ) { return new this.STrieRef(this, args[0], args[1]); } @@ -118,13 +118,13 @@ STrieContainer.prototype = { this.buf32[iroot+1] = 0; this.buf32[iroot+2] = 0; return new this.STrieRef(this, iroot, 0); - }, + } - compileOne: function(trieRef) { + compileOne(trieRef) { return [ trieRef.iroot, trieRef.size ]; - }, + } - add: function(iroot, s) { + add(iroot, s) { const lschar = s.length; if ( lschar === 0 ) { return 0; } let ischar = 0; @@ -221,26 +221,17 @@ STrieContainer.prototype = { } return 1; } - }, + } - optimize: function() { + optimize() { this.shrinkBuf(); return { byteLength: this.buf.byteLength, char0: this.buf32[STRIE_CHAR0_SLOT], }; - }, + } - fromIterable: function(hostnames, add) { - if ( add === undefined ) { add = 'add'; } - const trieRef = this.createOne(); - for ( const hn of hostnames ) { - trieRef[add](hn); - } - return trieRef; - }, - - serialize: function(encoder) { + serialize(encoder) { if ( encoder instanceof Object ) { return encoder.encode( this.buf32.buffer, @@ -254,9 +245,9 @@ STrieContainer.prototype = { this.buf32[STRIE_CHAR1_SLOT] + 3 >>> 2 ) ); - }, + } - unserialize: function(selfie, decoder) { + unserialize(selfie, decoder) { const shouldDecode = typeof selfie === 'string'; let byteLength = shouldDecode ? decoder.decodeSize(selfie) @@ -272,23 +263,13 @@ STrieContainer.prototype = { } else { this.buf32.set(selfie); } - }, - - //-------------------------------------------------------------------------- - // Class to hold reference to a specific trie - //-------------------------------------------------------------------------- - - STrieRef: function(container, iroot, size) { - this.container = container; - this.iroot = iroot; - this.size = size; - }, + } //-------------------------------------------------------------------------- // Private methods //-------------------------------------------------------------------------- - addCell: function(idown, iright, v) { + addCell(idown, iright, v) { let icell = this.buf32[STRIE_TRIE1_SLOT]; this.buf32[STRIE_TRIE1_SLOT] = icell + 12; icell >>>= 2; @@ -296,9 +277,9 @@ STrieContainer.prototype = { this.buf32[icell+1] = iright; this.buf32[icell+2] = v; return icell; - }, + } - addSegment: function(segment) { + addSegment(segment) { const lsegchar = segment.length; if ( lsegchar === 0 ) { return 0; } let char1 = this.buf32[STRIE_CHAR1_SLOT]; @@ -309,9 +290,9 @@ STrieContainer.prototype = { } while ( i !== lsegchar ); this.buf32[STRIE_CHAR1_SLOT] = char1; return (lsegchar << 24) | isegchar; - }, + } - growBuf: function(trieGrow, charGrow) { + growBuf(trieGrow, charGrow) { const char0 = Math.max( (this.buf32[STRIE_TRIE1_SLOT] + trieGrow + STRIE_PAGE_SIZE-1) & ~(STRIE_PAGE_SIZE-1), this.buf32[STRIE_CHAR0_SLOT] @@ -322,16 +303,16 @@ STrieContainer.prototype = { this.buf.length ); this.resizeBuf(bufLen, char0); - }, + } - shrinkBuf: function() { + shrinkBuf() { const char0 = this.buf32[STRIE_TRIE1_SLOT] + 24; const char1 = char0 + this.buf32[STRIE_CHAR1_SLOT] - this.buf32[STRIE_CHAR0_SLOT]; const bufLen = char1 + 256; this.resizeBuf(bufLen, char0); - }, + } - resizeBuf: function(bufLen, char0) { + resizeBuf(bufLen, char0) { bufLen = bufLen + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1); if ( bufLen === this.buf.length && @@ -375,23 +356,35 @@ STrieContainer.prototype = { this.buf32[STRIE_CHAR0_SLOT] = char0; this.buf32[STRIE_CHAR1_SLOT] = char0 + charDataLen; } - }, + } }; -/******************************************************************************/ +/******************************************************************************* -STrieContainer.prototype.STrieRef.prototype = { - add: function(pattern) { + Class to hold reference to a specific trie + +*/ + +STrieContainer.prototype.STrieRef = class { + constructor(container, iroot, size) { + this.container = container; + this.iroot = iroot; + this.size = size; + } + + add(pattern) { if ( this.container.add(this.iroot, pattern) === 1 ) { this.size += 1; return true; } return false; - }, - matches: function(a, al) { + } + + matches(a, al) { return this.container.matches(this.iroot, a, al); - }, - [Symbol.iterator]: function() { + } + + [Symbol.iterator]() { return { value: undefined, done: false, @@ -441,5 +434,5 @@ STrieContainer.prototype.STrieRef.prototype = { forks: [], textDecoder: new TextDecoder() }; - }, + } }; diff --git a/src/js/utils.js b/src/js/utils.js index b50de5fd1..536dd307f 100644 --- a/src/js/utils.js +++ b/src/js/utils.js @@ -41,70 +41,101 @@ // Benchmark for string-based tokens vs. safe-integer token values: // https://gorhill.github.io/obj-vs-set-vs-map/tokenize-to-str-vs-to-int.html -µBlock.urlTokenizer = { - setURL: function(url) { +µBlock.urlTokenizer = new (class { + constructor() { + this._chars = '0123456789%abcdefghijklmnopqrstuvwxyz'; + this._validTokenChars = new Uint8Array(128); + for ( let i = 0, n = this._chars.length; i < n; i++ ) { + this._validTokenChars[this._chars.charCodeAt(i)] = i + 1; + } + + this._charsEx = '0123456789%abcdefghijklmnopqrstuvwxyz*.'; + this._validTokenCharsEx = new Uint8Array(128); + for ( let i = 0, n = this._charsEx.length; i < n; i++ ) { + this._validTokenCharsEx[this._charsEx.charCodeAt(i)] = i + 1; + } + + this.dotTokenHash = this.tokenHashFromString('.'); + this.anyTokenHash = this.tokenHashFromString('..'); + this.anyHTTPSTokenHash = this.tokenHashFromString('..https'); + this.anyHTTPTokenHash = this.tokenHashFromString('..http'); + this.noTokenHash = this.tokenHashFromString('*'); + + this._urlIn = ''; + this._urlOut = ''; + this._tokenized = false; + this._tokens = [ 0 ]; + } + + setURL(url) { if ( url !== this._urlIn ) { this._urlIn = url; this._urlOut = url.toLowerCase(); this._tokenized = false; } return this._urlOut; - }, + } // Tokenize on demand. - getTokens: function() { - if ( this._tokenized === false ) { - this._tokenize(); - this._tokenized = true; + getTokens() { + if ( this._tokenized ) { return this._tokens; } + let i = this._tokenize(); + i = this._appendTokenAt(i, this.anyTokenHash, 0); + if ( this._urlOut.startsWith('https://') ) { + i = this._appendTokenAt(i, this.anyHTTPSTokenHash, 0); + } else if ( this._urlOut.startsWith('http://') ) { + i = this._appendTokenAt(i, this.anyHTTPTokenHash, 0); } + i = this._appendTokenAt(i, this.noTokenHash, 0); + this._tokens[i] = 0; + this._tokenized = true; return this._tokens; - }, + } - tokenHashFromString: function(s) { - var l = s.length; + _appendTokenAt(i, th, ti) { + this._tokens[i+0] = th; + this._tokens[i+1] = ti; + return i + 2; + } + + tokenHashFromString(s) { + const l = s.length; if ( l === 0 ) { return 0; } - if ( l === 1 ) { - if ( s === '*' ) { return 63; } - if ( s === '.' ) { return 62; } - } - var vtc = this._validTokenChars, - th = vtc[s.charCodeAt(0)]; - for ( var i = 1; i !== 8 && i !== l; i++ ) { + const vtc = this._validTokenCharsEx; + let th = vtc[s.charCodeAt(0)]; + for ( let i = 1; i !== 8 && i !== l; i++ ) { th = th * 64 + vtc[s.charCodeAt(i)]; } return th; - }, + } - stringFromTokenHash: function(th) { + stringFromTokenHash(th) { if ( th === 0 ) { return ''; } - if ( th === 63 ) { return '*'; } - if ( th === 62 ) { return '.'; } - const chars = '0123456789%abcdefghijklmnopqrstuvwxyz'; let s = ''; while ( th > 0 ) { - s = `${chars.charAt((th & 0b111111)-1)}${s}`; + s = `${this._charsEx.charAt((th & 0b111111)-1)}${s}`; th /= 64; } return s; - }, + } // https://github.com/chrisaljoudi/uBlock/issues/1118 // We limit to a maximum number of tokens. - _tokenize: function() { - var tokens = this._tokens, - url = this._urlOut, - l = url.length; - if ( l === 0 ) { tokens[0] = 0; return; } + _tokenize() { + const tokens = this._tokens; + let url = this._urlOut; + let l = url.length; + if ( l === 0 ) { return 0; } if ( l > 2048 ) { url = url.slice(0, 2048); l = 2048; } - var i = 0, j = 0, v, n, ti, th, - vtc = this._validTokenChars; + const vtc = this._validTokenChars; + let i = 0, j = 0, v, n, ti, th; for (;;) { for (;;) { - if ( i === l ) { tokens[j] = 0; return; } + if ( i === l ) { return j; } v = vtc[url.charCodeAt(i++)]; if ( v !== 0 ) { break; } } @@ -117,25 +148,12 @@ th = th * 64 + v; n += 1; } - tokens[j++] = th; - tokens[j++] = ti; + tokens[j+0] = th; + tokens[j+1] = ti; + j += 2; } - }, - - _urlIn: '', - _urlOut: '', - _tokenized: false, - _tokens: [ 0 ], - _validTokenChars: (function() { - var vtc = new Uint8Array(128), - chars = '0123456789%abcdefghijklmnopqrstuvwxyz', - i = chars.length; - while ( i-- ) { - vtc[chars.charCodeAt(i)] = i + 1; - } - return vtc; - })() -}; + } +})(); /******************************************************************************/