Coalesce origin hit filters into their own bucket

Performance-related work.

There is a fair number of filters which can't be tokenized
in uBO's own filter lists. The majority of those filters also
declare a `domain=` option, for example:

    *$script,redirect-rule=noopjs,domain=...
    *$script,3p,domain=...,denyallow=...
    *$frame,3p,domain=...

Such filters can be found in uBO's asset viewer using the
following search expression:

    /^\*?\$[^\n]*?domain=/

Some filter buckets will contain many of those filters; for
instance, one of the buckets holding untokenizable `redirect=`
filters has over 170 entries, all of which must be visited when
collating all matching `redirect=` filters.

When a bucket contains many such filters, I found that it's
worth extracting all the non-negated hostname values from the
`domain=` options into a single hntrie and performing a pre-test
at match() time to find out whether the current origin of a
network request matches any one of the collected hostnames,
so as to avoid iterating through all the filters.

Since, for the vast majority of network requests, there is
rarely a match() against filters with a `domain=` option, this
pre-test saves a good amount of work, and this is measurable
with the built-in benchmark.
This commit is contained in:
Raymond Hill 2020-11-06 12:04:03 -05:00
parent fe2c4a4914
commit b265f2644d
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
2 changed files with 228 additions and 94 deletions

View File

@ -139,8 +139,8 @@ const µBlock = (( ) => { // jshint ignore:line
// Read-only // Read-only
systemSettings: { systemSettings: {
compiledMagic: 31, // Increase when compiled format changes compiledMagic: 32, // Increase when compiled format changes
selfieMagic: 31, // Increase when selfie format changes selfieMagic: 32, // Increase when selfie format changes
}, },
// https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501 // https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501

View File

@ -358,6 +358,12 @@ const filterFromCtor = function(ctor, ...args) {
return iunit; return iunit;
}; };
// Appends an already-instantiated filter object `f` to the filterUnits array
// and returns the index at which it was stored.
const filterUnitFromFilter = function(f) {
    // push() returns the new length, hence the stored index is length - 1.
    return filterUnits.push(f) - 1;
};
const filterUnitFromCompiled = function(args) { const filterUnitFromCompiled = function(args) {
const ctor = filterClasses[args[0]]; const ctor = filterClasses[args[0]];
const keygen = ctor.keyFromArgs; const keygen = ctor.keyFromArgs;
@ -1237,6 +1243,10 @@ const FilterOriginHit = class {
this.n = n; this.n = n;
} }
// Value of the `domain=` option represented by this unit: the result of
// filterOrigin.trieContainer.extractHostname() for the (i, n) pair stored on
// this instance. logData() pushes this value into `details.domains`.
get domainOpt() {
return filterOrigin.trieContainer.extractHostname(this.i, this.n);
}
match() { match() {
return filterOrigin.trieContainer.matchesHostname( return filterOrigin.trieContainer.matchesHostname(
$docHostname, $docHostname,
@ -1250,11 +1260,7 @@ const FilterOriginHit = class {
} }
logData(details) { logData(details) {
details.domains.push(this.getHostname()); details.domains.push(this.domainOpt);
}
getHostname() {
return filterOrigin.trieContainer.extractHostname(this.i, this.n);
} }
static compile(hostname) { static compile(hostname) {
@ -1273,6 +1279,8 @@ const FilterOriginHit = class {
} }
}; };
FilterOriginHit.prototype.hasOriginHit = true;
registerFilterClass(FilterOriginHit); registerFilterClass(FilterOriginHit);
/******************************************************************************/ /******************************************************************************/
@ -1283,7 +1291,7 @@ const FilterOriginMiss = class extends FilterOriginHit {
} }
logData(details) { logData(details) {
details.domains.push(`~${this.getHostname()}`); details.domains.push(`~${this.domainOpt}`);
} }
static compile(hostname) { static compile(hostname) {
@ -1302,6 +1310,8 @@ const FilterOriginMiss = class extends FilterOriginHit {
} }
}; };
FilterOriginMiss.prototype.hasOriginHit = false;
registerFilterClass(FilterOriginMiss); registerFilterClass(FilterOriginMiss);
/******************************************************************************/ /******************************************************************************/
@ -1354,6 +1364,8 @@ const FilterOriginHitSet = class {
} }
}; };
FilterOriginHitSet.prototype.hasOriginHit = true;
registerFilterClass(FilterOriginHitSet); registerFilterClass(FilterOriginHitSet);
/******************************************************************************/ /******************************************************************************/
@ -1386,6 +1398,8 @@ const FilterOriginMissSet = class extends FilterOriginHitSet {
} }
}; };
FilterOriginMissSet.prototype.hasOriginHit = false;
registerFilterClass(FilterOriginMissSet); registerFilterClass(FilterOriginMissSet);
/******************************************************************************/ /******************************************************************************/
@ -1556,7 +1570,7 @@ const FilterModifierResult = class {
const FilterCollection = class { const FilterCollection = class {
constructor(i = 0) { constructor(i = 0) {
this.i = i | 0; this.i = i;
} }
get size() { get size() {
@ -1570,9 +1584,11 @@ const FilterCollection = class {
this.i = filterSequenceAdd(iunit, j); this.i = filterSequenceAdd(iunit, j);
} }
shift() { shift(drop = false) {
const sequences = filterSequences; const sequences = filterSequences;
if ( drop ) {
filterUnits[sequences[this.i+0]] = null; filterUnits[sequences[this.i+0]] = null;
}
this.i = sequences[this.i+1]; this.i = sequences[this.i+1];
} }
@ -1618,8 +1634,9 @@ const FilterCollection = class {
return new ctor(i0, args[1].length); return new ctor(i0, args[1].length);
} }
static fromSelfie(ctor, args) { static fromSelfie(args, bucket) {
return new ctor(args[1]); bucket.i = args[1];
return bucket;
} }
}; };
@ -1645,8 +1662,11 @@ const FilterCompositeAny = class extends FilterCollection {
return FilterCollection.fromCompiled(FilterCompositeAny, args); return FilterCollection.fromCompiled(FilterCompositeAny, args);
} }
static fromSelfie(args) { static fromSelfie(args, bucket) {
return FilterCollection.fromSelfie(FilterCompositeAny, args); if ( bucket === undefined ) {
bucket = new FilterCompositeAny();
}
return super.fromSelfie(args, bucket);
} }
}; };
@ -1694,9 +1714,26 @@ const FilterCompositeAll = class extends FilterCollection {
return filterUnits[filterSequences[this.i]].isBidiTrieable === true; return filterUnits[filterSequences[this.i]].isBidiTrieable === true;
} }
// Reports whether one of the units in this collection has
// `hasOriginHit === true`.
// NOTE(review): relies on this.forEach() stopping at, and returning, the
// first non-undefined value returned by the callback -- forEach() is not
// visible here, confirm. When no unit qualifies, the callback never returns a
// value, so the getter yields whatever forEach() returns in that case
// (presumably undefined rather than false).
get hasOriginHit() {
return this.forEach(iunit => {
if ( filterUnits[iunit].hasOriginHit === true ) {
return true;
}
});
}
// Returns the `domainOpt` value of a unit in this collection for which
// `hasOriginHit === true`.
// NOTE(review): assumes this.forEach() returns the first non-undefined
// callback result, i.e. the first qualifying unit wins -- forEach() is not
// visible here, confirm. If no unit qualifies, the result is whatever
// forEach() returns when the callback never returns a value.
get domainOpt() {
return this.forEach(iunit => {
const f = filterUnits[iunit];
if ( f.hasOriginHit === true ) {
return f.domainOpt;
}
});
}
toBidiTrie() { toBidiTrie() {
const details = filterUnits[filterSequences[this.i]].toBidiTrie(); const details = filterUnits[filterSequences[this.i]].toBidiTrie();
this.shift(); this.shift(true);
return details; return details;
} }
@ -1708,8 +1745,11 @@ const FilterCompositeAll = class extends FilterCollection {
return FilterCollection.fromCompiled(FilterCompositeAll, args); return FilterCollection.fromCompiled(FilterCompositeAll, args);
} }
static fromSelfie(args) { static fromSelfie(args, bucket) {
return FilterCollection.fromSelfie(FilterCompositeAll, args); if ( bucket === undefined ) {
bucket = new FilterCompositeAll();
}
return super.fromSelfie(args, bucket);
} }
}; };
@ -1925,7 +1965,10 @@ registerFilterClass(FilterHTTPJustOrigin);
const FilterPlainTrie = class { const FilterPlainTrie = class {
constructor(trie) { constructor(trie) {
this.plainTrie = trie; this.plainTrie = trie !== undefined
? trie
: bidiTrie.createOne();
this.$matchedUnit = 0;
} }
match() { match() {
@ -1949,6 +1992,37 @@ const FilterPlainTrie = class {
} }
} }
// Moves the pattern of the filter unit at index `iunit` into this.plainTrie.
//
// The unit's toBidiTrie() supplies the (i, n, itok) triple handed to
// plainTrie.add(), which returns an id used to read/write the "extra" value
// attached to the stored pattern. As used below, an extra value of 1 marks a
// pattern stored with no condition; any other value is what
// filterSequenceAdd() returned for the unit(s) holding the conditions.
//
// iunit: index into filterUnits of the unit to add.
// Returns nothing; filterUnits and the trie are modified in place.
addUnitToTrie(iunit) {
const f = filterUnits[iunit];
const trieDetails = f.toBidiTrie();
const id = this.plainTrie.add(
trieDetails.i,
trieDetails.n,
trieDetails.itok
);
// No point storing a pattern with conditions if the bidi-trie already
// contain a pattern with no conditions.
const ix = this.plainTrie.getExtra(id);
if ( ix === 1 ) {
// The unit is no longer needed: clear its slot in filterUnits.
filterUnits[iunit] = null;
return;
}
// If the newly stored pattern has no condition, short-circuit existing
// ones since they will always be short-circuited by the condition-less
// pattern.
if ( f instanceof FilterPatternPlain ) {
this.plainTrie.setExtra(id, 1);
filterUnits[iunit] = null;
return;
}
// FilterCompositeAll is assumed here, i.e. with conditions.
// NOTE(review): `f.n === 1` presumably means a single unit is left in the
// composite; in that case the composite's slot is cleared and the unit
// referenced at filterSequences[f.i] is used in its place -- confirm.
if ( f.n === 1 ) {
filterUnits[iunit] = null;
iunit = filterSequences[f.i];
}
// Link the unit to the previous extra value `ix` through a new sequence
// entry, and store that entry as the pattern's extra value.
this.plainTrie.setExtra(id, filterSequenceAdd(iunit, ix));
}
toSelfie() { toSelfie() {
return [ this.fid, bidiTrie.compileOne(this.plainTrie) ]; return [ this.fid, bidiTrie.compileOne(this.plainTrie) ];
} }
@ -1958,27 +2032,27 @@ const FilterPlainTrie = class {
} }
}; };
FilterPlainTrie.prototype.$matchedUnit = 0;
registerFilterClass(FilterPlainTrie); registerFilterClass(FilterPlainTrie);
/******************************************************************************/ /******************************************************************************/
const FilterBucket = class extends FilterCollection { const FilterBucket = class extends FilterCollection {
// n: initial value of the entry counter kept by this bucket (default 0).
// The base constructor is called with no argument, so the collection itself
// starts from the base class's default `i`.
constructor(n = 0) {
super();
this.n = n;
// Set by match() to the unit which matched; 0 until then.
this.$matchedUnit = 0;
}
// Number of entries in this bucket, read from the `n` counter maintained by
// the constructor, unshift() and shift() rather than computed on demand.
get size() {
return this.n;
}
match() { match() {
if ( this.plainTrie !== null ) {
if ( this.plainTrie.matches($tokenBeg, this) !== 0 ) {
this.$matchedTrie = true;
this.$matchedUnit = this.plainTrie.$iu;
return true;
}
}
const sequences = filterSequences; const sequences = filterSequences;
const units = filterUnits; const units = filterUnits;
let i = this.i; let i = this.i;
while ( i !== 0 ) { while ( i !== 0 ) {
if ( units[sequences[i+0]].match() ) { if ( units[sequences[i+0]].match() ) {
this.$matchedTrie = false;
this.$matchedUnit = sequences[i+0]; this.$matchedUnit = sequences[i+0];
return true; return true;
} }
@ -1998,108 +2072,154 @@ const FilterBucket = class extends FilterCollection {
} }
} }
// Adds the filter unit index `iunit` through the base class's unshift() and
// increments the `n` counter accordingly.
unshift(iunit) {
super.unshift(iunit);
this.n += 1;
}
shift() {
super.shift();
this.n -= 1;
}
logData(details) { logData(details) {
if ( this.$matchedTrie ) {
const s = $requestURL.slice(this.plainTrie.$l, this.plainTrie.$r);
details.pattern.push(s);
details.regex.push(restrFromPlainPattern(s));
}
if ( this.$matchedUnit !== -1 ) {
filterUnits[this.$matchedUnit].logData(details); filterUnits[this.$matchedUnit].logData(details);
} }
}
toSelfie() { toSelfie() {
const selfie = super.toSelfie(); return [ this.fid, this.n, super.toSelfie() ];
if ( this.plainTrie !== null ) {
selfie.push(bidiTrie.compileOne(this.plainTrie));
} }
return selfie;
static fromSelfie(args, bucket) {
if ( bucket === undefined ) {
bucket = new FilterBucket(args[1]);
}
return super.fromSelfie(args[2], bucket);
} }
optimize() { optimize() {
if ( this.n >= 3 ) {
const f = this.optimizePatternTests();
if ( f !== undefined ) {
if ( this.i === 0 ) { return f; }
this.unshift(filterUnitFromFilter(f));
}
}
if ( this.n >= 10 ) {
const f = this.optimizeOriginHitTests();
if ( f !== undefined ) {
if ( this.i === 0 ) { return f; }
this.unshift(filterUnitFromFilter(f));
}
}
}
optimizePatternTests() {
const units = filterUnits; const units = filterUnits;
const sequences = filterSequences;
let n = 0; let n = 0;
let i = this.i; let i = this.i;
do { do {
if ( units[filterSequences[i+0]].isBidiTrieable ) { n += 1; } if ( units[sequences[i+0]].isBidiTrieable ) { n += 1; }
i = filterSequences[i+1]; i = sequences[i+1];
} while ( i !== 0 && n < 3 ); } while ( i !== 0 && n < 3 );
if ( n < 3 ) { return; } if ( n < 3 ) { return; }
if ( this.plainTrie === null ) { const ftrie = new FilterPlainTrie();
this.plainTrie = bidiTrie.createOne();
}
i = this.i; i = this.i;
let iprev = 0; let iprev = 0;
for (;;) { for (;;) {
const iunit = filterSequences[i+0]; const iunit = sequences[i+0];
const inext = filterSequences[i+1]; const inext = sequences[i+1];
if ( units[iunit].isBidiTrieable ) { if ( units[iunit].isBidiTrieable ) {
this._addToTrie(iunit); ftrie.addUnitToTrie(iunit);
if ( iprev !== 0 ) { if ( iprev !== 0 ) {
filterSequences[iprev+1] = inext; sequences[iprev+1] = inext;
} else { } else {
this.i = inext; this.i = inext;
} }
this.n -= 1;
} else { } else {
iprev = i; iprev = i;
} }
if ( inext === 0 ) { break; } if ( inext === 0 ) { break; }
i = inext; i = inext;
} }
if ( this.i === 0 ) { return ftrie;
return new FilterPlainTrie(this.plainTrie);
}
} }
_addToTrie(iunit) { optimizeOriginHitTests() {
const f = filterUnits[iunit]; const units = filterUnits;
const trieDetails = f.toBidiTrie(); let candidateCount = -10;
const id = this.plainTrie.add( const shouldPreTest = this.forEach(iunit => {
trieDetails.i, if ( units[iunit].hasOriginHit !== true ) { return; }
trieDetails.n, candidateCount += 1;
trieDetails.itok if ( candidateCount === 0 ) { return true; }
); });
// No point storing a pattern with conditions if the bidi-trie already if ( shouldPreTest !== true ) { return; }
// contain a pattern with no conditions. const sequences = filterSequences;
let ix = this.plainTrie.getExtra(id); const bucket = new FilterBucketOfOriginHits();
if ( ix === 1 ) { const domainOpts = [];
filterUnits[iunit] = null; let i = this.i;
return; let iprev = 0;
for (;;) {
const iunit = sequences[i+0];
const inext = sequences[i+1];
const f = units[iunit];
if ( f.hasOriginHit ) {
domainOpts.push(f.domainOpt);
bucket.unshift(iunit);
if ( iprev !== 0 ) {
sequences[iprev+1] = inext;
} else {
this.i = inext;
} }
// If the newly stored pattern has no condition, shortcut existing this.n -= 1;
// ones since they will always be short-circuited by the } else {
// condition-less pattern. iprev = i;
if ( f instanceof FilterPatternPlain ) {
this.plainTrie.setExtra(id, 1);
filterUnits[iunit] = null;
return;
} }
// FilterCompositeAll is assumed here, i.e. with conditions. if ( inext === 0 ) { break; }
if ( f.n === 1 ) { i = inext;
filterUnits[iunit] = null;
iunit = filterSequences[f.i];
}
this.plainTrie.setExtra(id, filterSequenceAdd(iunit, ix));
}
static fromSelfie(args) {
const bucket = FilterCollection.fromSelfie(FilterBucket, args);
if ( args.length > 2 && Array.isArray(args[2]) ) {
bucket.plainTrie = bidiTrie.createOne(args[2]);
} }
bucket.originTestUnit =
filterFromCtor(FilterOriginHitSet, domainOpts.join('|'));
return bucket; return bucket;
} }
}; };
FilterBucket.prototype.plainTrie = null;
FilterBucket.prototype.$matchedUnit = 0;
FilterBucket.prototype.$matchedTrie = false;
registerFilterClass(FilterBucket); registerFilterClass(FilterBucket);
/******************************************************************************/ /******************************************************************************/
// A FilterBucket guarded by a single origin pre-test: the bucket's own
// filters are evaluated only when the unit referenced by `originTestUnit`
// matches first.
const FilterBucketOfOriginHits = class extends FilterBucket {
// i: index into filterUnits of the unit used as the pre-test (default 0).
constructor(i = 0) {
super();
this.originTestUnit = i;
}
// Evaluates the pre-test first; the bucket's filters are visited only when
// the pre-test matches (short-circuit `&&`).
match() {
return filterUnits[this.originTestUnit].match() && super.match();
}
// Same gating as match(): the base class's matchAndFetchModifiers() is
// called only when the pre-test matches.
matchAndFetchModifiers(env) {
if ( filterUnits[this.originTestUnit].match() ) {
super.matchAndFetchModifiers(env);
}
}
// Serialized as [ fid, originTestUnit, <FilterBucket selfie> ].
toSelfie() {
return [ this.fid, this.originTestUnit, super.toSelfie() ];
}
// Inverse of toSelfie(): args[1] is the pre-test unit index, args[2] the
// FilterBucket selfie, which the base class restores into the instance
// created here.
static fromSelfie(args) {
const bucket = new FilterBucketOfOriginHits(args[1]);
return super.fromSelfie(args[2], bucket);
}
};
registerFilterClass(FilterBucketOfOriginHits);
/******************************************************************************/
const FILTER_UNITS_MIN = filterUnits.length; const FILTER_UNITS_MIN = filterUnits.length;
const FILTER_SEQUENCES_MIN = filterSequenceWritePtr; const FILTER_SEQUENCES_MIN = filterSequenceWritePtr;
@ -2438,7 +2558,13 @@ const FilterParser = class {
this.typeBits &= ~this.notTypes; this.typeBits &= ~this.notTypes;
if ( this.typeBits === 0 ) { return false; } if ( this.typeBits === 0 ) { return false; }
} }
// CSP directives implicitly apply only to document/subdocument.
if ( this.modifyType === parser.OPTTokenCsp ) {
if ( this.typeBits === 0 ) {
this.parseTypeOption(parser.OPTTokenDoc, false);
this.parseTypeOption(parser.OPTTokenFrame, false);
}
}
// https://github.com/gorhill/uBlock/issues/2283 // https://github.com/gorhill/uBlock/issues/2283
// Abort if type is only for unsupported types, otherwise // Abort if type is only for unsupported types, otherwise
// toggle off `unsupported` bit. // toggle off `unsupported` bit.
@ -2596,7 +2722,7 @@ const FilterParser = class {
) { ) {
continue; continue;
} }
if ( token.startsWith('b') ) { if ( token.charCodeAt(0) === 0x62 /* 'b' */ ) {
const match = /\\+$/.exec(prefix); const match = /\\+$/.exec(prefix);
if ( match !== null && (match[0].length & 1) !== 0 ) { if ( match !== null && (match[0].length & 1) !== 0 ) {
prefix += 'b'; prefix += 'b';
@ -3212,6 +3338,13 @@ FilterContainer.prototype.matchAndFetchModifiers = function(
? this.categories.get(catBits11) ? this.categories.get(catBits11)
: undefined; : undefined;
if (
bucket00 === undefined && bucket01 === undefined &&
bucket10 === undefined && bucket11 === undefined
) {
return;
}
const results = []; const results = [];
const env = { const env = {
modifier: vAPI.StaticFilteringParser.netOptionTokenIds.get(modifierType) || 0, modifier: vAPI.StaticFilteringParser.netOptionTokenIds.get(modifierType) || 0,
@ -3763,6 +3896,7 @@ FilterContainer.prototype.benchmark = async function(action, target) {
fctxt.setURL(request.url); fctxt.setURL(request.url);
fctxt.setDocOriginFromURL(request.frameUrl); fctxt.setDocOriginFromURL(request.frameUrl);
fctxt.setType(request.cpt); fctxt.setType(request.cpt);
this.redirectURL = undefined;
const r = this.matchString(fctxt); const r = this.matchString(fctxt);
matchCount += 1; matchCount += 1;
if ( recorded !== undefined ) { recorded.push(r); } if ( recorded !== undefined ) { recorded.push(r); }