mirror of https://github.com/gorhill/uBlock.git
Introduce three more specialized filter classes to avoid regexes
Performance- and memory-related work. Three more filter classes have been created to avoid using regex-based filters internally. FilterWildcard1: its purpose is to enforce filters which have exactly one wildcard in their pattern, a common occurrence. The filter pattern is split into two literal string segments. FilterWildcard1HnAnchored: similar to the above, with the added condition that the filter is hostname-anchored (`||`). FilterWildcard2HnAnchored (the "Wildcard2" variant): a further specialization to enforce filters where the only wildcard is immediately preceded by the `^` special character, again a very common occurrence. Using two literal string segments in lieu of regexes makes it possible to quickly detect a mismatch by just testing the first segment. Additionally, this reduces the memory footprint, as regexes are much more expensive memory-wise than plain strings. These three new filter classes make it possible to replace 5276 regex-based filters internally with plain string-based filters. The often-called isHnAnchored() has been further fine-tuned to avoid as much work as possible. I have also observed that using an arrow function for closure purposes measurably helps performance, as per the built-in benchmark.
This commit is contained in:
parent
dfd6076a5e
commit
99390390fc
|
@ -137,7 +137,7 @@ const µBlock = (function() { // jshint ignore:line
|
|||
|
||||
// Read-only
|
||||
systemSettings: {
|
||||
compiledMagic: 10, // Increase when compiled format changes
|
||||
compiledMagic: 11, // Increase when compiled format changes
|
||||
selfieMagic: 11 // Increase when selfie format changes
|
||||
},
|
||||
|
||||
|
|
|
@ -102,6 +102,8 @@ const typeValueToTypeName = {
|
|||
|
||||
const BlockImportant = BlockAction | Important;
|
||||
|
||||
const reIsWildcarded = /[\^\*]/;
|
||||
|
||||
// ABP filters: https://adblockplus.org/en/filters
|
||||
// regex tester: http://regex101.com/
|
||||
|
||||
|
@ -110,10 +112,39 @@ const BlockImportant = BlockAction | Important;
|
|||
// See the following as short-lived registers, used during evaluation. They are
|
||||
// valid until the next evaluation.
|
||||
|
||||
let pageHostnameRegister = '',
|
||||
requestHostnameRegister = '';
|
||||
//var filterRegister = null;
|
||||
//var categoryRegister = '';
|
||||
let pageHostnameRegister = '';
|
||||
let requestHostnameRegister = '';
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
// First character of match must be within the hostname part of the url.
|
||||
//
|
||||
// https://github.com/gorhill/uBlock/issues/1929
|
||||
// Match only hostname label boundaries.
|
||||
|
||||
const isHnAnchored = (( ) => {
    // Hostname geometry computed by the previous call. It is reused only when
    // the hostname length is unchanged and the current URL still has `://`
    // ending exactly where the cached hostname starts.
    let lastLen = 0, lastBeg = -1, lastEnd = -1;

    /**
     * Tell whether a match starting at `matchStart` begins inside the hostname
     * part of `url`, on a hostname label boundary.
     *
     * The hostname is taken to start right after the first `://` in `url` and
     * to span `requestHostnameRegister.length` characters.
     *
     * @param {string} url - URL being evaluated.
     * @param {number} matchStart - Index in `url` where the match begins.
     * @returns {boolean} true if `matchStart` is the first character of the
     *     hostname, or is inside the hostname and preceded by a `.`.
     */
    return (url, matchStart) => {
        const len = requestHostnameRegister.length;
        if ( len !== lastLen || url.endsWith('://', lastBeg) === false ) {
            lastBeg = len !== 0 ? url.indexOf('://') : -1;
            if ( lastBeg !== -1 ) {
                lastBeg += 3;
                lastEnd = lastBeg + len;
            } else {
                lastEnd = -1;
            }
            // Remember which hostname length the cached offsets belong to.
            // Without this the cache never hits for non-empty hostnames, and
            // stale offsets can be reused when the hostname is empty.
            lastLen = len;
        }
        return matchStart < lastEnd && (
            matchStart === lastBeg ||
            matchStart > lastBeg &&
                url.charCodeAt(matchStart - 1) === 0x2E /* '.' */
        );
    };
})();
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
// Local helpers
|
||||
|
||||
|
@ -204,27 +235,6 @@ const toLogDataInternal = function(categoryBits, tokenHash, filter) {
|
|||
return logData;
|
||||
};
|
||||
|
||||
// First character of match must be within the hostname part of the url.
|
||||
//
|
||||
// https://github.com/gorhill/uBlock/issues/1929
|
||||
// Match only hostname label boundaries.
|
||||
const isHnAnchored = (function() {
    // Hostname and its [beg, end) offsets as found during a previous call.
    let hostname = '';
    let beg = -1, end = -1;

    /**
     * Tell whether a match starting at `matchStart` begins inside the hostname
     * part of `url`, on a hostname label boundary.
     *
     * The hostname is located as the first occurrence of
     * `requestHostnameRegister` in `url`.
     *
     * @param {string} url - URL being evaluated.
     * @param {number} matchStart - Index in `url` where the match begins.
     * @returns {boolean} true if `matchStart` is the first character of the
     *     hostname, or is inside the hostname and preceded by a `.`.
     */
    return function(url, matchStart) {
        const hn = requestHostnameRegister;
        // The cached offsets are only valid if the hostname is unchanged AND
        // still sits at the same offset in this URL: the same hostname under
        // a different scheme starts at a different index.
        if (
            hn !== hostname ||
            beg === -1 ||
            url.startsWith(hn, beg) === false
        ) {
            beg = hn !== '' ? url.indexOf(hn) : -1;
            end = beg !== -1 ? beg + hn.length : -1;
            hostname = hn;
        }
        if ( matchStart < beg || matchStart >= end ) { return false; }
        return matchStart === beg ||
               url.charCodeAt(matchStart - 1) === 0x2E /* '.' */;
    };
})();
|
||||
|
||||
/*******************************************************************************
|
||||
|
||||
Each filter class will register itself in the map. A filter class
|
||||
|
@ -536,6 +546,52 @@ FilterPlainHnAnchored.prototype.trieableId = 1;
|
|||
|
||||
registerFilterClass(FilterPlainHnAnchored);
|
||||
|
||||
/*******************************************************************************
|
||||
|
||||
Filters with only one single occurrence of wildcard `*`
|
||||
|
||||
*/
|
||||
|
||||
const FilterWildcard1 = class {
    /**
     * @param {string} s0 - Literal segment located before the wildcard.
     * @param {string} s1 - Literal segment located after the wildcard.
     */
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    /**
     * @param {string} url - URL to test.
     * @returns {boolean} true if `s0` occurs in `url` and `s1` occurs at or
     *     after the end of the first occurrence of `s0`.
     */
    match(url) {
        const head = url.indexOf(this.s0);
        if ( head === -1 ) { return false; }
        return url.indexOf(this.s1, head + this.s0.length) !== -1;
    }

    /**
     * @returns {object} The raw filter text, its regex form as produced by
     *     `rawToRegexStr`, and the compiled form of this instance.
     */
    logData() {
        const pattern = `${this.s0}*${this.s1}`;
        return {
            raw: pattern,
            regex: rawToRegexStr(pattern, 0),
            compiled: this.compile()
        };
    }

    /** @returns {Array} `[ fid, s0, s1 ]` */
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    /**
     * Compile `details` into a FilterWildcard1 if the pattern is unanchored
     * and contains exactly one `*` and no other `^` or `*`.
     *
     * @param {object} details - Reads `details.anchor` and `details.f`.
     * @returns {Array|undefined} `[ fid, s0, s1 ]`, or undefined when the
     *     pattern does not qualify.
     */
    static compile(details) {
        if ( details.anchor !== 0 ) { return; }
        const pattern = details.f;
        const star = pattern.indexOf('*');
        if ( star === -1 ) { return; }
        const after = pattern.slice(star + 1);
        if ( reIsWildcarded.test(after) ) { return; }
        const before = pattern.slice(0, star);
        if ( reIsWildcarded.test(before) ) { return; }
        return [ FilterWildcard1.fid, before, after ];
    }

    /**
     * @param {Array} args - Compiled form, `[ fid, s0, s1 ]`.
     * @returns {FilterWildcard1}
     */
    static load(args) {
        return new FilterWildcard1(args[1], args[2]);
    }
};
|
||||
|
||||
registerFilterClass(FilterWildcard1);
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
const FilterGeneric = class {
|
||||
|
@ -571,6 +627,8 @@ const FilterGeneric = class {
|
|||
}
|
||||
|
||||
static compile(details) {
|
||||
const compiled = FilterWildcard1.compile(details);
|
||||
if ( compiled !== undefined ) { return compiled; }
|
||||
return [ FilterGeneric.fid, details.f, details.anchor ];
|
||||
}
|
||||
|
||||
|
@ -583,6 +641,117 @@ FilterGeneric.prototype.re = null;
|
|||
|
||||
registerFilterClass(FilterGeneric);
|
||||
|
||||
/*******************************************************************************
|
||||
|
||||
Hostname-anchored filters with only one occurrence of wildcard `*`
|
||||
|
||||
*/
|
||||
|
||||
const FilterWildcard1HnAnchored = class {
    /**
     * @param {string} s0 - Literal segment located before the wildcard.
     * @param {string} s1 - Literal segment located after the wildcard.
     */
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    /**
     * @param {string} url - URL to test.
     * @returns {boolean} true if the first occurrence of `s0` in `url` is
     *     hostname-anchored (per `isHnAnchored`) and `s1` occurs at or after
     *     the end of that occurrence.
     */
    match(url) {
        const pos = url.indexOf(this.s0);
        return pos !== -1 &&
               isHnAnchored(url, pos) &&
               url.indexOf(this.s1, pos + this.s0.length) !== -1;
    }

    /**
     * @returns {object} The raw filter text, its regex form as produced by
     *     `rawToRegexStr`, and the compiled form of this instance.
     */
    logData() {
        return {
            raw: `||${this.s0}*${this.s1}`,
            regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
            compiled: this.compile()
        };
    }

    /** @returns {Array} `[ fid, s0, s1 ]` */
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    /**
     * Compile `details` into a single-wildcard hostname-anchored filter when
     * the pattern contains exactly one `*` and no other wildcard-equivalent
     * character. When that `*` is immediately preceded by `^`, compilation is
     * delegated to FilterWildcard2HnAnchored.
     *
     * @param {object} details - Reads `details.anchor` and `details.f`.
     * @returns {Array|undefined} The compiled filter, or undefined when the
     *     pattern does not qualify.
     */
    static compile(details) {
        // Reject when bit 0 of the anchor is set. This was written as
        // `0x0b001`, a hex literal (0xB001), where the binary literal `0b001`
        // was intended.
        if ( (details.anchor & 0b001) !== 0 ) { return; }
        const s = details.f;
        let pos = s.indexOf('*');
        if ( pos === -1 ) { return; }
        if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
        const needSeparator =
            pos !== 0 && s.charCodeAt(pos - 1) === 0x5E /* '^' */;
        // Exclude the `^` from the leading segment before validating it.
        if ( needSeparator ) { pos -= 1; }
        if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
        if ( needSeparator ) {
            return FilterWildcard2HnAnchored.compile(details, pos);
        }
        return [
            FilterWildcard1HnAnchored.fid,
            s.slice(0, pos),
            s.slice(pos + 1),
        ];
    }

    /**
     * @param {Array} args - Compiled form, `[ fid, s0, s1 ]`.
     * @returns {FilterWildcard1HnAnchored}
     */
    static load(args) {
        return new FilterWildcard1HnAnchored(args[1], args[2]);
    }
};
|
||||
|
||||
registerFilterClass(FilterWildcard1HnAnchored);
|
||||
|
||||
/*******************************************************************************
|
||||
|
||||
Hostname-anchored filters with one occurrence of the wildcard
|
||||
sequence `^*` and no other wildcard-equivalent character
|
||||
|
||||
*/
|
||||
|
||||
const FilterWildcard2HnAnchored = class {
    /**
     * @param {string} s0 - Literal segment located before the `^*` sequence.
     * @param {string} s1 - Literal segment located after the `^*` sequence.
     */
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    /**
     * Test `url` against `||s0^*s1`.
     *
     * An occurrence of `s0` qualifies when it is hostname-anchored (per
     * `isHnAnchored`), the character right after it is a separator (matched
     * by `reSeparators`), and `s1` occurs somewhere after that separator.
     * The separator is checked at the position of the `^` itself rather than
     * anywhere between the two segments, so `||example.com^*ads` no longer
     * matches `http://example.community/x/ads`.
     *
     * @param {string} url - URL to test.
     * @returns {boolean}
     */
    match(url) {
        const s0 = this.s0;
        let pos0 = url.indexOf(s0);
        while ( pos0 !== -1 ) {
            const pos1 = pos0 + s0.length;
            if (
                isHnAnchored(url, pos0) &&
                this.reSeparators.test(url.charAt(pos1)) &&
                url.indexOf(this.s1, pos1 + 1) !== -1
            ) {
                return true;
            }
            // An empty `s0` is found at every index: do not rescan.
            if ( s0.length === 0 ) { break; }
            // This occurrence failed; a later one may still qualify.
            pos0 = url.indexOf(s0, pos0 + 1);
        }
        return false;
    }

    /**
     * @returns {object} The raw filter text, its regex form as produced by
     *     `rawToRegexStr`, and the compiled form of this instance.
     */
    logData() {
        return {
            raw: `||${this.s0}^*${this.s1}`,
            regex: rawToRegexStr(`${this.s0}^*${this.s1}`, 0),
            compiled: this.compile()
        };
    }

    /** @returns {Array} `[ fid, s0, s1 ]` */
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    /**
     * @param {object} details - Reads `details.f`, the filter pattern.
     * @param {number} pos - Index of the `^` in `details.f`; the two
     *     characters at `pos` and `pos + 1` are dropped from the segments.
     * @returns {Array} `[ fid, s0, s1 ]`
     */
    static compile(details, pos) {
        return [
            FilterWildcard2HnAnchored.fid,
            details.f.slice(0, pos),
            details.f.slice(pos + 2),
        ];
    }

    /**
     * @param {Array} args - Compiled form, `[ fid, s0, s1 ]`.
     * @returns {FilterWildcard2HnAnchored}
     */
    static load(args) {
        return new FilterWildcard2HnAnchored(args[1], args[2]);
    }
};

// Matches a single character which is considered a separator.
FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
|
||||
|
||||
registerFilterClass(FilterWildcard2HnAnchored);
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
const FilterGenericHnAnchored = class {
|
||||
|
@ -610,6 +779,8 @@ const FilterGenericHnAnchored = class {
|
|||
}
|
||||
|
||||
static compile(details) {
|
||||
const compiled = FilterWildcard1HnAnchored.compile(details);
|
||||
if ( compiled !== undefined ) { return compiled; }
|
||||
return [ FilterGenericHnAnchored.fid, details.f ];
|
||||
}
|
||||
|
||||
|
@ -1377,7 +1548,10 @@ const FilterBucket = class {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
if ( this.plainHnAnchoredTrie !== null && isHnAnchored(url, tokenBeg) ) {
|
||||
if (
|
||||
this.plainHnAnchoredTrie !== null &&
|
||||
isHnAnchored(url, tokenBeg)
|
||||
) {
|
||||
const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg);
|
||||
if ( pos !== -1 ) {
|
||||
this.plainHnAnchoredFilter.s = url.slice(tokenBeg, pos);
|
||||
|
@ -1524,7 +1698,6 @@ const FilterParser = function() {
|
|||
this.reHasUnicode = /[^\x00-\x7F]/;
|
||||
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
|
||||
this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/;
|
||||
this.reIsWildcarded = /[\^\*]/;
|
||||
this.domainOpt = '';
|
||||
this.noTokenHash = µb.urlTokenizer.noTokenHash;
|
||||
this.unsupportedTypeBit = this.bitFromType('unsupported');
|
||||
|
@ -1917,7 +2090,7 @@ FilterParser.prototype.parse = function(raw) {
|
|||
this.anchor = 0;
|
||||
}
|
||||
|
||||
this.wildcarded = this.reIsWildcarded.test(s);
|
||||
this.wildcarded = reIsWildcarded.test(s);
|
||||
|
||||
// This might look weird but we gain memory footprint by not going through
|
||||
// toLowerCase(), at least on Chromium. Because copy-on-write?
|
||||
|
@ -2985,6 +3158,36 @@ FilterContainer.prototype.bucketHistogram = function() {
|
|||
- FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates
|
||||
for storing in a plain string trie.
|
||||
|
||||
As of 2019-04-25:
|
||||
|
||||
{"FilterPlainHnAnchored" => 11078}
|
||||
{"FilterPlainPrefix1" => 7195}
|
||||
{"FilterPrefix1Trie" => 5720}
|
||||
{"FilterOriginHit" => 3561}
|
||||
{"FilterWildcard2HnAnchored" => 2943}
|
||||
{"FilterPair" => 2391}
|
||||
{"FilterBucket" => 1922}
|
||||
{"FilterWildcard1HnAnchored" => 1910}
|
||||
{"FilterHnAnchoredTrie" => 1586}
|
||||
{"FilterPlainHostname" => 1391}
|
||||
{"FilterOriginHitSet" => 1155}
|
||||
{"FilterPlain" => 634}
|
||||
{"FilterWildcard1" => 423}
|
||||
{"FilterGenericHnAnchored" => 389}
|
||||
{"FilterOriginMiss" => 302}
|
||||
{"FilterGeneric" => 163}
|
||||
{"FilterOriginMissSet" => 150}
|
||||
{"FilterRegex" => 124}
|
||||
{"FilterPlainRightAnchored" => 110}
|
||||
{"FilterGenericHnAndRightAnchored" => 95}
|
||||
{"FilterHostnameDict" => 59}
|
||||
{"FilterPlainLeftAnchored" => 30}
|
||||
{"FilterJustOrigin" => 22}
|
||||
{"FilterHTTPJustOrigin" => 19}
|
||||
{"FilterHTTPSJustOrigin" => 18}
|
||||
{"FilterExactMatch" => 5}
|
||||
{"FilterOriginMixedSet" => 3}
|
||||
|
||||
*/
|
||||
|
||||
FilterContainer.prototype.filterClassHistogram = function() {
|
||||
|
|
Loading…
Reference in New Issue