Introduce three more specialized filter classes to avoid regexes

Performance- and memory-related work. Three more classes have
been created to avoid regex-based filters internally.

FilterWildcard1: its purpose is to enforce filters which have exactly
one wildcard in their pattern, a common occurrence. The filter
pattern is split into two literal string segments.

FilterWildcard1HnAnchored and FilterWildcard2HnAnchored: similar to
the above, with the added condition that the filter is
hostname-anchored (`||`). The "Wildcard2" variant is a further
specialization to enforce filters where the only wildcard
is immediately preceded by the `^` special character, again
a very common occurrence.

Using two literal string segments in lieu of regexes makes it
possible to quickly detect a mismatch by testing just the first
segment. Additionally, this reduces the memory footprint, as regexes
are much more expensive memory-wise than plain strings.

These three new filter classes make it possible to replace
5276 regex-based filters internally with plain string-based
filters.

The often-called isHnAnchored() has been further fine-tuned to
avoid as much work as possible. I have also observed that
using an arrow function for closure purposes measurably helps
performance, as per the built-in benchmark.
This commit is contained in:
Raymond Hill 2019-04-25 17:48:08 -04:00
parent dfd6076a5e
commit 99390390fc
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
2 changed files with 232 additions and 29 deletions

View File

@ -137,7 +137,7 @@ const µBlock = (function() { // jshint ignore:line
// Read-only
systemSettings: {
compiledMagic: 10, // Increase when compiled format changes
compiledMagic: 11, // Increase when compiled format changes
selfieMagic: 11 // Increase when selfie format changes
},

View File

@ -102,6 +102,8 @@ const typeValueToTypeName = {
const BlockImportant = BlockAction | Important;
const reIsWildcarded = /[\^\*]/;
// ABP filters: https://adblockplus.org/en/filters
// regex tester: http://regex101.com/
@ -110,10 +112,39 @@ const BlockImportant = BlockAction | Important;
// See the following as short-lived registers, used during evaluation. They are
// valid until the next evaluation.
let pageHostnameRegister = '',
requestHostnameRegister = '';
//var filterRegister = null;
//var categoryRegister = '';
let pageHostnameRegister = '';
let requestHostnameRegister = '';
/******************************************************************************/
// First character of match must be within the hostname part of the url.
//
// https://github.com/gorhill/uBlock/issues/1929
// Match only hostname label boundaries.
// Tells whether `matchStart` is a valid hostname-anchored position in `url`:
// it must fall inside the hostname part of the URL, and sit on a hostname
// label boundary (first character of the hostname, or right after a '.').
//
// The hostname is taken to start right after the first '://' in `url` and to
// span `requestHostnameRegister.length` characters.
//
// The position of the hostname is cached across calls. The cache is reused
// as long as the hostname length is unchanged and `url` still has '://'
// ending at the cached hostname start, which avoids rescanning the URL on
// every call for the same request.
const isHnAnchored = (( ) => {
    // lastLen: hostname length the cached offsets were computed for
    // lastBeg: index of the first hostname character, or -1 if unknown
    // lastEnd: index one past the last hostname character, or -1 if unknown
    let lastLen = 0, lastBeg = -1, lastEnd = -1;

    return (url, matchStart) => {
        const len = requestHostnameRegister.length;
        if ( len !== lastLen || url.endsWith('://', lastBeg) === false ) {
            // Remember which hostname length the offsets belong to, otherwise
            // the cache could never be considered valid on the next call.
            lastLen = len;
            lastBeg = len !== 0 ? url.indexOf('://') : -1;
            if ( lastBeg !== -1 ) {
                lastBeg += 3;
                lastEnd = lastBeg + len;
            } else {
                lastEnd = -1;
            }
        }
        // When no hostname could be located, lastEnd is -1 and the test
        // below fails for any non-negative matchStart.
        return matchStart < lastEnd && (
            matchStart === lastBeg ||
            matchStart > lastBeg &&
                url.charCodeAt(matchStart - 1) === 0x2E /* '.' */
        );
    };
})();
/******************************************************************************/
// Local helpers
@ -204,27 +235,6 @@ const toLogDataInternal = function(categoryBits, tokenHash, filter) {
return logData;
};
// First character of match must be within the hostname part of the url.
//
// https://github.com/gorhill/uBlock/issues/1929
// Match only hostname label boundaries.
// Tells whether `matchStart` falls within the hostname part of `url`, on a
// hostname label boundary: either the first character of the hostname, or a
// character immediately preceded by '.'.
//
// The hostname is read from `requestHostnameRegister` and located in `url`
// with a plain substring search. Its offsets are cached between calls.
const isHnAnchored = (function() {
    // hostname: hostname the cached offsets were computed for
    // beg/end: [beg, end) span of the hostname in the URL, or -1 if unknown
    let hostname = '';
    let beg = -1, end = -1;

    return function(url, matchStart) {
        const hn = requestHostnameRegister;
        // Refresh the cache when the hostname changed, and also when the
        // hostname is no longer found at the cached offset: the same
        // hostname may sit at a different position in another URL (for
        // example 'http://' vs 'https://').
        if ( hn !== hostname || url.startsWith(hn, beg) === false ) {
            beg = hn !== '' ? url.indexOf(hn) : -1;
            end = beg !== -1 ? beg + hn.length : -1;
            hostname = hn;
        }
        if ( matchStart < beg || matchStart >= end ) { return false; }
        return matchStart === beg ||
               url.charCodeAt(matchStart - 1) === 0x2E /* '.' */;
    };
})();
/*******************************************************************************
Each filter class will register itself in the map. A filter class
@ -536,6 +546,52 @@ FilterPlainHnAnchored.prototype.trieableId = 1;
registerFilterClass(FilterPlainHnAnchored);
/*******************************************************************************
Filters with only one single occurrence of wildcard `*`
*/
// Unanchored filter whose pattern contains exactly one `*` wildcard and no
// other wildcard-like character. The pattern is stored as the two literal
// segments found on each side of the `*`.
const FilterWildcard1 = class {
    // s0: literal segment before the wildcard
    // s1: literal segment after the wildcard
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    // A URL matches when the first segment is present, and the second
    // segment is found anywhere past the end of the first one.
    match(url) {
        const head = this.s0;
        const headPos = url.indexOf(head);
        if ( headPos === -1 ) { return false; }
        return url.indexOf(this.s1, headPos + head.length) !== -1;
    }

    // Details for the logger: raw filter, regex equivalent, compiled form.
    logData() {
        const raw = `${this.s0}*${this.s1}`;
        const regex = rawToRegexStr(`${this.s0}*${this.s1}`, 0);
        return { raw, regex, compiled: this.compile() };
    }

    // Serialized form of this instance: [ class id, segment 0, segment 1 ].
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    // Returns the serialized form for `details` when the filter qualifies
    // for this class, `undefined` otherwise. A filter qualifies when it has
    // no anchor and its pattern holds a single `*` with no other character
    // matched by `reIsWildcarded` on either side.
    static compile(details) {
        if ( details.anchor !== 0 ) { return; }
        const pattern = details.f;
        const starPos = pattern.indexOf('*');
        if ( starPos === -1 ) { return; }
        const tail = pattern.slice(starPos + 1);
        if ( reIsWildcarded.test(tail) ) { return; }
        const head = pattern.slice(0, starPos);
        if ( reIsWildcarded.test(head) ) { return; }
        return [ FilterWildcard1.fid, head, tail ];
    }

    // Rebuilds an instance from its serialized form.
    static load(args) {
        const [ , s0, s1 ] = args;
        return new FilterWildcard1(s0, s1);
    }
};
registerFilterClass(FilterWildcard1);
/******************************************************************************/
const FilterGeneric = class {
@ -571,6 +627,8 @@ const FilterGeneric = class {
}
static compile(details) {
const compiled = FilterWildcard1.compile(details);
if ( compiled !== undefined ) { return compiled; }
return [ FilterGeneric.fid, details.f, details.anchor ];
}
@ -583,6 +641,117 @@ FilterGeneric.prototype.re = null;
registerFilterClass(FilterGeneric);
/*******************************************************************************
Hostname-anchored filters with only one occurrence of wildcard `*`
*/
// Hostname-anchored (`||`) filter whose pattern contains exactly one `*`
// wildcard. The pattern is stored as the two literal segments found on each
// side of the `*`.
const FilterWildcard1HnAnchored = class {
    // s0: literal segment before the wildcard
    // s1: literal segment after the wildcard
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    // A URL matches when an occurrence of the first segment passes
    // isHnAnchored(), and the second segment is found past that occurrence.
    match(url) {
        const { s0, s1 } = this;
        let pos = url.indexOf(s0);
        while ( pos !== -1 ) {
            if ( isHnAnchored(url, pos) ) {
                // The earliest anchored occurrence leaves the most room for
                // the second segment, so there is no need to look further.
                return url.indexOf(s1, pos + s0.length) !== -1;
            }
            // The first occurrence may not be the anchored one, keep
            // looking. Stop when there is no further occurrence, or when
            // the search no longer progresses (empty segment at end of URL).
            const next = url.indexOf(s0, pos + 1);
            if ( next <= pos ) { break; }
            pos = next;
        }
        return false;
    }

    // Details for the logger: raw filter, regex equivalent, compiled form.
    logData() {
        return {
            raw: `||${this.s0}*${this.s1}`,
            regex: rawToRegexStr(`${this.s0}*${this.s1}`, 0),
            compiled: this.compile()
        };
    }

    // Serialized form of this instance: [ class id, segment 0, segment 1 ].
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    // Returns the serialized form for `details` when the filter qualifies
    // for this class or for FilterWildcard2HnAnchored, `undefined` otherwise.
    static compile(details) {
        // Bail out when the lowest anchor bit is set.
        // NOTE(review): presumably this bit means "right-anchored"; confirm
        // against the parser. The mask was written `0x0b001`, a hexadecimal
        // literal which only worked because its extra bits are never set.
        if ( (details.anchor & 0b001) !== 0 ) { return; }
        const s = details.f;
        let pos = s.indexOf('*');
        if ( pos === -1 ) { return; }
        if ( reIsWildcarded.test(s.slice(pos + 1)) ) { return; }
        // A `^` right before the `*` is allowed, it is then handled by the
        // more specialized FilterWildcard2HnAnchored class.
        const needSeparator =
            pos !== 0 && s.charCodeAt(pos - 1) === 0x5E /* '^' */;
        if ( needSeparator ) { pos -= 1; }
        if ( reIsWildcarded.test(s.slice(0, pos)) ) { return; }
        if ( needSeparator ) {
            return FilterWildcard2HnAnchored.compile(details, pos);
        }
        return [
            FilterWildcard1HnAnchored.fid,
            s.slice(0, pos),
            s.slice(pos + 1),
        ];
    }

    // Rebuilds an instance from its serialized form.
    static load(args) {
        return new FilterWildcard1HnAnchored(args[1], args[2]);
    }
};
registerFilterClass(FilterWildcard1HnAnchored);
/*******************************************************************************
Hostname-anchored filters with one occurrence of the wildcard
sequence `^*` and no other wildcard-equivalent character
*/
// Hostname-anchored (`||`) filter whose pattern contains exactly one
// occurrence of the sequence `^*` and no other wildcard-like character. The
// pattern is stored as the two literal segments found on each side of `^*`.
const FilterWildcard2HnAnchored = class {
    // s0: literal segment before `^*`
    // s1: literal segment after `^*`
    constructor(s0, s1) {
        this.s0 = s0;
        this.s1 = s1;
    }

    // A URL matches when:
    // - an occurrence of the first segment passes isHnAnchored();
    // - the character immediately following that occurrence is a separator
    //   (this is the `^` part of the pattern);
    // - the second segment is found anywhere past that separator (the `*`
    //   part matches whatever lies in between).
    // As before, the end of the URL is not accepted in lieu of a separator.
    match(url) {
        const { s0, s1 } = this;
        let pos0 = url.indexOf(s0);
        while ( pos0 !== -1 ) {
            if ( isHnAnchored(url, pos0) ) {
                const pos1 = pos0 + s0.length;
                // The separator must directly follow the first segment:
                // finding one somewhere before the second segment is not
                // enough, or `||example.com^*x` would match
                // `example.company/x`.
                if (
                    pos1 < url.length &&
                    this.reSeparators.test(url.charAt(pos1)) &&
                    url.indexOf(s1, pos1 + 1) !== -1
                ) {
                    return true;
                }
            }
            // Try the next occurrence. Stop when there is none, or when the
            // search no longer progresses (empty segment at end of URL).
            const next = url.indexOf(s0, pos0 + 1);
            if ( next <= pos0 ) { break; }
            pos0 = next;
        }
        return false;
    }

    // Details for the logger: raw filter, regex equivalent, compiled form.
    logData() {
        return {
            raw: `||${this.s0}^*${this.s1}`,
            regex: rawToRegexStr(`${this.s0}^*${this.s1}`, 0),
            compiled: this.compile()
        };
    }

    // Serialized form of this instance: [ class id, segment 0, segment 1 ].
    compile() {
        return [ this.fid, this.s0, this.s1 ];
    }

    // Returns the serialized form for `details`, where `pos` is the index
    // of the `^` of the `^*` sequence in `details.f`. No validation is
    // performed here, the caller is expected to have vetted the pattern.
    static compile(details, pos) {
        return [
            FilterWildcard2HnAnchored.fid,
            details.f.slice(0, pos),
            details.f.slice(pos + 2),
        ];
    }

    // Rebuilds an instance from its serialized form.
    static load(args) {
        return new FilterWildcard2HnAnchored(args[1], args[2]);
    }
};
FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
registerFilterClass(FilterWildcard2HnAnchored);
/******************************************************************************/
const FilterGenericHnAnchored = class {
@ -610,6 +779,8 @@ const FilterGenericHnAnchored = class {
}
static compile(details) {
const compiled = FilterWildcard1HnAnchored.compile(details);
if ( compiled !== undefined ) { return compiled; }
return [ FilterGenericHnAnchored.fid, details.f ];
}
@ -1377,7 +1548,10 @@ const FilterBucket = class {
return true;
}
}
if ( this.plainHnAnchoredTrie !== null && isHnAnchored(url, tokenBeg) ) {
if (
this.plainHnAnchoredTrie !== null &&
isHnAnchored(url, tokenBeg)
) {
const pos = this.plainHnAnchoredTrie.matches(url, tokenBeg);
if ( pos !== -1 ) {
this.plainHnAnchoredFilter.s = url.slice(tokenBeg, pos);
@ -1524,7 +1698,6 @@ const FilterParser = function() {
this.reHasUnicode = /[^\x00-\x7F]/;
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
this.reBadCSP = /(?:^|;)\s*report-(?:to|uri)\b/;
this.reIsWildcarded = /[\^\*]/;
this.domainOpt = '';
this.noTokenHash = µb.urlTokenizer.noTokenHash;
this.unsupportedTypeBit = this.bitFromType('unsupported');
@ -1917,7 +2090,7 @@ FilterParser.prototype.parse = function(raw) {
this.anchor = 0;
}
this.wildcarded = this.reIsWildcarded.test(s);
this.wildcarded = reIsWildcarded.test(s);
// This might look weird but we gain memory footprint by not going through
// toLowerCase(), at least on Chromium. Because copy-on-write?
@ -2985,6 +3158,36 @@ FilterContainer.prototype.bucketHistogram = function() {
- FilterPlainHnAnchored and FilterPlainPrefix1 are good candidates
for storing in a plain string trie.
As of 2019-04-25:
{"FilterPlainHnAnchored" => 11078}
{"FilterPlainPrefix1" => 7195}
{"FilterPrefix1Trie" => 5720}
{"FilterOriginHit" => 3561}
{"FilterWildcard2HnAnchored" => 2943}
{"FilterPair" => 2391}
{"FilterBucket" => 1922}
{"FilterWildcard1HnAnchored" => 1910}
{"FilterHnAnchoredTrie" => 1586}
{"FilterPlainHostname" => 1391}
{"FilterOriginHitSet" => 1155}
{"FilterPlain" => 634}
{"FilterWildcard1" => 423}
{"FilterGenericHnAnchored" => 389}
{"FilterOriginMiss" => 302}
{"FilterGeneric" => 163}
{"FilterOriginMissSet" => 150}
{"FilterRegex" => 124}
{"FilterPlainRightAnchored" => 110}
{"FilterGenericHnAndRightAnchored" => 95}
{"FilterHostnameDict" => 59}
{"FilterPlainLeftAnchored" => 30}
{"FilterJustOrigin" => 22}
{"FilterHTTPJustOrigin" => 19}
{"FilterHTTPSJustOrigin" => 18}
{"FilterExactMatch" => 5}
{"FilterOriginMixedSet" => 3}
*/
FilterContainer.prototype.filterClassHistogram = function() {