Ignore unknown tokens in urlTokenizer.getTokens()

Given that all tokens extracted from a single URL are potentially
iterated multiple times in a single URL-matching cycle, it pays to
ignore extracted tokens which are known not to be used anywhere in
the static filtering engine.
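
In concrete terms (a minimal standalone sketch of the mechanism, with
made-up helper names — the real implementation is in the diff below):
the tokenizer keeps a 64 KiB Uint8Array in which the engine marks the
low 16 bits of every token hash it actually indexes, and tokenization
drops any token whose slot is unmarked.

    // Minimal sketch, not the actual uBlock Origin code: a 65536-entry
    // byte array acting as a one-sided membership filter over token hashes.
    const knownTokens = new Uint8Array(65536);

    // Called by the filtering engine for every token hash it indexes.
    const markKnown = th => { knownTokens[th & 0xFFFF] = 1; };

    // Called by the tokenizer. `false` means "certainly used by no filter,
    // skip it". Collisions on the low 16 bits can produce false positives,
    // which are harmless: the subsequent bucket lookup simply misses.
    const maybeKnown = th => knownTokens[th & 0xFFFF] !== 0;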

The gain when processing a single network request in the static
filtering engine can become especially high when dealing with long,
random-looking URLs: such URLs are highly likely to consist mostly
of tokens which are known not to be in use.
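
A toy illustration of that point, reusing the sketch above with a
stand-in for tokenHashFromString() (hypothetical tokens and hash, not
from the commit):

    // Stand-in hash: fold characters into a number, loosely mimicking how
    // tokenHashFromString() folds base-64 digit values.
    const hash = s => {
        let th = 0;
        for ( const c of s ) { th = (th * 64 + c.charCodeAt(0) % 64) >>> 0; }
        return th;
    };

    // Only tokens that occur in actual filters are marked...
    for ( const s of [ 'ads', 'track', 'banner' ] ) { markKnown(hash(s)); }

    // ...so the random segments of a long URL never reach the matching loop.
    for ( const s of [ 'track', 'x9f3kq', 'zt8wv2j', 'ads' ] ) {
        // Expected: true, false, false, true -- the random segments are
        // almost certainly unmarked (barring a low-16-bit collision).
        console.log(s, maybeKnown(hash(s)));
    }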
Raymond Hill 2019-04-26 17:14:00 -04:00
parent 19ece97b0c
commit 69a43e07c4
3 changed files with 47 additions and 14 deletions

src/js/background.js

@@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
     // Read-only
     systemSettings: {
         compiledMagic: 12, // Increase when compiled format changes
-        selfieMagic: 11 // Increase when selfie format changes
+        selfieMagic: 12 // Increase when selfie format changes
     },
 
     restoreBackupSettings: {
src/js/static-net-filtering.js

@@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
     }
 };
 
-FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
+FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
 
 registerFilterClass(FilterWildcard2HnAnchored);
 
@@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
 const reRegexToken = /[%0-9A-Za-z]{2,}/g;
 const reRegexTokenAbort = /[([]/;
 const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
-const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
+const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
 
 const badTokens = new Set([
     'com',
@@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
     this.categories = new Map();
     this.dataFilters = new Map();
     this.filterParser.reset();
+    this.urlTokenizer.resetKnownTokens();
 
     // This will invalidate all tries
     FilterHostnameDict.reset();
@@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
     const filterDataHolderId = FilterDataHolder.fid;
     const redirectTypeValue = typeNameToTypeValue.redirect;
     const unserialize = µb.CompiledLineIO.unserialize;
+    const knownTokens = this.urlTokenizer.knownTokens;
 
     for ( const line of this.goodFilters ) {
         if ( this.badFilters.has(line) ) {
@@ -2348,6 +2350,7 @@
                 entry.next = bucket;
             }
             this.dataFilters.set(tokenHash, entry);
+            knownTokens[tokenHash & 0xFFFF] = 1;
             continue;
         }
 
@@ -2394,6 +2397,8 @@
             continue;
         }
 
+        knownTokens[tokenHash & 0xFFFF] = 1;
+
         if ( entry === undefined ) {
             bucket.set(tokenHash, filterFromCompiledData(fdata));
             continue;
@@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
                 discardedCount: this.discardedCount,
                 categories: categoriesToSelfie(this.categories),
                 dataFilters: dataFiltersToSelfie(this.dataFilters),
+                urlTokenizer: this.urlTokenizer.toSelfie(),
             })
         )
     ]);
@@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
         this.allowFilterCount = selfie.allowFilterCount;
         this.blockFilterCount = selfie.blockFilterCount;
         this.discardedCount = selfie.discardedCount;
+        this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
         for ( const [ catbits, bucket ] of selfie.categories ) {
             const tokenMap = new Map();
             for ( const [ token, fdata ] of bucket ) {
@@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
         toAdd = new Map(),
         toRemove = new Map();
 
-    let tokenHashes = this.urlTokenizer.getTokens(),
-        i = 0;
+    const tokenHashes = this.urlTokenizer.getTokens();
+    let i = 0;
     while ( i < 32 ) {
         let tokenHash = tokenHashes[i++];
         if ( tokenHash === 0 ) { break; }

src/js/utils.js

@@ -65,6 +65,9 @@
         this._urlOut = '';
         this._tokenized = false;
         this._tokens = [ 0 ];
+
+        this.knownTokens = new Uint8Array(65536);
+        this.resetKnownTokens();
     }
 
     setURL(url) {
@@ -76,6 +79,15 @@
         return this._urlOut;
     }
 
+    resetKnownTokens() {
+        this.knownTokens.fill(0);
+        this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
+        this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
+    }
+
     // Tokenize on demand.
     getTokens() {
         if ( this._tokenized ) { return this._tokens; }
@@ -92,12 +104,6 @@
         return this._tokens;
     }
 
-    _appendTokenAt(i, th, ti) {
-        this._tokens[i+0] = th;
-        this._tokens[i+1] = ti;
-        return i + 2;
-    }
-
     tokenHashFromString(s) {
         const l = s.length;
         if ( l === 0 ) { return 0; }
@@ -119,9 +125,26 @@
         return s;
     }
 
+    toSelfie() {
+        return µBlock.base64.encode(
+            this.knownTokens.buffer,
+            this.knownTokens.byteLength
+        );
+    }
+
+    fromSelfie(selfie) {
+        return µBlock.base64.decode(selfie, this.knownTokens.buffer);
+    }
 
     // https://github.com/chrisaljoudi/uBlock/issues/1118
     // We limit to a maximum number of tokens.
+
+    _appendTokenAt(i, th, ti) {
+        this._tokens[i+0] = th;
+        this._tokens[i+1] = ti;
+        return i + 2;
+    }
+
     _tokenize() {
         const tokens = this._tokens;
         let url = this._urlOut;
@@ -131,6 +154,7 @@
             url = url.slice(0, 2048);
            l = 2048;
         }
+        const knownTokens = this.knownTokens;
         const vtc = this._validTokenChars;
         let i = 0, j = 0, v, n, ti, th;
         for (;;) {
@@ -148,9 +172,11 @@
                 th = th * 64 + v;
                 n += 1;
             }
-            tokens[j+0] = th;
-            tokens[j+1] = ti;
-            j += 2;
+            if ( knownTokens[th & 0xFFFF] !== 0 ) {
+                tokens[j+0] = th;
+                tokens[j+1] = ti;
+                j += 2;
+            }
         }
     }
 })();