mirror of https://github.com/gorhill/uBlock.git
Ignore unknown tokens in urlTokenizer.getTokens()
Given that all tokens extracted from a single URL are potentially iterated multiple times in a single URL-matching cycle, it pays to ignore extracted tokens which are known to not be used anywhere in the static filtering engine. The gain in processing a single network request in the static filtering engine can become especially high when dealing with long and random-looking URLs, as such URLs are likely to consist mostly of tokens which are known to not be in use.
This commit is contained in:
parent
19ece97b0c
commit
69a43e07c4
|
@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
|
|||
// Read-only
|
||||
systemSettings: {
|
||||
compiledMagic: 12, // Increase when compiled format changes
|
||||
selfieMagic: 11 // Increase when selfie format changes
|
||||
selfieMagic: 12 // Increase when selfie format changes
|
||||
},
|
||||
|
||||
restoreBackupSettings: {
|
||||
|
|
|
@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
|
|||
}
|
||||
};
|
||||
|
||||
FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
|
||||
FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
|
||||
|
||||
registerFilterClass(FilterWildcard2HnAnchored);
|
||||
|
||||
|
@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
|
|||
const reRegexToken = /[%0-9A-Za-z]{2,}/g;
|
||||
const reRegexTokenAbort = /[([]/;
|
||||
const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
|
||||
const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
|
||||
const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
|
||||
|
||||
const badTokens = new Set([
|
||||
'com',
|
||||
|
@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
|
|||
this.categories = new Map();
|
||||
this.dataFilters = new Map();
|
||||
this.filterParser.reset();
|
||||
this.urlTokenizer.resetKnownTokens();
|
||||
|
||||
// This will invalidate all tries
|
||||
FilterHostnameDict.reset();
|
||||
|
@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
|
|||
const filterDataHolderId = FilterDataHolder.fid;
|
||||
const redirectTypeValue = typeNameToTypeValue.redirect;
|
||||
const unserialize = µb.CompiledLineIO.unserialize;
|
||||
const knownTokens = this.urlTokenizer.knownTokens;
|
||||
|
||||
for ( const line of this.goodFilters ) {
|
||||
if ( this.badFilters.has(line) ) {
|
||||
|
@ -2348,6 +2350,7 @@ FilterContainer.prototype.freeze = function() {
|
|||
entry.next = bucket;
|
||||
}
|
||||
this.dataFilters.set(tokenHash, entry);
|
||||
knownTokens[tokenHash & 0xFFFF] = 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2394,6 +2397,8 @@ FilterContainer.prototype.freeze = function() {
|
|||
continue;
|
||||
}
|
||||
|
||||
knownTokens[tokenHash & 0xFFFF] = 1;
|
||||
|
||||
if ( entry === undefined ) {
|
||||
bucket.set(tokenHash, filterFromCompiledData(fdata));
|
||||
continue;
|
||||
|
@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
|
|||
discardedCount: this.discardedCount,
|
||||
categories: categoriesToSelfie(this.categories),
|
||||
dataFilters: dataFiltersToSelfie(this.dataFilters),
|
||||
urlTokenizer: this.urlTokenizer.toSelfie(),
|
||||
})
|
||||
)
|
||||
]);
|
||||
|
@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
|
|||
this.allowFilterCount = selfie.allowFilterCount;
|
||||
this.blockFilterCount = selfie.blockFilterCount;
|
||||
this.discardedCount = selfie.discardedCount;
|
||||
this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
|
||||
for ( const [ catbits, bucket ] of selfie.categories ) {
|
||||
const tokenMap = new Map();
|
||||
for ( const [ token, fdata ] of bucket ) {
|
||||
|
@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
|
|||
toAdd = new Map(),
|
||||
toRemove = new Map();
|
||||
|
||||
let tokenHashes = this.urlTokenizer.getTokens(),
|
||||
i = 0;
|
||||
const tokenHashes = this.urlTokenizer.getTokens();
|
||||
let i = 0;
|
||||
while ( i < 32 ) {
|
||||
let tokenHash = tokenHashes[i++];
|
||||
if ( tokenHash === 0 ) { break; }
|
||||
|
|
|
@ -65,6 +65,9 @@
|
|||
this._urlOut = '';
|
||||
this._tokenized = false;
|
||||
this._tokens = [ 0 ];
|
||||
|
||||
this.knownTokens = new Uint8Array(65536);
|
||||
this.resetKnownTokens();
|
||||
}
|
||||
|
||||
setURL(url) {
|
||||
|
@ -76,6 +79,15 @@
|
|||
return this._urlOut;
|
||||
}
|
||||
|
||||
resetKnownTokens() {
|
||||
this.knownTokens.fill(0);
|
||||
this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
|
||||
this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
|
||||
this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
|
||||
this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
|
||||
this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
|
||||
}
|
||||
|
||||
// Tokenize on demand.
|
||||
getTokens() {
|
||||
if ( this._tokenized ) { return this._tokens; }
|
||||
|
@ -92,12 +104,6 @@
|
|||
return this._tokens;
|
||||
}
|
||||
|
||||
_appendTokenAt(i, th, ti) {
|
||||
this._tokens[i+0] = th;
|
||||
this._tokens[i+1] = ti;
|
||||
return i + 2;
|
||||
}
|
||||
|
||||
tokenHashFromString(s) {
|
||||
const l = s.length;
|
||||
if ( l === 0 ) { return 0; }
|
||||
|
@ -119,9 +125,26 @@
|
|||
return s;
|
||||
}
|
||||
|
||||
toSelfie() {
|
||||
return µBlock.base64.encode(
|
||||
this.knownTokens.buffer,
|
||||
this.knownTokens.byteLength
|
||||
);
|
||||
}
|
||||
|
||||
fromSelfie(selfie) {
|
||||
return µBlock.base64.decode(selfie, this.knownTokens.buffer);
|
||||
}
|
||||
|
||||
// https://github.com/chrisaljoudi/uBlock/issues/1118
|
||||
// We limit to a maximum number of tokens.
|
||||
|
||||
_appendTokenAt(i, th, ti) {
|
||||
this._tokens[i+0] = th;
|
||||
this._tokens[i+1] = ti;
|
||||
return i + 2;
|
||||
}
|
||||
|
||||
_tokenize() {
|
||||
const tokens = this._tokens;
|
||||
let url = this._urlOut;
|
||||
|
@ -131,6 +154,7 @@
|
|||
url = url.slice(0, 2048);
|
||||
l = 2048;
|
||||
}
|
||||
const knownTokens = this.knownTokens;
|
||||
const vtc = this._validTokenChars;
|
||||
let i = 0, j = 0, v, n, ti, th;
|
||||
for (;;) {
|
||||
|
@ -148,9 +172,11 @@
|
|||
th = th * 64 + v;
|
||||
n += 1;
|
||||
}
|
||||
tokens[j+0] = th;
|
||||
tokens[j+1] = ti;
|
||||
j += 2;
|
||||
if ( knownTokens[th & 0xFFFF] !== 0 ) {
|
||||
tokens[j+0] = th;
|
||||
tokens[j+1] = ti;
|
||||
j += 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
})();
|
||||
|
|
Loading…
Reference in New Issue