mirror of https://github.com/gorhill/uBlock.git
Ignore unknown tokens in urlTokenizer.getTokens()
Given that all tokens extracted from a single URL are potentially iterated multiple times in a single URL-matching cycle, it pays to ignore extracted tokens which are known not to be used anywhere in the static filtering engine. The gain in processing a single network request in the static filtering engine can become especially high when dealing with long, random-looking URLs, since such URLs are highly likely to consist mostly of tokens which are known not to be in use.
This commit is contained in:
parent
19ece97b0c
commit
69a43e07c4
|
@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
|
||||||
// Read-only
|
// Read-only
|
||||||
systemSettings: {
|
systemSettings: {
|
||||||
compiledMagic: 12, // Increase when compiled format changes
|
compiledMagic: 12, // Increase when compiled format changes
|
||||||
selfieMagic: 11 // Increase when selfie format changes
|
selfieMagic: 12 // Increase when selfie format changes
|
||||||
},
|
},
|
||||||
|
|
||||||
restoreBackupSettings: {
|
restoreBackupSettings: {
|
||||||
|
|
|
@ -797,7 +797,7 @@ const FilterWildcard2HnAnchored = class {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
FilterWildcard2HnAnchored.prototype.reSeparators = /[^0-9a-z.%_-]/;
|
FilterWildcard2HnAnchored.prototype.reSeparators = /[^\w%.-]/;
|
||||||
|
|
||||||
registerFilterClass(FilterWildcard2HnAnchored);
|
registerFilterClass(FilterWildcard2HnAnchored);
|
||||||
|
|
||||||
|
@ -2163,7 +2163,7 @@ const reGoodToken = /[%0-9a-z]{2,}/g;
|
||||||
const reRegexToken = /[%0-9A-Za-z]{2,}/g;
|
const reRegexToken = /[%0-9A-Za-z]{2,}/g;
|
||||||
const reRegexTokenAbort = /[([]/;
|
const reRegexTokenAbort = /[([]/;
|
||||||
const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
|
const reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
|
||||||
const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*]|$)/;
|
const reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
|
||||||
|
|
||||||
const badTokens = new Set([
|
const badTokens = new Set([
|
||||||
'com',
|
'com',
|
||||||
|
@ -2296,6 +2296,7 @@ FilterContainer.prototype.reset = function() {
|
||||||
this.categories = new Map();
|
this.categories = new Map();
|
||||||
this.dataFilters = new Map();
|
this.dataFilters = new Map();
|
||||||
this.filterParser.reset();
|
this.filterParser.reset();
|
||||||
|
this.urlTokenizer.resetKnownTokens();
|
||||||
|
|
||||||
// This will invalidate all tries
|
// This will invalidate all tries
|
||||||
FilterHostnameDict.reset();
|
FilterHostnameDict.reset();
|
||||||
|
@ -2317,6 +2318,7 @@ FilterContainer.prototype.freeze = function() {
|
||||||
const filterDataHolderId = FilterDataHolder.fid;
|
const filterDataHolderId = FilterDataHolder.fid;
|
||||||
const redirectTypeValue = typeNameToTypeValue.redirect;
|
const redirectTypeValue = typeNameToTypeValue.redirect;
|
||||||
const unserialize = µb.CompiledLineIO.unserialize;
|
const unserialize = µb.CompiledLineIO.unserialize;
|
||||||
|
const knownTokens = this.urlTokenizer.knownTokens;
|
||||||
|
|
||||||
for ( const line of this.goodFilters ) {
|
for ( const line of this.goodFilters ) {
|
||||||
if ( this.badFilters.has(line) ) {
|
if ( this.badFilters.has(line) ) {
|
||||||
|
@ -2348,6 +2350,7 @@ FilterContainer.prototype.freeze = function() {
|
||||||
entry.next = bucket;
|
entry.next = bucket;
|
||||||
}
|
}
|
||||||
this.dataFilters.set(tokenHash, entry);
|
this.dataFilters.set(tokenHash, entry);
|
||||||
|
knownTokens[tokenHash & 0xFFFF] = 1;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2394,6 +2397,8 @@ FilterContainer.prototype.freeze = function() {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
knownTokens[tokenHash & 0xFFFF] = 1;
|
||||||
|
|
||||||
if ( entry === undefined ) {
|
if ( entry === undefined ) {
|
||||||
bucket.set(tokenHash, filterFromCompiledData(fdata));
|
bucket.set(tokenHash, filterFromCompiledData(fdata));
|
||||||
continue;
|
continue;
|
||||||
|
@ -2484,6 +2489,7 @@ FilterContainer.prototype.toSelfie = function(path) {
|
||||||
discardedCount: this.discardedCount,
|
discardedCount: this.discardedCount,
|
||||||
categories: categoriesToSelfie(this.categories),
|
categories: categoriesToSelfie(this.categories),
|
||||||
dataFilters: dataFiltersToSelfie(this.dataFilters),
|
dataFilters: dataFiltersToSelfie(this.dataFilters),
|
||||||
|
urlTokenizer: this.urlTokenizer.toSelfie(),
|
||||||
})
|
})
|
||||||
)
|
)
|
||||||
]);
|
]);
|
||||||
|
@ -2525,6 +2531,7 @@ FilterContainer.prototype.fromSelfie = function(path) {
|
||||||
this.allowFilterCount = selfie.allowFilterCount;
|
this.allowFilterCount = selfie.allowFilterCount;
|
||||||
this.blockFilterCount = selfie.blockFilterCount;
|
this.blockFilterCount = selfie.blockFilterCount;
|
||||||
this.discardedCount = selfie.discardedCount;
|
this.discardedCount = selfie.discardedCount;
|
||||||
|
this.urlTokenizer.fromSelfie(selfie.urlTokenizer);
|
||||||
for ( const [ catbits, bucket ] of selfie.categories ) {
|
for ( const [ catbits, bucket ] of selfie.categories ) {
|
||||||
const tokenMap = new Map();
|
const tokenMap = new Map();
|
||||||
for ( const [ token, fdata ] of bucket ) {
|
for ( const [ token, fdata ] of bucket ) {
|
||||||
|
@ -2742,8 +2749,8 @@ FilterContainer.prototype.matchAndFetchData = function(dataType, requestURL, out
|
||||||
toAdd = new Map(),
|
toAdd = new Map(),
|
||||||
toRemove = new Map();
|
toRemove = new Map();
|
||||||
|
|
||||||
let tokenHashes = this.urlTokenizer.getTokens(),
|
const tokenHashes = this.urlTokenizer.getTokens();
|
||||||
i = 0;
|
let i = 0;
|
||||||
while ( i < 32 ) {
|
while ( i < 32 ) {
|
||||||
let tokenHash = tokenHashes[i++];
|
let tokenHash = tokenHashes[i++];
|
||||||
if ( tokenHash === 0 ) { break; }
|
if ( tokenHash === 0 ) { break; }
|
||||||
|
|
|
@ -65,6 +65,9 @@
|
||||||
this._urlOut = '';
|
this._urlOut = '';
|
||||||
this._tokenized = false;
|
this._tokenized = false;
|
||||||
this._tokens = [ 0 ];
|
this._tokens = [ 0 ];
|
||||||
|
|
||||||
|
this.knownTokens = new Uint8Array(65536);
|
||||||
|
this.resetKnownTokens();
|
||||||
}
|
}
|
||||||
|
|
||||||
setURL(url) {
|
setURL(url) {
|
||||||
|
@ -76,6 +79,15 @@
|
||||||
return this._urlOut;
|
return this._urlOut;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
resetKnownTokens() {
|
||||||
|
this.knownTokens.fill(0);
|
||||||
|
this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
|
||||||
|
this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
|
||||||
|
this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
|
||||||
|
this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
|
||||||
|
this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
|
||||||
|
}
|
||||||
|
|
||||||
// Tokenize on demand.
|
// Tokenize on demand.
|
||||||
getTokens() {
|
getTokens() {
|
||||||
if ( this._tokenized ) { return this._tokens; }
|
if ( this._tokenized ) { return this._tokens; }
|
||||||
|
@ -92,12 +104,6 @@
|
||||||
return this._tokens;
|
return this._tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
_appendTokenAt(i, th, ti) {
|
|
||||||
this._tokens[i+0] = th;
|
|
||||||
this._tokens[i+1] = ti;
|
|
||||||
return i + 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
tokenHashFromString(s) {
|
tokenHashFromString(s) {
|
||||||
const l = s.length;
|
const l = s.length;
|
||||||
if ( l === 0 ) { return 0; }
|
if ( l === 0 ) { return 0; }
|
||||||
|
@ -119,9 +125,26 @@
|
||||||
return s;
|
return s;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
toSelfie() {
|
||||||
|
return µBlock.base64.encode(
|
||||||
|
this.knownTokens.buffer,
|
||||||
|
this.knownTokens.byteLength
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
fromSelfie(selfie) {
|
||||||
|
return µBlock.base64.decode(selfie, this.knownTokens.buffer);
|
||||||
|
}
|
||||||
|
|
||||||
// https://github.com/chrisaljoudi/uBlock/issues/1118
|
// https://github.com/chrisaljoudi/uBlock/issues/1118
|
||||||
// We limit to a maximum number of tokens.
|
// We limit to a maximum number of tokens.
|
||||||
|
|
||||||
|
_appendTokenAt(i, th, ti) {
|
||||||
|
this._tokens[i+0] = th;
|
||||||
|
this._tokens[i+1] = ti;
|
||||||
|
return i + 2;
|
||||||
|
}
|
||||||
|
|
||||||
_tokenize() {
|
_tokenize() {
|
||||||
const tokens = this._tokens;
|
const tokens = this._tokens;
|
||||||
let url = this._urlOut;
|
let url = this._urlOut;
|
||||||
|
@ -131,6 +154,7 @@
|
||||||
url = url.slice(0, 2048);
|
url = url.slice(0, 2048);
|
||||||
l = 2048;
|
l = 2048;
|
||||||
}
|
}
|
||||||
|
const knownTokens = this.knownTokens;
|
||||||
const vtc = this._validTokenChars;
|
const vtc = this._validTokenChars;
|
||||||
let i = 0, j = 0, v, n, ti, th;
|
let i = 0, j = 0, v, n, ti, th;
|
||||||
for (;;) {
|
for (;;) {
|
||||||
|
@ -148,9 +172,11 @@
|
||||||
th = th * 64 + v;
|
th = th * 64 + v;
|
||||||
n += 1;
|
n += 1;
|
||||||
}
|
}
|
||||||
tokens[j+0] = th;
|
if ( knownTokens[th & 0xFFFF] !== 0 ) {
|
||||||
tokens[j+1] = ti;
|
tokens[j+0] = th;
|
||||||
j += 2;
|
tokens[j+1] = ti;
|
||||||
|
j += 2;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
})();
|
})();
|
||||||
|
|
Loading…
Reference in New Issue