Add support for entity-matching in `domain=` filter option

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/1008

This commit adds support for entity-matching in the filter
option `domain=`. Example:

    pattern$domain=google.*

The `*` above is meant to match any suffix from the Public
Suffix List. The semantics are exactly the same as the
already existing entity-matching support in static
extended filtering:

- https://github.com/gorhill/uBlock/wiki/Static-filter-syntax#entity

Additionally, in this commit:

Fix cases where "just-origin" filters of the form `|http*://`
were erroneously normalized to `|http://`. The proper
normalization of `|http*://` is `*`.

Add support for storing hostname strings in the character
buffer of an hntrie container. As of commit time, there are
5,544 instances of FilterOriginHit, and 732 instances of
FilterOriginMiss, each of which requires storing/matching a
single hostname string. Those strings are now stored in the
character buffer of the already existing origin-related
hntrie container. (The same approach is used for plain
patterns which are not part of a bidi-trie.)
This commit is contained in:
Raymond Hill 2020-05-24 10:46:16 -04:00
parent 56a3aff857
commit 3c67d2b89f
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
3 changed files with 377 additions and 208 deletions

View File

@ -138,8 +138,8 @@ const µBlock = (( ) => { // jshint ignore:line
// Read-only
systemSettings: {
compiledMagic: 27, // Increase when compiled format changes
selfieMagic: 26, // Increase when selfie format changes
compiledMagic: 28, // Increase when compiled format changes
selfieMagic: 28, // Increase when selfie format changes
},
// https://github.com/uBlockOrigin/uBlock-issues/issues/759#issuecomment-546654501

View File

@ -407,6 +407,49 @@ const HNTrieContainer = class {
return true;
}
// The following *Hostname() methods can be used to store hostname strings
// outside the trie. This is useful to store/match hostnames which are
// not part of a collection, and yet still benefit from storing the strings
// into a trie container's character buffer.
// TODO: WASM version of matchesHostname()
storeHostname(hn) {
let n = hn.length;
if ( n > 255 ) {
hn = hn.slice(-255);
n = 255;
}
if ( (this.buf.length - this.buf32[CHAR1_SLOT]) < n ) {
this.growBuf(0, n);
}
const offset = this.buf32[CHAR1_SLOT];
this.buf32[CHAR1_SLOT] = offset + n;
const buf8 = this.buf;
for ( let i = 0; i < n; i++ ) {
buf8[offset+i] = hn.charCodeAt(i);
}
return offset - this.buf32[CHAR0_SLOT];
}
extractHostname(i, n) {
const textDecoder = new TextDecoder();
const offset = this.buf32[CHAR0_SLOT] + i;
return textDecoder.decode(this.buf.subarray(offset, offset + n));
}
matchesHostname(hn, i, n) {
this.setNeedle(hn);
const buf8 = this.buf;
const hr = buf8[255];
if ( n > hr ) { return false; }
const hl = hr - n;
const nl = this.buf32[CHAR0_SLOT] + i;
for ( let j = 0; j < n; j++ ) {
if ( buf8[nl+j] !== buf8[hl+j] ) { return false; }
}
return n === hr || hn.charCodeAt(hl-1) === 0x2E /* '.' */;
}
async enableWASM() {
if ( typeof WebAssembly !== 'object' ) { return false; }
if ( this.wasmMemory instanceof WebAssembly.Memory ) { return true; }

View File

@ -173,12 +173,26 @@ const typeValueFromCatBits = catBits => (catBits >>> 4) & 0b11111;
let $requestURL = '';
let $requestHostname = '';
let $docHostname = '';
let $docDomain = '';
let $tokenBeg = 0;
let $patternMatchLeft = 0;
let $patternMatchRight = 0;
// EXPERIMENT: $requestTypeBit
let $requestTypeBit = 0;
// Lazily-computed, cached "entity" of the current document hostname.
//
// The entity is `$docHostname` with the tail of `$docDomain` removed,
// starting at the first '.' found in `$docDomain` (for example
// `www.google.com` with domain `google.com` gives `www.google`). It is
// the empty string when `$docDomain` contains no '.'.
//
// reset() must be called whenever `$docHostname`/`$docDomain` change,
// otherwise compute() keeps returning the previously cached value.
const $docEntity = {
    entity: undefined,
    compute() {
        if ( this.entity !== undefined ) { return this.entity; }
        const dot = $docDomain.indexOf('.');
        if ( dot === -1 ) {
            this.entity = '';
        } else {
            // Negative end index: drop as many trailing characters as the
            // domain has from its first '.' onward.
            this.entity = $docHostname.slice(0, dot - $docDomain.length);
        }
        return this.entity;
    },
    reset() {
        this.entity = undefined;
    },
};
/******************************************************************************/
@ -1072,39 +1086,6 @@ registerFilterClass(FilterTrailingSeparator);
/******************************************************************************/
// Filter unit which matches when at least one of its type bits is also
// set in the current `$requestTypeBit` register.
const FilterType = class {
    constructor(bits) {
        this.typeBits = bits;
    }

    // True when the unit's bits and `$requestTypeBit` share a set bit.
    match() {
        const shared = this.typeBits & $requestTypeBit;
        return shared !== 0;
    }

    // Nothing to report for logging purposes.
    logData() {
    }

    toSelfie() {
        const { fid, typeBits } = this;
        return [ fid, typeBits ];
    }

    // Only the bits covered by `allNetworkTypesBits` are kept.
    static compile(details) {
        const networkBits = details.typeBits & allNetworkTypesBits;
        return [ FilterType.fid, networkBits ];
    }

    static fromCompiled(args) {
        const typeBits = args[1];
        return new FilterType(typeBits);
    }

    static fromSelfie(args) {
        const typeBits = args[1];
        return new FilterType(typeBits);
    }
};
registerFilterClass(FilterType);
/******************************************************************************/
const FilterRegex = class {
constructor(s) {
this.s = s;
@ -1162,60 +1143,79 @@ registerFilterClass(FilterRegex);
// The optimal "class" is picked according to the content of the
// `domain=` filter option.
const filterOrigin = new (class {
const filterOrigin = (( ) => {
const FilterOrigin = class {
constructor() {
this.trieContainer = new µb.HNTrieContainer();
}
compile(details, prepend, units) {
const domainOpt = details.domainOpt;
let compiledMiss, compiledHit;
// One hostname
if ( domainOpt.indexOf('|') === -1 ) {
// Must be a miss
if ( domainOpt.charCodeAt(0) === 0x7E /* '~' */ ) {
compiledMiss = FilterOriginMiss.compile(domainOpt);
compile(domainOpt, prepend, units) {
const hostnameHits = [];
const hostnameMisses = [];
const entityHits = [];
const entityMisses = [];
for ( const s of FilterParser.domainOptIterator(domainOpt) ) {
const len = s.length;
const beg = len > 1 && s.charCodeAt(0) === 0x7E ? 1 : 0;
const end = len > 2 &&
s.charCodeAt(len - 1) === 0x2A /* '*' */ &&
s.charCodeAt(len - 2) === 0x2E /* '.' */
? len - 2 : len;
if ( end <= beg ) { continue; }
if ( end === len ) {
if ( beg === 0 ) {
hostnameHits.push(s);
} else {
hostnameMisses.push(s.slice(1));
}
// Must be a hit
else {
compiledHit = FilterOriginHit.compile(domainOpt);
} else {
if ( beg === 0 ) {
entityHits.push(s.slice(0, -2));
} else {
entityMisses.push(s.slice(1, -2));
}
}
// Many hostnames.
// Must be in set (none negated).
else if ( domainOpt.indexOf('~') === -1 ) {
compiledHit = FilterOriginHitSet.compile(domainOpt);
}
// Must not be in set (all negated).
else if ( /^~(?:[^|~]+\|~)+[^|~]+$/.test(domainOpt) ) {
compiledMiss = FilterOriginMissSet.compile(domainOpt);
const compiledHit = [];
if ( entityHits.length !== 0 ) {
for ( const entity of entityHits ) {
compiledHit.push(FilterOriginEntityHit.compile(entity));
}
// Must be in one set, but not in the other.
else {
const hostnames = domainOpt.split('|');
const missSet = hostnames.filter(hn => {
if ( hn.charCodeAt(0) === 0x7E /* '~' */ ) {
return hn;
}
});
const hitSet = hostnames.filter(hn => {
if ( hn.charCodeAt(0) !== 0x7E /* '~' */ ) {
return hn;
if ( hostnameHits.length === 1 ) {
compiledHit.push(FilterOriginHit.compile(hostnameHits[0]));
} else if ( hostnameHits.length > 1 ) {
compiledHit.push(FilterOriginHitSet.compile(hostnameHits.join('|')));
}
});
compiledMiss = missSet.length === 1
? FilterOriginMiss.compile(missSet[0])
: FilterOriginMissSet.compile(missSet.join('|'));
compiledHit = hitSet.length === 1
? FilterOriginHit.compile(hitSet[0])
: FilterOriginHitSet.compile(hitSet.join('|'));
if ( compiledHit.length > 1 ) {
compiledHit[0] = [ FilterCompositeAny.compile(compiledHit.slice()) ];
compiledHit.length = 1;
}
const compiledMiss = [];
if ( entityMisses.length !== 0 ) {
for ( const entity of entityMisses ) {
compiledMiss.push(FilterOriginEntityMiss.compile(entity));
}
}
if ( hostnameMisses.length === 1 ) {
compiledMiss.push(FilterOriginMiss.compile(hostnameMisses[0]));
} else if ( hostnameMisses.length > 1 ) {
compiledMiss.push(FilterOriginMissSet.compile(hostnameMisses.join('|')));
}
if ( prepend ) {
if ( compiledHit ) { units.unshift(compiledHit); }
if ( compiledMiss ) { units.unshift(compiledMiss); }
if ( compiledHit.length !== 0 ) {
units.unshift(compiledHit[0]);
}
if ( compiledMiss.length !== 0 ) {
units.unshift(...compiledMiss);
}
} else {
if ( compiledMiss ) { units.push(compiledMiss); }
if ( compiledHit ) { units.push(compiledHit); }
if ( compiledMiss.length !== 0 ) {
units.push(...compiledMiss);
}
if ( compiledHit.length !== 0 ) {
units.push(compiledHit[0]);
}
}
}
@ -1241,45 +1241,51 @@ const filterOrigin = new (class {
fromSelfie() {
}
};
return new FilterOrigin();
})();
/******************************************************************************/
const FilterOriginHit = class {
constructor(hostname) {
this.hostname = hostname;
constructor(i, n) {
this.i = i;
this.n = n;
}
match() {
const haystack = $docHostname;
const needle = this.hostname;
const offset = haystack.length - needle.length;
if ( offset < 0 ) { return false; }
if ( haystack.charCodeAt(offset) !== needle.charCodeAt(0) ) {
return false;
}
if ( haystack.endsWith(needle) === false ) { return false; }
return offset === 0 || haystack.charCodeAt(offset-1) === 0x2E /* '.' */;
return filterOrigin.trieContainer.matchesHostname(
$docHostname,
this.i,
this.n
);
}
toSelfie() {
return [ this.fid, this.hostname ];
return [ this.fid, this.i, this.n ];
}
logData(details) {
details.domains.push(this.hostname);
details.domains.push(this.getHostname());
}
static compile(domainOpt) {
return [ FilterOriginHit.fid, domainOpt ];
getHostname() {
return filterOrigin.trieContainer.extractHostname(this.i, this.n);
}
static compile(hostname) {
return [ FilterOriginHit.fid, hostname ];
}
static fromCompiled(args) {
return new FilterOriginHit(args[1]);
return new FilterOriginHit(
filterOrigin.trieContainer.storeHostname(args[1]),
args[1].length
);
}
static fromSelfie(args) {
return new FilterOriginHit(args[1]);
return new FilterOriginHit(args[1], args[2]);
}
};
@ -1287,43 +1293,28 @@ registerFilterClass(FilterOriginHit);
/******************************************************************************/
const FilterOriginMiss = class {
constructor(hostname) {
this.hostname = hostname.slice(1);
}
const FilterOriginMiss = class extends FilterOriginHit {
match() {
const haystack = $docHostname;
if ( haystack.endsWith(this.hostname) ) {
const offset = haystack.length - this.hostname.length;
if (
offset === 0 ||
haystack.charCodeAt(offset-1) === 0x2E /* '.' */
) {
return false;
}
}
return true;
return super.match() === false;
}
logData(details) {
details.domains.push(`~${this.hostname}`);
details.domains.push(`~${this.getHostname()}`);
}
toSelfie() {
return [ this.fid, `~${this.hostname}` ];
}
static compile(domainOpt) {
return [ FilterOriginMiss.fid, domainOpt ];
static compile(hostname) {
return [ FilterOriginMiss.fid, hostname ];
}
static fromCompiled(args) {
return new FilterOriginMiss(args[1]);
return new FilterOriginMiss(
filterOrigin.trieContainer.storeHostname(args[1]),
args[1].length
);
}
static fromSelfie(args) {
return new FilterOriginMiss(args[1]);
return new FilterOriginMiss(args[1], args[2]);
}
};
@ -1342,7 +1333,7 @@ const FilterOriginHitSet = class {
match() {
if ( this.oneOf === null ) {
this.oneOf = filterOrigin.trieContainer.fromIterable(
this.domainOpt.split('|')
FilterParser.domainOptIterator(this.domainOpt)
);
}
return this.oneOf.matches($docHostname) !== -1;
@ -1383,35 +1374,15 @@ registerFilterClass(FilterOriginHitSet);
/******************************************************************************/
const FilterOriginMissSet = class {
constructor(domainOpt, noneOf = null) {
this.domainOpt = domainOpt;
this.noneOf = noneOf !== null
? filterOrigin.trieContainer.createOne(noneOf)
: null;
}
const FilterOriginMissSet = class extends FilterOriginHitSet {
match() {
if ( this.noneOf === null ) {
this.noneOf = filterOrigin.trieContainer.fromIterable(
this.domainOpt.replace(/~/g, '').split('|')
);
}
return this.noneOf.matches($docHostname) === -1;
return super.match() === false;
}
logData(details) {
details.domains.push(this.domainOpt);
}
toSelfie() {
return [
this.fid,
this.domainOpt,
this.noneOf !== null
? filterOrigin.trieContainer.compileOne(this.noneOf)
: null
];
// Report the set with every hostname negated. A global regex is required
// here: String.prototype.replace() with a string pattern only replaces
// the first `|`, which would leave all but the first two hostnames of
// the set without their `~` prefix.
details.domains.push(
    '~' + this.domainOpt.replace(/\|/g, '|~')
);
}
static compile(domainOpt) {
@ -1435,6 +1406,74 @@ registerFilterClass(FilterOriginMissSet);
/******************************************************************************/
// Filter unit for a `domain=` entry of the form `entity.*`: matches when
// the entity computed for the current document (`$docEntity.compute()`)
// is exactly `this.entity`, or ends with it right after a '.' separator.
const FilterOriginEntityHit = class {
    constructor(entity) {
        this.entity = entity;
    }

    match() {
        const docEntity = $docEntity.compute();
        if ( docEntity === '' ) { return false; }
        const wanted = this.entity;
        const boundary = docEntity.length - wanted.length;
        if ( boundary < 0 ) { return false; }
        // Cheap first-character test before the full suffix comparison.
        const sameSuffix =
            docEntity.charCodeAt(boundary) === wanted.charCodeAt(0) &&
            docEntity.endsWith(wanted);
        if ( sameSuffix === false ) { return false; }
        // The suffix must start at a label boundary.
        if ( boundary === 0 ) { return true; }
        return docEntity.charCodeAt(boundary - 1) === 0x2E /* '.' */;
    }

    toSelfie() {
        const { fid, entity } = this;
        return [ fid, entity ];
    }

    // Report the entity using the syntax found in the `domain=` option.
    logData(details) {
        const { entity } = this;
        details.domains.push(`${entity}.*`);
    }

    static compile(entity) {
        const { fid } = FilterOriginEntityHit;
        return [ fid, entity ];
    }

    static fromCompiled(args) {
        const entity = args[1];
        return new FilterOriginEntityHit(entity);
    }

    static fromSelfie(args) {
        const entity = args[1];
        return new FilterOriginEntityHit(entity);
    }
};
registerFilterClass(FilterOriginEntityHit);
/******************************************************************************/
// Negated counterpart of FilterOriginEntityHit, for `domain=` entries of
// the form `~entity.*`: matches exactly when the parent class does not.
const FilterOriginEntityMiss = class extends FilterOriginEntityHit {
    match() {
        return !super.match();
    }

    // Report the entity using the negated `domain=` option syntax.
    logData(details) {
        const { entity } = this;
        details.domains.push(`~${entity}.*`);
    }

    static compile(entity) {
        const { fid } = FilterOriginEntityMiss;
        return [ fid, entity ];
    }

    static fromCompiled(args) {
        const entity = args[1];
        return new FilterOriginEntityMiss(entity);
    }

    static fromSelfie(args) {
        const entity = args[1];
        return new FilterOriginEntityMiss(entity);
    }
};
registerFilterClass(FilterOriginEntityMiss);
/******************************************************************************/
const FilterDataHolder = class {
constructor(dataType, data) {
this.dataType = dataType;
@ -1549,6 +1588,12 @@ const FilterCollection = class {
} while ( i !== 0 );
}
logData(details) {
this.forEach(iunit => {
filterUnits[iunit].logData(details);
});
}
toSelfie() {
return [ this.fid, this.i ];
}
@ -1580,7 +1625,36 @@ const FilterCollection = class {
/******************************************************************************/
const FilterComposite = class extends FilterCollection {
// Collection of filter units which matches as soon as any one of its
// units matches.
const FilterCompositeAny = class extends FilterCollection {
    // Walk the sequence starting at `this.i`: slot `i+0` holds the index of
    // a unit in `filterUnits`, slot `i+1` the position of the next entry,
    // with 0 marking the end of the sequence.
    match() {
        const seq = filterSequences;
        const allUnits = filterUnits;
        for ( let at = this.i; at !== 0; at = seq[at+1] ) {
            const unit = allUnits[seq[at+0]];
            if ( unit.match() ) { return true; }
        }
        return false;
    }

    // The static helpers below delegate to FilterCollection, passing this
    // class as the first argument.
    static compile(fdata) {
        return FilterCollection.compile(FilterCompositeAny, fdata);
    }

    static fromCompiled(args) {
        return FilterCollection.fromCompiled(FilterCompositeAny, args);
    }

    static fromSelfie(args) {
        return FilterCollection.fromSelfie(FilterCompositeAny, args);
    }
};
registerFilterClass(FilterCompositeAny);
/******************************************************************************/
const FilterCompositeAll = class extends FilterCollection {
match() {
const sequences = filterSequences;
const units = filterUnits;
@ -1622,26 +1696,20 @@ const FilterComposite = class extends FilterCollection {
return details;
}
logData(details) {
this.forEach(iunit => {
filterUnits[iunit].logData(details);
});
}
static compile(fdata) {
return FilterCollection.compile(FilterComposite, fdata);
return FilterCollection.compile(FilterCompositeAll, fdata);
}
static fromCompiled(args) {
return FilterCollection.fromCompiled(FilterComposite, args);
return FilterCollection.fromCompiled(FilterCompositeAll, args);
}
static fromSelfie(args) {
return FilterCollection.fromSelfie(FilterComposite, args);
return FilterCollection.fromSelfie(FilterCompositeAll, args);
}
};
registerFilterClass(FilterComposite);
registerFilterClass(FilterCompositeAll);
/******************************************************************************/
@ -2001,7 +2069,7 @@ const FilterBucket = class extends FilterCollection {
filterUnits[iunit] = null;
return;
}
// FilterComposite is assumed here, i.e. with conditions.
// FilterCompositeAll is assumed here, i.e. with conditions.
if ( f.n === 1 ) {
filterUnits[iunit] = null;
iunit = filterSequences[f.i];
@ -2037,7 +2105,7 @@ const FilterParser = class {
this.cantWebsocket = vAPI.cantWebsocket;
this.domainOpt = '';
this.noTokenHash = urlTokenizer.noTokenHash;
this.reBadDomainOptChars = /[*+?^${}()[\]\\]/;
this.reBadDomainOptChars = /[+?^${}()[\]\\]/;
this.reHostnameRule1 = /^\w[\w.-]*[a-z]$/i;
this.reHostnameRule2 = /^\w[\w.-]*[a-z]\^?$/i;
this.reCanTrimCarets1 = /^[^*]*$/;
@ -2651,6 +2719,47 @@ const FilterParser = class {
) &&
this.domainOpt.indexOf('~') === -1;
}
domainIsEntity(s) {
const l = s.length;
return l > 2 &&
s.charCodeAt(l-1) === 0x2A /* '*' */ &&
s.charCodeAt(l-2) === 0x2E /* '.' */;
}
static domainOptIterator(domainOpt) {
return new FilterParser.DomainOptIterator(domainOpt);
}
};
/******************************************************************************/
// Single-use iterator over the `|`-separated entries of a `domain=`
// option value.
//
// The instance is its own iterator result: next() updates `value` and
// `done` in place and returns `this`, so no object is allocated per
// entry. `i` is the position of the next entry, or -1 once the last
// entry has been handed out.
FilterParser.DomainOptIterator = class {
    constructor(domainOpt) {
        this.domainOpt = domainOpt;
        this.i = 0;
        this.value = undefined;
        this.done = false;
    }

    next() {
        const start = this.i;
        if ( start === -1 ) {
            this.value = undefined;
            this.done = true;
            return this;
        }
        const source = this.domainOpt;
        const sep = source.indexOf('|', start);
        const isLast = sep === -1;
        this.value = isLast
            ? source.slice(start)
            : source.slice(start, sep);
        this.i = isLast ? -1 : sep + 1;
        return this;
    }

    [Symbol.iterator]() {
        return this;
    }
};
/******************************************************************************/
@ -3013,37 +3122,50 @@ FilterContainer.prototype.compile = function(raw, writer) {
parsed.makeToken();
const units = [];
// Special pattern/option cases:
// - `*$domain=...`
// - `|http://$domain=...`
// - `|https://$domain=...`
// The semantic of "just-origin" filters is that contrary to normal
// filters, the original filter is split into as many filters as there
// are entries in the `domain=` option.
if ( parsed.isJustOrigin() ) {
const hostnames = parsed.domainOpt.split('|');
if ( parsed.f === '*' ) {
const tokenHash = parsed.tokenHash;
if ( parsed.f === '*' || parsed.f.startsWith('http*') ) {
parsed.tokenHash = this.anyTokenHash;
} else if /* 'https:' */ ( parsed.f.startsWith('https') ) {
parsed.tokenHash = this.anyHTTPSTokenHash;
} else /* 'http:' */ {
parsed.tokenHash = this.anyHTTPTokenHash;
}
for ( const hn of hostnames ) {
const entities = [];
for ( const hn of FilterParser.domainOptIterator(parsed.domainOpt) ) {
if ( parsed.domainIsEntity(hn) === false ) {
this.compileToAtomicFilter(parsed, hn, writer);
} else {
entities.push(hn);
}
}
if ( entities.length === 0 ) { return true; }
parsed.tokenHash = tokenHash;
const leftAnchored = (parsed.anchor & 0b010) !== 0;
for ( const entity of entities ) {
const units = [];
filterPattern.compile(parsed, units);
if ( leftAnchored ) { units.push(FilterAnchorLeft.compile()); }
filterOrigin.compile(entity, true, units);
this.compileToAtomicFilter(
parsed, FilterCompositeAll.compile(units), writer
);
}
return true;
}
const units = [];
// Pattern
filterPattern.compile(parsed, units);
// Type
// EXPERIMENT: $requestTypeBit
//if ( (parsed.typeBits & allNetworkTypesBits) !== 0 ) {
// units.unshift(FilterType.compile(parsed));
// parsed.typeBits &= ~allNetworkTypesBits;
//}
// Anchor
if ( (parsed.anchor & 0b100) !== 0 ) {
if ( parsed.isPureHostname ) {
@ -3061,7 +3183,7 @@ FilterContainer.prototype.compile = function(raw, writer) {
// Origin
if ( parsed.domainOpt !== '' ) {
filterOrigin.compile(
parsed,
parsed.domainOpt,
units.length !== 0 && filterClasses[units[0][0]].isSlow === true,
units
);
@ -3079,7 +3201,7 @@ FilterContainer.prototype.compile = function(raw, writer) {
const fdata = units.length === 1
? units[0]
: FilterComposite.compile(units);
: FilterCompositeAll.compile(units);
this.compileToAtomicFilter(parsed, fdata, writer);
@ -3211,6 +3333,8 @@ FilterContainer.prototype.realmMatchAndFetchData = function(
FilterContainer.prototype.matchAndFetchData = function(fctxt, type) {
$requestURL = urlTokenizer.setURL(fctxt.url);
$docHostname = fctxt.getDocHostname();
$docDomain = fctxt.getDocDomain();
$docEntity.reset();
$requestHostname = fctxt.getHostname();
const partyBits = fctxt.is3rdPartyToDoc() ? ThirdParty : FirstParty;
@ -3399,7 +3523,9 @@ FilterContainer.prototype.matchStringReverse = function(type, url) {
this.$filterUnit = 0;
// These registers will be used by various filters
$docHostname = $requestHostname = µb.URI.hostnameFromURI(url);
$docHostname = $requestHostname = vAPI.hostnameFromNetworkURL(url);
$docDomain = vAPI.domainFromHostname($docHostname);
$docEntity.reset();
// Exception filters
if ( this.realmMatchString(AllowAction, typeBits, FirstParty) ) {
@ -3431,8 +3557,6 @@ FilterContainer.prototype.matchString = function(fctxt, modifiers = 0) {
modifiers |= 0b0001;
}
}
// EXPERIMENT: $requestTypeBit
//$requestTypeBit = 1 << ((typeValue >>> 4) - 1);
if ( (modifiers & 0b0001) !== 0 ) {
if ( typeValue === undefined ) { return 0; }
typeValue |= 0x80000000;
@ -3446,6 +3570,8 @@ FilterContainer.prototype.matchString = function(fctxt, modifiers = 0) {
// These registers will be used by various filters
$docHostname = fctxt.getDocHostname();
$docDomain = fctxt.getDocDomain();
$docEntity.reset();
$requestHostname = fctxt.getHostname();
// Important block filters.
@ -3666,7 +3792,7 @@ FilterContainer.prototype.bucketHistogram = function() {
"FilterHostnameDict" Content => 60772}
"FilterPatternPlain" => 26432}
"FilterComposite" => 17125}
"FilterCompositeAll" => 17125}
"FilterPlainTrie Content" => 13519}
"FilterAnchorHnLeft" => 11931}
"FilterOriginHit" => 5524}
@ -3729,7 +3855,7 @@ FilterContainer.prototype.filterClassHistogram = function() {
filterClassDetails.get(1001).count += f.size;
continue;
}
if ( f instanceof FilterComposite ) {
if ( f instanceof FilterCompositeAll ) {
let i = f.i;
while ( i !== 0 ) {
countFilter(filterUnits[filterSequences[i+0]]);