This commit is contained in:
gorhill 2014-09-19 10:59:44 -04:00
parent 8744dece44
commit 51bb22097c
4 changed files with 138 additions and 96 deletions

View File

@ -89,7 +89,7 @@ return {
firstUpdateAfter: 5 * oneMinute,
nextUpdateAfter: 7 * oneHour,
selfieMagic: 'ccolmudazpvm',
selfieMagic: 'rniacaqskjwz',
selfieAfter: 7 * oneMinute,
pageStores: {},

View File

@ -36,7 +36,7 @@ var LiquidDict = function() {
// Somewhat arbitrary: I need to come up with hard data to know at which
// point binary search is better than indexOf.
this.cutoff = 500;
this.cutoff = 256;
};
/******************************************************************************/
@ -127,7 +127,7 @@ LiquidDict.prototype.test = function(word) {
return bucket[word] !== undefined;
}
if ( bucket.charAt(0) === ' ' ) {
return bucket.indexOf(' ' + word + ' ') >= 0;
return bucket.indexOf(' ' + word + ' ') !== -1;
}
// binary search
var len = word.length;

View File

@ -80,6 +80,7 @@ var reIgnoreComment = /^\[|^!/;
// Whole-string test for a plain hostname: lowercase letters, digits, '.' and
// '-', beginning and ending with a letter or digit (so at least 3 characters).
var reHostnameRule = /^[0-9a-z][0-9a-z.-]+[0-9a-z]$/;
// Leading run of lowercase letters/digits. Carries the `g` flag, so `lastIndex`
// is updated after a match and is read back as the end of the token.
var reHostnameToken = /^[0-9a-z]+/g;
// Run of two or more characters among '%', digits and lowercase letters. Also
// `g`-flagged: `lastIndex` is read back as the end of the token.
var reGoodToken = /[%0-9a-z]{2,}/g;
// Any of '/', '?' or '#'. Deliberately has no `g` flag, so `test()` keeps no
// state between calls.
var reURLPostHostnameAnchors = /[\/?#]/;
var typeNameToTypeValue = {
'stylesheet': 2 << 9,
@ -166,6 +167,9 @@ Filters family tree:
- anchored at end
- no hostname
- specific hostname
- anchored within hostname
- no hostname
- specific hostname (not implemented)
- one wildcard
- anywhere
@ -177,6 +181,9 @@ Filters family tree:
- anchored at end
- no hostname
- specific hostname
- anchored within hostname
- no hostname (not implemented)
- specific hostname (not implemented)
- more than one wildcard
- anywhere
@ -188,6 +195,9 @@ Filters family tree:
- anchored at end
- no hostname
- specific hostname
- anchored within hostname
- no hostname (not implemented)
- specific hostname (not implemented)
*/
@ -458,6 +468,41 @@ FilterPlainRightAnchoredHostname.fromSelfie = function(s) {
/******************************************************************************/
// https://github.com/gorhill/uBlock/issues/235
// The filter is left-anchored somewhere within the hostname part of the URL.
// Plain (wildcard-free) filter whose text must start somewhere inside the
// hostname part of the URL. `s` is the literal text of the filter, without
// the leading '||'.
var FilterPlainHnAnchored = function(s) {
    this.s = s;
};

// url:      the URL being tested.
// tokenBeg: index in `url` at which the filter text is expected to start.
// Returns true only when the filter text is found at `tokenBeg`, the URL
// contains '://', and none of '/', '?' or '#' occurs between the end of
// '://' and `tokenBeg`.
FilterPlainHnAnchored.prototype.match = function(url, tokenBeg) {
    var needle = this.s;
    if ( url.substr(tokenBeg, needle.length) !== needle ) {
        return false;
    }
    var schemeEnd = url.indexOf('://');
    if ( schemeEnd === -1 ) {
        return false;
    }
    // Whatever sits between the scheme separator and the token must be free
    // of path/query/fragment delimiters for the token to be in the hostname.
    var leading = url.slice(schemeEnd + 3, tokenBeg);
    return !reURLPostHostnameAnchors.test(leading);
};

// Filter class id, used as the lookup key when restoring from a selfie.
FilterPlainHnAnchored.prototype.fid = 'h|a';

// Filter rendered back with its '||' prefix.
FilterPlainHnAnchored.prototype.toString = function() {
    var text = this.s;
    return '||' + text;
};

// Serialized form: the raw filter text is all that is needed.
FilterPlainHnAnchored.prototype.toSelfie = function() {
    var text = this.s;
    return text;
};

// Inverse of toSelfie().
FilterPlainHnAnchored.fromSelfie = function(s) {
    var restored = new FilterPlainHnAnchored(s);
    return restored;
};
// https://www.youtube.com/watch?v=71YS6xDB-E4
/******************************************************************************/
// With a single wildcard, regex is not optimal.
// See:
// http://jsperf.com/regexp-vs-indexof-abp-miss/3
@ -770,6 +815,24 @@ FilterManyWildcardsHostname.fromSelfie = function(s) {
/******************************************************************************/
// TODO: Some buckets may grow quite large (see histogram excerpt below).
// Evaluate the gain from having an internal dictionary for such large
// buckets: the key would be created by concatenating the char preceding and
// following the token. The dict would contain smaller buckets, and there
// would be a special bucket for those filters for which a prefix, suffix, or
// both is missing.
// I used to do this, but at a higher level, during tokenization, and in the
// end I found out the overhead was too much. I believe it will be a gain
// here because the special treatment would be only for a few specific tokens,
// not systematically done for all tokens.
// key=Ȁ ad count=655
// key=Ȁ ads count=432
// key=̀ doubleclick count= 94
// key=Ȁ adv count= 89
// key=Ȁ google count= 67
// key=Ȁ banner count= 55
var FilterBucket = function(a, b) {
this.f = null;
this.filters = [];
@ -842,6 +905,9 @@ var makeFilter = function(details, tokenBeg) {
if ( details.anchor > 0 ) {
return new FilterPlainRightAnchored(s);
}
if ( details.hostnameAnchored ) {
return new FilterPlainHnAnchored(s);
}
if ( tokenBeg === 0 ) {
return new FilterPlainPrefix0(s);
}
@ -983,7 +1049,8 @@ FilterParser.prototype.reset = function() {
this.f = '';
this.firstParty = false;
this.fopts = '';
this.hostname = false;
this.hostnameAnchored = false;
this.hostnamePure = false;
this.hostnames.length = 0;
this.notHostname = false;
this.thirdParty = false;
@ -1060,6 +1127,12 @@ FilterParser.prototype.parse = function(s) {
// important!
this.reset();
if ( reHostnameRule.test(s) ) {
this.f = s;
this.hostnamePure = this.hostnameAnchored = true;
return this;
}
// element hiding filter?
if ( s.indexOf('##') >= 0 || s.indexOf('#@') >= 0 ) {
this.elemHiding = true;
@ -1087,7 +1160,7 @@ FilterParser.prototype.parse = function(s) {
// hostname anchoring
if ( s.slice(0, 2) === '||' ) {
this.hostname = true;
this.hostnameAnchored = true;
s = s.slice(2);
}
@ -1110,7 +1183,12 @@ FilterParser.prototype.parse = function(s) {
s = s.replace(/\*\*+/g, '*');
// remove leading and trailing wildcards
this.f = trimChar(s, '*');
s = trimChar(s, '*');
// pure hostname-based?
this.hostnamePure = this.hostnameAnchored && reHostnameRule.test(s);
this.f = s;
if ( !this.fopts ) {
return this;
@ -1274,6 +1352,7 @@ FilterContainer.prototype.fromSelfie = function(selfie) {
'|ah': FilterPlainLeftAnchoredHostname,
'a|': FilterPlainRightAnchored,
'a|h': FilterPlainRightAnchoredHostname,
'h|a': FilterPlainHnAnchored,
'*': FilterSingleWildcard,
'*h': FilterSingleWildcardHostname,
'0*': FilterSingleWildcardPrefix0,
@ -1345,30 +1424,6 @@ FilterContainer.prototype.makeCategoryKey = function(category) {
/******************************************************************************/
// Registers `hostname` in `blockedAnyPartyHostnames`.
// When the dictionary's add() reports a truthy result, the hostname is
// counted as an accepted block filter and true is returned; otherwise it is
// counted as a duplicate and false is returned.
FilterContainer.prototype.addAnyPartyHostname = function(hostname) {
    var added = this.blockedAnyPartyHostnames.add(hostname);
    if ( !added ) {
        this.duplicateCount++;
        return false;
    }
    this.acceptedCount++;
    this.blockFilterCount++;
    return true;
};
/******************************************************************************/
// Registers `hostname` in `blocked3rdPartyHostnames`.
// When the dictionary's add() reports a truthy result, the hostname is
// counted as an accepted block filter and true is returned; otherwise it is
// counted as a duplicate and false is returned.
FilterContainer.prototype.add3rdPartyHostname = function(hostname) {
    var added = this.blocked3rdPartyHostnames.add(hostname);
    if ( !added ) {
        this.duplicateCount++;
        return false;
    }
    this.acceptedCount++;
    this.blockFilterCount++;
    return true;
};
/******************************************************************************/
FilterContainer.prototype.add = function(s) {
// ORDER OF TESTS IS IMPORTANT!
@ -1396,31 +1451,35 @@ FilterContainer.prototype.add = function(s) {
return false;
}
this.processedFilterCount += 1;
this.acceptedCount += 1;
// Pure hostnames, use more efficient liquid dict
if ( parsed.hostnamePure && parsed.action === BlockAction ) {
if ( parsed.fopts === '' ) {
if ( this.blockedAnyPartyHostnames.add(parsed.f) ) {
this.blockFilterCount++;
} else {
this.duplicateCount++;
}
return true;
}
if ( parsed.fopts === 'third-party' ) {
if ( this.blocked3rdPartyHostnames.add(parsed.f) ) {
this.blockFilterCount++;
} else {
this.duplicateCount++;
}
return true;
}
}
if ( this.duplicates[s] ) {
this.duplicateCount++;
return false;
}
this.duplicates[s] = true;
this.processedFilterCount += 1;
// Ignore optionless hostname rules, these will be taken care of by µBlock.
if ( parsed.hostname && parsed.fopts === '' && parsed.action === BlockAction && reHostnameRule.test(parsed.f) ) {
return false;
}
this.acceptedCount += 1;
// Pure third-party hostnames, use more efficient liquid dict
if ( reHostnameRule.test(parsed.f) && parsed.hostname && parsed.action === BlockAction ) {
if ( parsed.fopts === 'third-party' ) {
return this.blocked3rdPartyHostnames.add(parsed.f);
}
if ( parsed.fopts === '' ) {
return this.blockedAnyPartyHostnames.add(parsed.f);
}
}
var r = this.addFilter(parsed);
if ( r === false ) {
return false;
@ -1439,16 +1498,22 @@ FilterContainer.prototype.add = function(s) {
FilterContainer.prototype.addFilter = function(parsed) {
// TODO: avoid duplicates
var matches = parsed.hostname ? findHostnameToken(parsed.f) : findFirstGoodToken(parsed.f);
var matches = parsed.hostnameAnchored ?
findHostnameToken(parsed.f) :
findFirstGoodToken(parsed.f);
if ( !matches || !matches[0].length ) {
return false;
}
var tokenBeg = matches.index;
var tokenEnd = parsed.hostname ? reHostnameToken.lastIndex : reGoodToken.lastIndex;
var tokenEnd = parsed.hostnameAnchored ?
reHostnameToken.lastIndex :
reGoodToken.lastIndex;
var filter;
var i = parsed.hostnames.length;
// Applies to specific domains
if ( i !== 0 && !parsed.notHostname ) {
while ( i-- ) {
filter = makeHostnameFilter(parsed, tokenBeg, parsed.hostnames[i]);
@ -1466,6 +1531,8 @@ FilterContainer.prototype.addFilter = function(parsed) {
return true;
}
// Applies to all domains, with exception(s)
// https://github.com/gorhill/uBlock/issues/191
// Invert the purpose of the filter for negated hostnames
if ( i !== 0 && parsed.notHostname ) {
@ -1498,6 +1565,8 @@ FilterContainer.prototype.addFilter = function(parsed) {
return true;
}
// Applies to all domains without exceptions
filter = makeFilter(parsed, tokenBeg);
if ( !filter ) {
return false;
@ -1630,18 +1699,15 @@ FilterContainer.prototype.matchTokens = function(url) {
// specialized to deal with other complex filters.
// Looks `requestHostname` up in `blockedAnyPartyHostnames`. On a hit the
// result is the string '||' + <matched hostname> + '^'; otherwise false.
// NOTE(review): this span appears to interleave two revisions of the body (a
// walk over a parent-hostname list and a label-stripping loop) -- confirm
// which lines belong to the current revision before editing the code.
FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) {
// Quick test first
if ( this.blockedAnyPartyHostnames.test(requestHostname) ) {
return '||' + requestHostname + '^';
}
// Check parent hostnames if quick test failed
var hostnames = µb.URI.parentHostnamesFromHostname(requestHostname);
for ( var i = 0, n = hostnames.length; i < n; i++ ) {
if ( this.blockedAnyPartyHostnames.test(hostnames[i]) ) {
return '||' + hostnames[i] + '^';
var pos;
// Until the dictionary reports a hit, drop the leftmost label (everything up
// to and including the first '.') and test what remains.
while ( this.blockedAnyPartyHostnames.test(requestHostname) !== true ) {
pos = requestHostname.indexOf('.');
// No '.' left: there is nothing more to strip, hence no match.
if ( pos === -1 ) {
return false;
}
requestHostname = requestHostname.slice(pos + 1);
}
return false;
return '||' + requestHostname + '^';
};
/******************************************************************************/
@ -1655,18 +1721,15 @@ FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) {
// specialized to deal with other complex filters.
// Looks `requestHostname` up in `blocked3rdPartyHostnames`. On a hit the
// result is the string '||' + <matched hostname> + '^$third-party';
// otherwise false.
// NOTE(review): this span appears to interleave two revisions of the body (a
// walk over a parent-hostname list and a label-stripping loop) -- confirm
// which lines belong to the current revision before editing the code.
FilterContainer.prototype.match3rdPartyHostname = function(requestHostname) {
// Quick test first
if ( this.blocked3rdPartyHostnames.test(requestHostname) ) {
return '||' + requestHostname + '^$third-party';
}
// Check parent hostnames if quick test failed
var hostnames = µb.URI.parentHostnamesFromHostname(requestHostname);
for ( var i = 0, n = hostnames.length; i < n; i++ ) {
if ( this.blocked3rdPartyHostnames.test(hostnames[i]) ) {
return '||' + hostnames[i] + '^$third-party';
var pos;
// Until the dictionary reports a hit, drop the leftmost label (everything up
// to and including the first '.') and test what remains.
while ( this.blocked3rdPartyHostnames.test(requestHostname) !== true ) {
pos = requestHostname.indexOf('.');
// No '.' left: there is nothing more to strip, hence no match.
if ( pos === -1 ) {
return false;
}
requestHostname = requestHostname.slice(pos + 1);
}
return false;
return '||' + requestHostname + '^$third-party';
};
/******************************************************************************/

View File

@ -339,9 +339,7 @@
var parseCosmeticFilters = this.userSettings.parseAllABPHideFilters;
var duplicateCount = netFilteringEngine.duplicateCount + cosmeticFilteringEngine.duplicateCount;
var acceptedCount = netFilteringEngine.acceptedCount + cosmeticFilteringEngine.acceptedCount;
var reLocalhost = /(^|\s)(localhost\.localdomain|localhost|local|broadcasthost|0\.0\.0\.0|127\.0\.0\.1|::1|fe80::1%lo0)(?=\s|$)/g;
var reAdblockFilter = /^[^a-z0-9:]|[^a-z0-9]$|[^a-z0-9_:.-]/;
var reAdblockHostFilter = /^\|\|([a-z0-9.-]+[a-z0-9])\^?$/;
var reLocalhost = /(?:^|\s)(?:localhost\.localdomain|localhost|local|broadcasthost|0\.0\.0\.0|127\.0\.0\.1|::1|fe80::1%lo0)(?=\s|$)/g;
var reAsciiSegment = /^[\x21-\x7e]+$/;
var matches;
var lineBeg = 0, lineEnd, currentLineBeg;
@ -392,7 +390,8 @@
// The filter is whatever sequence of printable ascii character without
// whitespaces
matches = reAsciiSegment.exec(line);
if ( !matches || matches.length === 0 ) {
if ( matches === null ) {
//console.debug('µBlock.mergeUbiquitousBlacklist(): skipping "%s"', lineRaw);
continue;
}
@ -404,27 +403,7 @@
continue;
}
line = matches[0];
// Likely an ABP net filter?
if ( reAdblockFilter.test(line) ) {
if ( netFilteringEngine.add(line) ) {
continue;
}
// rhill 2014-01-22: Transpose possible Adblock Plus-filter syntax
// into a plain hostname if possible.
matches = reAdblockHostFilter.exec(line);
if ( !matches || matches.length < 2 ) {
continue;
}
line = matches[1];
}
if ( line === '' ) {
continue;
}
netFilteringEngine.addAnyPartyHostname(line);
netFilteringEngine.add(matches[0]);
}
// For convenience, store the number of entries for this