This commit is contained in:
gorhill 2014-09-19 10:59:44 -04:00
parent 8744dece44
commit 51bb22097c
4 changed files with 138 additions and 96 deletions

View File

@ -89,7 +89,7 @@ return {
firstUpdateAfter: 5 * oneMinute, firstUpdateAfter: 5 * oneMinute,
nextUpdateAfter: 7 * oneHour, nextUpdateAfter: 7 * oneHour,
selfieMagic: 'ccolmudazpvm', selfieMagic: 'rniacaqskjwz',
selfieAfter: 7 * oneMinute, selfieAfter: 7 * oneMinute,
pageStores: {}, pageStores: {},

View File

@ -36,7 +36,7 @@ var LiquidDict = function() {
// Somewhat arbitrary: I need to come up with hard data to know at which // Somewhat arbitrary: I need to come up with hard data to know at which
// point binary search is better than indexOf. // point binary search is better than indexOf.
this.cutoff = 500; this.cutoff = 256;
}; };
/******************************************************************************/ /******************************************************************************/
@ -127,7 +127,7 @@ LiquidDict.prototype.test = function(word) {
return bucket[word] !== undefined; return bucket[word] !== undefined;
} }
if ( bucket.charAt(0) === ' ' ) { if ( bucket.charAt(0) === ' ' ) {
return bucket.indexOf(' ' + word + ' ') >= 0; return bucket.indexOf(' ' + word + ' ') !== -1;
} }
// binary search // binary search
var len = word.length; var len = word.length;

View File

@ -80,6 +80,7 @@ var reIgnoreComment = /^\[|^!/;
var reHostnameRule = /^[0-9a-z][0-9a-z.-]+[0-9a-z]$/; var reHostnameRule = /^[0-9a-z][0-9a-z.-]+[0-9a-z]$/;
var reHostnameToken = /^[0-9a-z]+/g; var reHostnameToken = /^[0-9a-z]+/g;
var reGoodToken = /[%0-9a-z]{2,}/g; var reGoodToken = /[%0-9a-z]{2,}/g;
var reURLPostHostnameAnchors = /[\/?#]/;
var typeNameToTypeValue = { var typeNameToTypeValue = {
'stylesheet': 2 << 9, 'stylesheet': 2 << 9,
@ -166,6 +167,9 @@ Filters family tree:
- anchored at end - anchored at end
- no hostname - no hostname
- specific hostname - specific hostname
- anchored within hostname
- no hostname
- specific hostname (not implemented)
- one wildcard - one wildcard
- anywhere - anywhere
@ -177,6 +181,9 @@ Filters family tree:
- anchored at end - anchored at end
- no hostname - no hostname
- specific hostname - specific hostname
- anchored within hostname
- no hostname (not implemented)
- specific hostname (not implemented)
- more than one wildcard - more than one wildcard
- anywhere - anywhere
@ -188,6 +195,9 @@ Filters family tree:
- anchored at end - anchored at end
- no hostname - no hostname
- specific hostname - specific hostname
- anchored within hostname
- no hostname (not implemented)
- specific hostname (not implemented)
*/ */
@ -458,6 +468,41 @@ FilterPlainRightAnchoredHostname.fromSelfie = function(s) {
/******************************************************************************/ /******************************************************************************/
// https://github.com/gorhill/uBlock/issues/235
// The filter is left-anchored somewhere within the hostname part of the URL.
// Plain (wildcard-free) filter anchored to the hostname part of the URL,
// i.e. the `||` syntax: the filter text must start somewhere inside the
// hostname.
//   s: the literal text to look for in the URL.
var FilterPlainHnAnchored = function(s) {
    this.s = s;
};

// Tests whether the filter text occurs in `url` at offset `tokenBeg`, and
// whether that offset lies inside the hostname.
//   url:      the URL to test.
//   tokenBeg: offset in `url` at which the filter text is expected to start.
// Returns true on a match, false otherwise.
FilterPlainHnAnchored.prototype.match = function(url, tokenBeg) {
    if ( url.substr(tokenBeg, this.s.length) !== this.s ) {
        return false;
    }
    var pos = url.indexOf('://');
    if ( pos === -1 ) {
        return false;
    }
    // The match must not start inside the scheme or its `://` separator.
    // Without this guard the slice below is empty, the regex test fails, and
    // a filter such as `||https` would match every https URL.
    var hostnameBeg = pos + 3;
    if ( tokenBeg < hostnameBeg ) {
        return false;
    }
    // Valid only if hostname-valid characters to the left of token
    return reURLPostHostnameAnchors.test(url.slice(hostnameBeg, tokenBeg)) === false;
};

// Filter class id, used as the lookup key when restoring from a selfie.
FilterPlainHnAnchored.prototype.fid = 'h|a';

// Returns the filter in its textual form.
FilterPlainHnAnchored.prototype.toString = function() {
    return '||' + this.s;
};

// Returns the serialized form of the filter: the literal text itself.
FilterPlainHnAnchored.prototype.toSelfie = function() {
    return this.s;
};

// Builds a filter instance from the string produced by toSelfie().
FilterPlainHnAnchored.fromSelfie = function(s) {
    return new FilterPlainHnAnchored(s);
};
// https://www.youtube.com/watch?v=71YS6xDB-E4
/******************************************************************************/
// With a single wildcard, regex is not optimal. // With a single wildcard, regex is not optimal.
// See: // See:
// http://jsperf.com/regexp-vs-indexof-abp-miss/3 // http://jsperf.com/regexp-vs-indexof-abp-miss/3
@ -770,6 +815,24 @@ FilterManyWildcardsHostname.fromSelfie = function(s) {
/******************************************************************************/ /******************************************************************************/
// TODO: Some buckets may grow quite large (see histogram excerpt below).
// Evaluate the gain from having an internal dictionary for such large
// buckets: the key would be created by concatenating the char preceding and
// following the token. The dict would contain smaller buckets, and there
// would be a special bucket for those filters for which a prefix, suffix, or
// both is missing.
// I used to do this, but at a higher level, during tokenization, and in the
// end I found out the overhead was too much. I believe it will be a gain
// here because the special treatment would be only for a few specific tokens,
// not systematically done for all tokens.
// key=Ȁ ad count=655
// key=Ȁ ads count=432
// key=̀ doubleclick count= 94
// key=Ȁ adv count= 89
// key=Ȁ google count= 67
// key=Ȁ banner count= 55
var FilterBucket = function(a, b) { var FilterBucket = function(a, b) {
this.f = null; this.f = null;
this.filters = []; this.filters = [];
@ -842,6 +905,9 @@ var makeFilter = function(details, tokenBeg) {
if ( details.anchor > 0 ) { if ( details.anchor > 0 ) {
return new FilterPlainRightAnchored(s); return new FilterPlainRightAnchored(s);
} }
if ( details.hostnameAnchored ) {
return new FilterPlainHnAnchored(s);
}
if ( tokenBeg === 0 ) { if ( tokenBeg === 0 ) {
return new FilterPlainPrefix0(s); return new FilterPlainPrefix0(s);
} }
@ -983,7 +1049,8 @@ FilterParser.prototype.reset = function() {
this.f = ''; this.f = '';
this.firstParty = false; this.firstParty = false;
this.fopts = ''; this.fopts = '';
this.hostname = false; this.hostnameAnchored = false;
this.hostnamePure = false;
this.hostnames.length = 0; this.hostnames.length = 0;
this.notHostname = false; this.notHostname = false;
this.thirdParty = false; this.thirdParty = false;
@ -1060,6 +1127,12 @@ FilterParser.prototype.parse = function(s) {
// important! // important!
this.reset(); this.reset();
if ( reHostnameRule.test(s) ) {
this.f = s;
this.hostnamePure = this.hostnameAnchored = true;
return this;
}
// element hiding filter? // element hiding filter?
if ( s.indexOf('##') >= 0 || s.indexOf('#@') >= 0 ) { if ( s.indexOf('##') >= 0 || s.indexOf('#@') >= 0 ) {
this.elemHiding = true; this.elemHiding = true;
@ -1087,7 +1160,7 @@ FilterParser.prototype.parse = function(s) {
// hostname anchoring // hostname anchoring
if ( s.slice(0, 2) === '||' ) { if ( s.slice(0, 2) === '||' ) {
this.hostname = true; this.hostnameAnchored = true;
s = s.slice(2); s = s.slice(2);
} }
@ -1110,7 +1183,12 @@ FilterParser.prototype.parse = function(s) {
s = s.replace(/\*\*+/g, '*'); s = s.replace(/\*\*+/g, '*');
// remove leading and trailing wildcards // remove leading and trailing wildcards
this.f = trimChar(s, '*'); s = trimChar(s, '*');
// pure hostname-based?
this.hostnamePure = this.hostnameAnchored && reHostnameRule.test(s);
this.f = s;
if ( !this.fopts ) { if ( !this.fopts ) {
return this; return this;
@ -1274,6 +1352,7 @@ FilterContainer.prototype.fromSelfie = function(selfie) {
'|ah': FilterPlainLeftAnchoredHostname, '|ah': FilterPlainLeftAnchoredHostname,
'a|': FilterPlainRightAnchored, 'a|': FilterPlainRightAnchored,
'a|h': FilterPlainRightAnchoredHostname, 'a|h': FilterPlainRightAnchoredHostname,
'h|a': FilterPlainHnAnchored,
'*': FilterSingleWildcard, '*': FilterSingleWildcard,
'*h': FilterSingleWildcardHostname, '*h': FilterSingleWildcardHostname,
'0*': FilterSingleWildcardPrefix0, '0*': FilterSingleWildcardPrefix0,
@ -1345,30 +1424,6 @@ FilterContainer.prototype.makeCategoryKey = function(category) {
/******************************************************************************/ /******************************************************************************/
// Registers `hostname` in the any-party blocked hostnames dictionary and
// updates the counters accordingly.
// Returns true if the hostname was added, false if the dictionary reported
// it as not added (counted as a duplicate).
FilterContainer.prototype.addAnyPartyHostname = function(hostname) {
    var wasAdded = this.blockedAnyPartyHostnames.add(hostname);
    if ( !wasAdded ) {
        this.duplicateCount++;
        return false;
    }
    this.acceptedCount++;
    this.blockFilterCount++;
    return true;
};
/******************************************************************************/
// Registers `hostname` in the third-party blocked hostnames dictionary and
// updates the counters accordingly.
// Returns true if the hostname was added, false if the dictionary reported
// it as not added (counted as a duplicate).
FilterContainer.prototype.add3rdPartyHostname = function(hostname) {
    var wasAdded = this.blocked3rdPartyHostnames.add(hostname);
    if ( !wasAdded ) {
        this.duplicateCount++;
        return false;
    }
    this.acceptedCount++;
    this.blockFilterCount++;
    return true;
};
/******************************************************************************/
FilterContainer.prototype.add = function(s) { FilterContainer.prototype.add = function(s) {
// ORDER OF TESTS IS IMPORTANT! // ORDER OF TESTS IS IMPORTANT!
@ -1396,31 +1451,35 @@ FilterContainer.prototype.add = function(s) {
return false; return false;
} }
this.processedFilterCount += 1;
this.acceptedCount += 1;
// Pure hostnames, use more efficient liquid dict
if ( parsed.hostnamePure && parsed.action === BlockAction ) {
if ( parsed.fopts === '' ) {
if ( this.blockedAnyPartyHostnames.add(parsed.f) ) {
this.blockFilterCount++;
} else {
this.duplicateCount++;
}
return true;
}
if ( parsed.fopts === 'third-party' ) {
if ( this.blocked3rdPartyHostnames.add(parsed.f) ) {
this.blockFilterCount++;
} else {
this.duplicateCount++;
}
return true;
}
}
if ( this.duplicates[s] ) { if ( this.duplicates[s] ) {
this.duplicateCount++; this.duplicateCount++;
return false; return false;
} }
this.duplicates[s] = true; this.duplicates[s] = true;
this.processedFilterCount += 1;
// Ignore optionless hostname rules, these will be taken care of by µBlock.
if ( parsed.hostname && parsed.fopts === '' && parsed.action === BlockAction && reHostnameRule.test(parsed.f) ) {
return false;
}
this.acceptedCount += 1;
// Pure third-party hostnames, use more efficient liquid dict
if ( reHostnameRule.test(parsed.f) && parsed.hostname && parsed.action === BlockAction ) {
if ( parsed.fopts === 'third-party' ) {
return this.blocked3rdPartyHostnames.add(parsed.f);
}
if ( parsed.fopts === '' ) {
return this.blockedAnyPartyHostnames.add(parsed.f);
}
}
var r = this.addFilter(parsed); var r = this.addFilter(parsed);
if ( r === false ) { if ( r === false ) {
return false; return false;
@ -1439,16 +1498,22 @@ FilterContainer.prototype.add = function(s) {
FilterContainer.prototype.addFilter = function(parsed) { FilterContainer.prototype.addFilter = function(parsed) {
// TODO: avoid duplicates // TODO: avoid duplicates
var matches = parsed.hostname ? findHostnameToken(parsed.f) : findFirstGoodToken(parsed.f); var matches = parsed.hostnameAnchored ?
findHostnameToken(parsed.f) :
findFirstGoodToken(parsed.f);
if ( !matches || !matches[0].length ) { if ( !matches || !matches[0].length ) {
return false; return false;
} }
var tokenBeg = matches.index; var tokenBeg = matches.index;
var tokenEnd = parsed.hostname ? reHostnameToken.lastIndex : reGoodToken.lastIndex; var tokenEnd = parsed.hostnameAnchored ?
reHostnameToken.lastIndex :
reGoodToken.lastIndex;
var filter; var filter;
var i = parsed.hostnames.length; var i = parsed.hostnames.length;
// Applies to specific domains
if ( i !== 0 && !parsed.notHostname ) { if ( i !== 0 && !parsed.notHostname ) {
while ( i-- ) { while ( i-- ) {
filter = makeHostnameFilter(parsed, tokenBeg, parsed.hostnames[i]); filter = makeHostnameFilter(parsed, tokenBeg, parsed.hostnames[i]);
@ -1466,6 +1531,8 @@ FilterContainer.prototype.addFilter = function(parsed) {
return true; return true;
} }
// Applies to all domains, with exception(s)
// https://github.com/gorhill/uBlock/issues/191 // https://github.com/gorhill/uBlock/issues/191
// Invert the purpose of the filter for negated hostnames // Invert the purpose of the filter for negated hostnames
if ( i !== 0 && parsed.notHostname ) { if ( i !== 0 && parsed.notHostname ) {
@ -1498,6 +1565,8 @@ FilterContainer.prototype.addFilter = function(parsed) {
return true; return true;
} }
// Applies to all domains without exceptions
filter = makeFilter(parsed, tokenBeg); filter = makeFilter(parsed, tokenBeg);
if ( !filter ) { if ( !filter ) {
return false; return false;
@ -1630,18 +1699,15 @@ FilterContainer.prototype.matchTokens = function(url) {
// specialized to deal with other complex filters. // specialized to deal with other complex filters.
FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) { FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) {
// Quick test first var pos;
if ( this.blockedAnyPartyHostnames.test(requestHostname) ) { while ( this.blockedAnyPartyHostnames.test(requestHostname) !== true ) {
return '||' + requestHostname + '^'; pos = requestHostname.indexOf('.');
} if ( pos === -1 ) {
// Check parent hostnames if quick test failed return false;
var hostnames = µb.URI.parentHostnamesFromHostname(requestHostname);
for ( var i = 0, n = hostnames.length; i < n; i++ ) {
if ( this.blockedAnyPartyHostnames.test(hostnames[i]) ) {
return '||' + hostnames[i] + '^';
} }
requestHostname = requestHostname.slice(pos + 1);
} }
return false; return '||' + requestHostname + '^';
}; };
/******************************************************************************/ /******************************************************************************/
@ -1655,18 +1721,15 @@ FilterContainer.prototype.matchAnyPartyHostname = function(requestHostname) {
// specialized to deal with other complex filters. // specialized to deal with other complex filters.
FilterContainer.prototype.match3rdPartyHostname = function(requestHostname) { FilterContainer.prototype.match3rdPartyHostname = function(requestHostname) {
// Quick test first var pos;
if ( this.blocked3rdPartyHostnames.test(requestHostname) ) { while ( this.blocked3rdPartyHostnames.test(requestHostname) !== true ) {
return '||' + requestHostname + '^$third-party'; pos = requestHostname.indexOf('.');
} if ( pos === -1 ) {
// Check parent hostnames if quick test failed return false;
var hostnames = µb.URI.parentHostnamesFromHostname(requestHostname);
for ( var i = 0, n = hostnames.length; i < n; i++ ) {
if ( this.blocked3rdPartyHostnames.test(hostnames[i]) ) {
return '||' + hostnames[i] + '^$third-party';
} }
requestHostname = requestHostname.slice(pos + 1);
} }
return false; return '||' + requestHostname + '^$third-party';
}; };
/******************************************************************************/ /******************************************************************************/

View File

@ -339,9 +339,7 @@
var parseCosmeticFilters = this.userSettings.parseAllABPHideFilters; var parseCosmeticFilters = this.userSettings.parseAllABPHideFilters;
var duplicateCount = netFilteringEngine.duplicateCount + cosmeticFilteringEngine.duplicateCount; var duplicateCount = netFilteringEngine.duplicateCount + cosmeticFilteringEngine.duplicateCount;
var acceptedCount = netFilteringEngine.acceptedCount + cosmeticFilteringEngine.acceptedCount; var acceptedCount = netFilteringEngine.acceptedCount + cosmeticFilteringEngine.acceptedCount;
var reLocalhost = /(^|\s)(localhost\.localdomain|localhost|local|broadcasthost|0\.0\.0\.0|127\.0\.0\.1|::1|fe80::1%lo0)(?=\s|$)/g; var reLocalhost = /(?:^|\s)(?:localhost\.localdomain|localhost|local|broadcasthost|0\.0\.0\.0|127\.0\.0\.1|::1|fe80::1%lo0)(?=\s|$)/g;
var reAdblockFilter = /^[^a-z0-9:]|[^a-z0-9]$|[^a-z0-9_:.-]/;
var reAdblockHostFilter = /^\|\|([a-z0-9.-]+[a-z0-9])\^?$/;
var reAsciiSegment = /^[\x21-\x7e]+$/; var reAsciiSegment = /^[\x21-\x7e]+$/;
var matches; var matches;
var lineBeg = 0, lineEnd, currentLineBeg; var lineBeg = 0, lineEnd, currentLineBeg;
@ -392,7 +390,8 @@
// The filter is whatever sequence of printable ascii character without // The filter is whatever sequence of printable ascii character without
// whitespaces // whitespaces
matches = reAsciiSegment.exec(line); matches = reAsciiSegment.exec(line);
if ( !matches || matches.length === 0 ) { if ( matches === null ) {
//console.debug('µBlock.mergeUbiquitousBlacklist(): skipping "%s"', lineRaw);
continue; continue;
} }
@ -404,27 +403,7 @@
continue; continue;
} }
line = matches[0]; netFilteringEngine.add(matches[0]);
// Likely an ABP net filter?
if ( reAdblockFilter.test(line) ) {
if ( netFilteringEngine.add(line) ) {
continue;
}
// rhill 2014-01-22: Transpose possible Adblock Plus-filter syntax
// into a plain hostname if possible.
matches = reAdblockHostFilter.exec(line);
if ( !matches || matches.length < 2 ) {
continue;
}
line = matches[1];
}
if ( line === '' ) {
continue;
}
netFilteringEngine.addAnyPartyHostname(line);
} }
// For convenience, store the number of entries for this // For convenience, store the number of entries for this