with the new `important` filter option, a tokenizer makes sense now

This commit is contained in:
gorhill 2014-09-21 14:03:41 -04:00
parent d6b501b264
commit c48a99c4bf
1 changed files with 82 additions and 20 deletions

View File

@ -63,7 +63,7 @@ var typeNameToTypeValue = {
'other': 8 << 4,
'popup': 9 << 4
};
const AnyType = typeNameToTypeValue['any'];
const AnyType = typeNameToTypeValue.any;
const BlockAnyTypeAnyParty = BlockAction | AnyType | AnyParty;
const BlockAnyType1stParty = BlockAction | AnyType | FirstParty;
@ -90,8 +90,10 @@ var reURLPostHostnameAnchors = /[\/?#]/;
// regex tester: http://regex101.com/
/******************************************************************************/
var histogram = function() {};
/*
var histogram = function(label, categories) {
histogram = function(label, categories) {
var h = [],
categoryBucket;
for ( var k in categories ) {
@ -101,7 +103,7 @@ var histogram = function(label, categories) {
// No need for hasOwnProperty() here: there is no prototype chain.
filterBucket = categoryBucket[kk];
h.push({
k: k + ' ' + kk,
k: k.charCodeAt(0).toString(2) + ' ' + kk,
n: filterBucket instanceof FilterBucket ? filterBucket.filters.length : 1
});
}
@ -815,16 +817,30 @@ FilterManyWildcardsHostname.fromSelfie = function(s) {
// here because the special treatment would be only for a few specific tokens,
// not systematically done for all tokens.
// key=?? ad count=657
// key=?? ads count=431
// key=?? mdn count=267
// key=?? google count=181
// key=?? pagead2 count=166
// key=?? doubleclick count=118
// key=?? g count=100
// key=?? doubleclick count=94
// key=?? js count=88
// key=?? adv count=88
// key= 10000 ad count=660
// key= 10000 ads count=433
// key= 10001 google count=277
// key=1000000 2mdn count=267
// key= 10000 social count=240
// key= 10001 pagead2 count=166
// key= 10000 twitter count=122
// key= 10000 doubleclick count=118
// key= 10000 facebook count=114
// key= 10000 share count=113
// key= 10000 google count=106
// key= 10001 code count=103
// key= 11000 doubleclick count=100
// key=1010001 g count=100
// key= 10001 js count= 89
// key= 10000 adv count= 88
// key= 10000 youtube count= 61
// key= 10000 plugins count= 60
// key= 10001 partner count= 59
// key= 10000 ico count= 57
// key= 110001 ssl count= 57
// key= 10000 banner count= 53
// key= 10000 footer count= 51
// key= 10000 rss count= 51
var FilterBucket = function(a, b) {
this.f = null;
@ -1218,8 +1234,17 @@ FilterParser.prototype.parse = function(s) {
/******************************************************************************/
/******************************************************************************/
var TokenEntry = function() {
this.beg = 0;
this.end = 0;
};
/******************************************************************************/
/******************************************************************************/
var FilterContainer = function() {
this.reAnyToken = /[%0-9a-z]+/g;
this.tokens = [];
this.buckets = new Array(4);
this.blockedAnyPartyHostnames = new µb.LiquidDict();
this.blocked3rdPartyHostnames = new µb.LiquidDict();
@ -1249,7 +1274,7 @@ FilterContainer.prototype.reset = function() {
/******************************************************************************/
FilterContainer.prototype.freeze = function() {
//histogram('allFilters', this.categories);
histogram('allFilters', this.categories);
this.blockedAnyPartyHostnames.freeze();
this.blocked3rdPartyHostnames.freeze();
this.duplicates = Object.create(null);
@ -1575,19 +1600,49 @@ FilterContainer.prototype.addToCategory = function(category, tokenKey, filter) {
/******************************************************************************/
FilterContainer.prototype.matchTokens = function(url) {
FilterContainer.prototype.tokenize = function(url) {
var tokens = this.tokens;
var re = this.reAnyToken;
var matches, beg, token, f;
var matches, tokenEntry;
re.lastIndex = 0;
var i = 0;
while ( matches = re.exec(url) ) {
tokenEntry = tokens[i];
if ( tokenEntry === undefined ) {
tokenEntry = tokens[i] = new TokenEntry();
}
tokenEntry.beg = matches.index;
tokenEntry.end = re.lastIndex;
i += 1;
}
// Sentinel
tokenEntry = tokens[i];
if ( tokenEntry === undefined ) {
tokenEntry = tokens[i] = new TokenEntry();
}
tokenEntry.end = 0;
};
/******************************************************************************/
FilterContainer.prototype.matchTokens = function(url) {
var buckets = this.buckets;
var bucket0 = buckets[0];
var bucket1 = buckets[1];
var bucket2 = buckets[2];
var bucket3 = buckets[3];
re.lastIndex = 0;
while ( matches = re.exec(url) ) {
beg = matches.index;
token = url.slice(beg, re.lastIndex);
var tokens = this.tokens;
var tokenEntry, beg, end, token, f;
var i = 0;
for (;;) {
tokenEntry = tokens[i++];
end = tokenEntry.end;
if ( end === 0 ) {
break;
}
beg = tokenEntry.beg;
token = url.slice(beg, end);
if ( bucket0 !== undefined ) {
f = bucket0[token];
if ( f !== undefined && f.match(url, beg) !== false ) {
@ -1682,6 +1737,10 @@ FilterContainer.prototype.matchStringExactType = function(pageDetails, requestUR
// This will be used by hostname-based filters
pageHostname = pageDetails.pageHostname || '';
// Tokenize only once
this.tokenize(url);
// We are testing for a specific type, skip "any type" buckets
buckets[0] = buckets[1] = undefined;
// https://github.com/gorhill/uBlock/issues/139
@ -1763,6 +1822,9 @@ FilterContainer.prototype.matchString = function(pageDetails, requestURL, reques
var categories = this.categories;
var buckets = this.buckets;
// Tokenize only once
this.tokenize(url);
// https://github.com/gorhill/uBlock/issues/139
// Test against important block filters.
// The purpose of the `important` option is to reverse the order of