Add ability to parse `removeparam=` as `queryprune=`

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/1356

Related commit:
- bde3164eb4

It is not possible to achieve perfect compatiblity at this
point, but reasonable compatibility should be achieved for
a majority of instances of `removeparam=`.

Notable differences:
--------------------

uBO always matches in a case insensitive manner, there is
no need to ask for case-insensitivity, and no need to use
uppercase characters in `queryprune=` values.

uBO does not escape special regex characters since the
`queryprune=` values are always assumed to be literal
regex expression (leaving out the documented special
characters). This means `removeparam=` with characters
which are special regex characters won't be properly
translated and are unlikely to work properly in uBO.

For example, the `queryprune` value of a filter such as
`$removeparam=__xts__[0]` internally become the literal
regex `/__xts__[0]/`, and consequently would not match
a query parameter such as `...?__xts__[0]=...`.

Notes:
------

Additionally, for performance reason, when uBO encounter
a pattern-less `queryprune=` (or `removeparam=`) filter,
it will try to extract a valid pattern from the
`queryprune=` value. For instance, the following filter:

    $queryprune=utm_campaign

Will be translated internally into:

    utm_campaign$queryprune=utm_campaign

The logger will reflect this internal translation.
This commit is contained in:
Raymond Hill 2020-11-26 09:34:12 -05:00
parent 80413dff83
commit 6ac09a2856
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
2 changed files with 74 additions and 20 deletions

View File

@ -2092,6 +2092,7 @@ const netOptionTokenDescriptors = new Map([
[ 'popunder', OPTTokenPopunder | OPTNonNetworkType | OPTNonCspableType | OPTNonRedirectableType ],
[ 'popup', OPTTokenPopup | OPTNonNetworkType | OPTCanNegate | OPTNonCspableType | OPTNonRedirectableType ],
[ 'queryprune', OPTTokenQueryprune | OPTMustAssign | OPTAllowMayAssign | OPTModifierType | OPTNonCspableType | OPTNonRedirectableType ],
[ 'removeparam', OPTTokenQueryprune | OPTMustAssign | OPTAllowMayAssign | OPTModifierType | OPTNonCspableType | OPTNonRedirectableType ],
[ 'redirect', OPTTokenRedirect | OPTMustAssign | OPTAllowMayAssign | OPTModifierType ],
[ 'redirect-rule', OPTTokenRedirectRule | OPTMustAssign | OPTAllowMayAssign | OPTModifierType | OPTNonCspableType ],
[ 'script', OPTTokenScript | OPTCanNegate | OPTNetworkType | OPTModifiableType | OPTRedirectableType | OPTNonCspableType ],
@ -2147,6 +2148,7 @@ Parser.netOptionTokenIds = new Map([
[ 'popunder', OPTTokenPopunder ],
[ 'popup', OPTTokenPopup ],
[ 'queryprune', OPTTokenQueryprune ],
[ 'removeparam', OPTTokenQueryprune ],
[ 'redirect', OPTTokenRedirect ],
[ 'redirect-rule', OPTTokenRedirectRule ],
[ 'script', OPTTokenScript ],

View File

@ -2628,7 +2628,7 @@ const FilterParser = class {
this.noTokenHash = urlTokenizer.noTokenHash;
this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
this.reRegexToken = /[%0-9A-Za-z]+/g;
this.reToken = /[%0-9A-Za-z]+/g;
this.reRegexTokenAbort = /[\(\)\[\]]/;
this.reRegexBadPrefix = /(^|[^\\]\.|\\[%SDWsdw]|[^\\][()*+?[\\\]{}])$/;
this.reRegexBadSuffix = /^([^\\]\.|\\[%SDWsdw]|[()*+?[\]{}]|$)/;
@ -3110,34 +3110,48 @@ const FilterParser = class {
// i.e. very common with a high probability of ending up as a miss,
// are not good. Avoid if possible. This has a significant positive
// impact on performance.
//
// For pattern-less queryprune filters, try to derive a pattern from
// the queryprune value.
makeToken() {
if ( this.pattern === '*' ) { return; }
if ( this.pattern === '*' ) {
if (
this.modifyType !== this.parser.OPTTokenQueryprune ||
this.makePatternFromQuerypruneValue() === false
) {
return;
}
}
if ( this.isRegex ) {
return this.extractTokenFromRegex();
}
const match = this.extractTokenFromPattern();
if ( match === null ) { return; }
this.token = match.token;
this.tokenHash = urlTokenizer.tokenHashFromString(this.token);
this.tokenBeg = match.pos;
this.extractTokenFromPattern();
}
// Note: a one-char token is better than a documented bad token.
extractTokenFromPattern() {
this.reToken.lastIndex = 0;
const pattern = this.pattern;
let bestMatch = null;
let bestBadness = 0x7FFFFFFF;
for ( const match of this.parser.patternTokens() ) {
const badness = match.token.length > 1
? this.badTokens.get(match.token) || 0
for (;;) {
const match = this.reToken.exec(pattern);
if ( match === null ) { break; }
const badness = match[0].length > 1
? this.badTokens.get(match[0]) || 0
: 1;
if ( badness === 0 ) { return match; }
if ( badness < bestBadness ) {
bestMatch = match;
if ( badness === 0 ) { break; }
bestBadness = badness;
}
}
return bestMatch;
if ( bestMatch !== null ) {
this.token = bestMatch[0];
this.tokenHash = urlTokenizer.tokenHashFromString(this.token);
this.tokenBeg = bestMatch.index;
}
}
// https://github.com/gorhill/uBlock/issues/2781
@ -3147,15 +3161,16 @@ const FilterParser = class {
// Mind `\b` directives: `/\bads\b/` should result in token being `ads`,
// not `bads`.
extractTokenFromRegex() {
this.reRegexToken.lastIndex = 0;
const s = this.pattern;
this.reToken.lastIndex = 0;
const pattern = this.pattern;
let bestToken;
let bestBadness = 0x7FFFFFFF;
for (;;) {
const matches = this.reRegexToken.exec(s);
const matches = this.reToken.exec(pattern);
if ( matches === null ) { break; }
let token = matches[0];
let prefix = s.slice(0, matches.index);
let suffix = s.slice(this.reRegexToken.lastIndex);
let prefix = pattern.slice(0, matches.index);
let suffix = pattern.slice(this.reToken.lastIndex);
if (
this.reRegexTokenAbort.test(prefix) &&
this.reRegexTokenAbort.test(suffix)
@ -3181,13 +3196,47 @@ const FilterParser = class {
? this.badTokens.get(token) || 0
: 1;
if ( badness < bestBadness ) {
this.token = token.toLowerCase();
this.tokenHash = urlTokenizer.tokenHashFromString(this.token);
this.tokenBeg = matches.index;
bestToken = token;
if ( badness === 0 ) { break; }
bestBadness = badness;
}
}
if ( bestToken !== undefined ) {
this.token = bestToken.toLowerCase();
this.tokenHash = urlTokenizer.tokenHashFromString(this.token);
}
}
makePatternFromQuerypruneValue() {
let pattern = this.modifyValue;
if ( pattern === '*' || pattern.charCodeAt(0) === 0x21 /* '!' */ ) {
return false;
}
if ( /^\w+$/.test(pattern) ) {
this.pattern = `${pattern}=`;
return true;
}
const reRegex = /^\/(.+)\/i?$/;
if ( reRegex.test(pattern) ) {
pattern = reRegex.exec(pattern)[1];
} else {
let prefix = '', suffix = '';
if ( pattern.startsWith('|') ) {
pattern = pattern.slice(1);
prefix = '\\b';
}
if ( pattern.endsWith('|') ) {
pattern = pattern.slice(0, -1);
suffix = '\\b';
}
if ( pattern.indexOf('|') !== -1 ) {
pattern = `(?:${pattern})`;
}
pattern = prefix + pattern + suffix;
}
this.pattern = pattern;
this.isRegex = true;
return true;
}
hasNoOptionUnits() {
@ -4288,6 +4337,7 @@ FilterContainer.prototype.filterQuery = function(fctxt) {
FilterContainer.prototype.parseFilterPruneValue = function(modifier) {
const cache = {};
const reRegex = /^\/(.+)\/i?$/;
let retext = modifier.value;
if ( retext === '*' ) {
cache.all = true;
@ -4296,6 +4346,8 @@ FilterContainer.prototype.parseFilterPruneValue = function(modifier) {
if ( cache.not ) { retext = retext.slice(1); }
if ( /^\w+$/.test(retext) ) {
retext = `^${retext}=`;
} else if ( reRegex.test(retext) ) {
retext = reRegex.exec(retext)[1];
} else {
if ( retext.startsWith('|') ) { retext = `^${retext.slice(1)}`; }
if ( retext.endsWith('|') ) { retext = `${retext.slice(0,-1)}$`; }