Ignore pointless trailing `*^` in network filters

There are currently over 160 patterns with such a pointless
trailing `*^` in uBO's filter lists. These ended up being
compiled as generic pattern filters (i.e. regex-based
internally), even though the trailing `*^` accomplishes
nothing: it will always match the end of a URL (`^` can
also match the end of a URL).

This commit discards pointless trailing `*^` in patterns,
thus allowing most of those filters to be compiled as
plain pattern filters.

The syntax highlighter will reflect that a trailing
`*^` is pointless.
This commit is contained in:
Raymond Hill 2021-12-11 09:45:25 -05:00
parent ca1ec1461b
commit 3b7a265ee2
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
2 changed files with 138 additions and 128 deletions

View File

@ -471,18 +471,21 @@ const Parser = class {
}
}
// If the pattern is a regex, remember this.
// Assume no anchors.
this.patternLeftAnchorSpan.i = this.patternSpan.i;
this.patternRightAnchorSpan.i = this.optionsAnchorSpan.i;
// Skip all else if pattern is a regex
if ( patternIsRegex ) {
this.patternBits = this.bitsFromSpan(this.patternSpan);
this.flavorBits |= BITFlavorNetRegex;
this.category = CATStaticNetFilter;
return;
}
// Refine by processing pattern anchors.
//
// Assume no anchors.
this.patternLeftAnchorSpan.i = this.patternSpan.i;
this.patternRightAnchorSpan.i = this.optionsAnchorSpan.i;
// Not a regex, there might be anchors.
if ( patternIsRegex === false ) {
// Left anchor?
// `|`: anchor to start of URL
// `||`: anchor to left of a hostname label
@ -542,7 +545,6 @@ const Parser = class {
this.flavorBits |= BITFlavorNetRightHnAnchor;
}
}
}
// Collate useful pattern bits information for further use.
//
@ -553,16 +555,16 @@ const Parser = class {
// the part following the space character.
// https://github.com/uBlockOrigin/uBlock-issues/issues/1118
// Patterns with more than one space are dubious.
{
if ( hasBits(this.allBits, BITSpace) ) {
const { i, len } = this.patternSpan;
const noOptionsAnchor = this.optionsAnchorSpan.len === 0;
let j = len;
for (;;) {
if ( j === 0 ) { break; }
j -= 3;
const bits = this.slices[i+j];
if ( noOptionsAnchor && hasBits(bits, BITSpace) ) { break; }
this.patternBits |= bits;
if ( noOptionsAnchor && hasBits(this.slices[i+j], BITSpace) ) {
break;
}
}
if ( j !== 0 ) {
const sink = this.strFromSlices(this.patternSpan.i, j - 3);
@ -587,86 +589,87 @@ const Parser = class {
}
}
// Pointless wildcards and anchoring:
// Pointless wildcards:
// - Eliminate leading wildcard not followed by a pattern token slice
// - Eliminate trailing wildcard not preceded by a pattern token slice
// - Eliminate pattern anchoring when irrelevant
// - Eliminate pointless trailing asterisk-caret (`*^`)
//
// Leading wildcard history:
// https://github.com/gorhill/uBlock/issues/1669#issuecomment-224822448
// Remove pointless leading *.
// https://github.com/gorhill/uBlock/issues/3034
// We can remove anchoring if we need to match all at the start.
//
// Trailing wildcard history:
// https://github.com/gorhill/uBlock/issues/3034
// We can remove anchoring if we need to match all at the end.
{
if ( hasBits(this.allBits, BITAsterisk) ) {
let { i, len } = this.patternSpan;
let pattern = this.strFromSpan(this.patternSpan);
// Pointless leading wildcard
if (
len > 3 &&
hasBits(this.slices[i], BITAsterisk) &&
hasNoBits(this.slices[i+3], BITPatternToken)
) {
if ( /^\*+[^0-9a-z%]/.test(pattern) ) {
this.slices[i] |= BITIgnore;
i += 3; len -= 3;
this.patternSpan.i = i;
this.patternSpan.len = len;
// We can ignore left-hand pattern anchor
if ( this.patternLeftAnchorSpan.len !== 0 ) {
this.slices[this.patternLeftAnchorSpan.i] |= BITIgnore;
this.flavorBits &= ~BITFlavorNetLeftAnchor;
}
this.patternSpan.i = (i += 3);
this.patternSpan.len = (len -= 3);
pattern = this.strFromSpan(this.patternSpan);
}
// Pointless trailing wildcard
if (
len > 3 &&
hasBits(this.slices[i+len-3], BITAsterisk) &&
hasNoBits(this.slices[i+len-6], BITPatternToken)
) {
if ( /([^0-9a-z%]|[0-9a-z%]{7,})\*+$/.test(pattern) ) {
this.patternSpan.len = (len -= 3);
pattern = this.strFromSpan(this.patternSpan);
// Ignore only if the pattern would not end up looking like
// a regex.
if (
hasNoBits(this.slices[i], BITSlash) ||
hasNoBits(this.slices[i+len-6], BITSlash)
) {
this.slices[i+len-3] |= BITIgnore;
if ( /^\/.+\/$/.test(pattern) === false ) {
this.slices[i+len] |= BITIgnore;
}
len -= 3;
this.patternSpan.len = len;
// We can ignore right-hand pattern anchor
if ( this.patternRightAnchorSpan.len !== 0 ) {
this.slices[this.patternRightAnchorSpan.i] |= BITIgnore;
this.flavorBits &= ~BITFlavorNetRightAnchor;
}
}
// Pointless trailing caret (when preceded by a wildcard)
// TODO
//
// Pointless left-hand pattern anchoring
if (
(
len === 0 ||
len !== 0 && hasBits(this.slices[i], BITAsterisk)
) &&
hasBits(this.flavorBits, BITFlavorNetLeftAnchor)
) {
this.slices[this.patternLeftAnchorSpan.i] |= BITIgnore;
this.flavorBits &= ~BITFlavorNetLeftAnchor;
}
// Pointless right-hand pattern anchoring
if (
(
len === 0 ||
len !== 0 && hasBits(this.slices[i+len-3], BITAsterisk)
) &&
hasBits(this.flavorBits, BITFlavorNetRightAnchor)
) {
// Pointless trailing asterisk-caret: `..*^`, `..*^|`
if ( hasBits(this.allBits, BITCaret) && /\*+\^$/.test(pattern) ) {
this.slices[i+len-3] |= BITIgnore;
this.slices[i+len-6] |= BITIgnore;
this.patternSpan.len = (len -= 6);
pattern = this.strFromSpan(this.patternSpan);
// We can ignore right-hand pattern anchor
if ( this.patternRightAnchorSpan.len !== 0 ) {
this.slices[this.patternRightAnchorSpan.i] |= BITIgnore;
this.flavorBits &= ~BITFlavorNetRightAnchor;
}
}
}
// Pointless left-hand pattern anchoring
//
// Leading wildcard history:
// https://github.com/gorhill/uBlock/issues/3034
// We can remove anchoring if we need to match all at the start.
if ( hasBits(this.flavorBits, BITFlavorNetLeftAnchor) ) {
const i = this.patternLeftAnchorSpan.i;
if (
this.patternSpan.len === 0 ||
hasBits(this.slices[i+3], BITIgnore|BITAsterisk)
) {
this.slices[i] |= BITIgnore;
this.flavorBits &= ~BITFlavorNetLeftAnchor;
}
}
// Pointless right-hand pattern anchoring
//
// Trailing wildcard history:
// https://github.com/gorhill/uBlock/issues/3034
// We can remove anchoring if we need to match all at the end.
if ( hasBits(this.flavorBits, BITFlavorNetRightAnchor) ) {
    // Index of the right-hand anchor slice. This must be the `.i` of the
    // *right* anchor span: using the left anchor span object itself makes
    // `i-3` evaluate to NaN and `this.slices[i]` write to a bogus key.
    const i = this.patternRightAnchorSpan.i;
    if (
        // Nothing left in the pattern: anchoring is meaningless.
        this.patternSpan.len === 0 ||
        // The slice record immediately before the anchor (records are
        // 3 entries wide, as elsewhere in this parser) is an ignored
        // slice or a wildcard: the pattern already matches anything at
        // its right end.
        hasBits(this.slices[i-3], BITIgnore|BITAsterisk)
    ) {
        // Flag the anchor slice as ignored and drop the right-anchor flavor.
        this.slices[i] |= BITIgnore;
        this.flavorBits &= ~BITFlavorNetRightAnchor;
    }
}
// Collate effective pattern bits
this.patternBits = this.bitsFromSpan(this.patternSpan);
this.category = CATStaticNetFilter;
}
@ -1177,6 +1180,15 @@ const Parser = class {
return true;
}
bitsFromSpan(span) {
const { i, len } = span;
let bits = 0;
for ( let j = 0; j < len; j += 3 ) {
bits |= this.slices[i+j];
}
return bits;
}
hasFlavor(bits) {
return hasBits(this.flavorBits, bits);
}

View File

@ -3254,20 +3254,18 @@ class FilterCompiler {
units.push(FilterPatternGeneric.compile(this));
return;
}
if ( this.wildcardPos === -1 && this.caretPos === -1 ) {
if ( this.wildcardPos === -1 ) {
if ( this.caretPos === -1 ) {
units.push(FilterPatternPlain.compile(this));
return;
}
// Optimize special case: plain pattern with trailing caret
if (
this.wildcardPos === -1 &&
this.caretPos === (this.pattern.length - 1)
) {
if ( this.caretPos === (this.pattern.length - 1) ) {
this.pattern = this.pattern.slice(0, -1);
units.push(FilterPatternPlain.compile(this));
units.push(FilterTrailingSeparator.compile());
return;
}
}
units.push(FilterPatternGeneric.compile(this));
}