Add a new static filtering parser

A new standalone static filtering parser is introduced,
vAPI.StaticFilteringParser. Its purpose is to parse a
line of text into a representation suitable for
compiling filters. It can additionally serve for
syntax highlighting purposes.

As a side effect, this solves:
- https://github.com/uBlockOrigin/uBlock-issues/issues/1038

This is a first draft; there is more work left to do
to further perfect the implementation and extend its
capabilities, especially those useful to assist filter
authors.

For the time being, this commit breaks line-continuation
syntax highlighting -- which was already flaky prior to
this commit anyway.
This commit is contained in:
Raymond Hill 2020-06-04 07:18:54 -04:00
parent e8c8fab8c8
commit 01b1ed9a98
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
10 changed files with 1917 additions and 568 deletions

View File

@ -54,6 +54,7 @@
<script src="js/i18n.js"></script>
<script src="js/dashboard-common.js"></script>
<script src="js/cloud-ui.js"></script>
<script src="js/static-filtering-parser.js"></script>
<script src="js/1p-filters.js"></script>
</body>

View File

@ -45,6 +45,7 @@ body {
<script src="js/udom.js"></script>
<script src="js/i18n.js"></script>
<script src="js/dashboard-common.js"></script>
<script src="js/static-filtering-parser.js"></script>
<script src="js/asset-viewer.js"></script>
</body>

View File

@ -26,8 +26,9 @@
<script src="js/filtering-context.js"></script>
<script src="js/redirect-engine.js"></script>
<script src="js/dynamic-net-filtering.js"></script>
<script src="js/static-net-filtering.js"></script>
<script src="js/url-net-filtering.js"></script>
<script src="js/static-filtering-parser.js"></script>
<script src="js/static-net-filtering.js"></script>
<script src="js/static-ext-filtering.js"></script>
<script src="js/cosmetic-filtering.js"></script>
<script src="js/scriptlet-filtering.js"></script>

View File

@ -22,7 +22,17 @@
word-break: break-all;
}
/* CodeMirror theme overrides */
.cm-s-default .cm-string-2 { color: #a30; }
.cm-s-default .cm-comment { color: #777; }
.cm-s-default .cm-keyword { color: #90b; }
.cm-s-default .cm-error,
.CodeMirror-linebackground.error {
background-color: #ff000018;
text-decoration: underline red;
text-underline-position: under;
}
.cm-directive { color: #333; font-weight: bold; }
.cm-staticext { color: #008; }
.cm-staticnetBlock { color: #800; }

View File

@ -24,117 +24,155 @@
'use strict';
CodeMirror.defineMode("ubo-static-filtering", function() {
const reDirective = /^\s*!#(?:if|endif|include)\b/;
const reComment1 = /^\s*!/;
const reComment2 = /^\s*#/;
const reExt = /(#@?(?:\$\??|\?)?#)(?!##)/;
const reNet = /^\s*(?:@@)?.*(?:(\$)(?:[^$]+)?)?$/;
let lineStyle = null;
let anchorOptPos = null;
const lines = [];
let iLine = 0;
const parser = new vAPI.StaticFilteringParser(true);
const reDirective = /^!#(?:if|endif|include)\b/;
let parserSlot = 0;
let netOptionValueMode = false;
const lineFromLineBuffer = function() {
return lines.length === 1
? lines[0]
: lines.filter(a => a.replace(/^\s*|\s+\\$/g, '')).join('');
};
const parseExtFilter = function() {
lineStyle = 'staticext';
for ( let i = 0; i < lines.length; i++ ) {
const match = reExt.exec(lines[i]);
if ( match === null ) { continue; }
anchorOptPos = { y: i, x: match.index, l: match[1].length };
break;
const colorSpan = function(stream) {
if ( parser.category === parser.CATNone || parser.shouldIgnore() ) {
stream.skipToEnd();
return 'comment';
}
};
const parseNetFilter = function() {
lineStyle = lineFromLineBuffer().startsWith('@@')
? 'staticnetAllow'
: 'staticnetBlock';
let i = lines.length;
while ( i-- ) {
const pos = lines[i].lastIndexOf('$');
if ( pos === -1 ) { continue; }
anchorOptPos = { y: i, x: pos, l: 1 };
break;
if ( parser.category === parser.CATComment ) {
stream.skipToEnd();
return reDirective.test(stream.string)
? 'variable strong'
: 'comment';
}
};
const highlight = function(stream) {
if ( anchorOptPos !== null && iLine === anchorOptPos.y ) {
if ( stream.pos === anchorOptPos.x ) {
stream.pos += anchorOptPos.l;
return `${lineStyle} staticOpt`;
if ( (parser.slices[parserSlot] & parser.BITIgnore) !== 0 ) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return 'comment';
}
if ( (parser.slices[parserSlot] & parser.BITError) !== 0 ) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return 'error';
}
if ( parser.category === parser.CATStaticExtFilter ) {
if ( parserSlot < parser.optionsAnchorSpan.i ) {
const style = (parser.slices[parserSlot] & parser.BITComma) === 0
? 'string-2'
: 'def';
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return style;
}
if ( stream.pos < anchorOptPos.x ) {
stream.pos = anchorOptPos.x;
return lineStyle;
if (
parserSlot >= parser.optionsAnchorSpan.i &&
parserSlot < parser.patternSpan.i
) {
const style = (parser.flavorBits & parser.BITFlavorException) !== 0
? 'tag'
: 'def';
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return `${style} strong`;
}
if ( parserSlot >= parser.patternSpan.i ) {
stream.skipToEnd();
return 'variable';
}
stream.skipToEnd();
return '';
}
if ( parserSlot < parser.exceptionSpan.i ) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return '';
}
if (
parserSlot === parser.exceptionSpan.i &&
parser.exceptionSpan.l !== 0
) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return 'tag strong';
}
if (
parserSlot === parser.patternLeftAnchorSpan.i &&
parser.patternLeftAnchorSpan.l !== 0 ||
parserSlot === parser.patternRightAnchorSpan.i &&
parser.patternRightAnchorSpan.l !== 0
) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return 'keyword strong';
}
if (
parserSlot >= parser.patternSpan.i &&
parserSlot < parser.patternRightAnchorSpan.i
) {
if ( (parser.slices[parserSlot] & (parser.BITAsterisk | parser.BITCaret)) !== 0 ) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return 'keyword strong';
}
const nextSlot = parser.skipUntil(
parserSlot,
parser.patternRightAnchorSpan.i,
parser.BITAsterisk | parser.BITCaret
);
stream.pos = parser.slices[nextSlot+1];
parserSlot = nextSlot;
return 'variable';
}
if (
parserSlot === parser.optionsAnchorSpan.i &&
parser.optionsAnchorSpan.l !== 0
) {
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return 'def strong';
}
if (
parserSlot >= parser.optionsSpan.i &&
parser.optionsSpan.l !== 0
) {
const bits = parser.slices[parserSlot];
let style;
if ( (bits & parser.BITComma) !== 0 ) {
style = 'def strong';
netOptionValueMode = false;
} else if ( (bits & parser.BITTilde) !== 0 ) {
style = 'keyword strong';
} else if ( (bits & parser.BITPipe) !== 0 ) {
style = 'def';
} else if ( netOptionValueMode ) {
style = 'string-2';
} else if ( (bits & parser.BITEqual) !== 0 ) {
netOptionValueMode = true;
}
stream.pos += parser.slices[parserSlot+2];
parserSlot += 3;
return style || 'def';
}
if (
parserSlot >= parser.commentSpan.i &&
parser.commentSpan.l !== 0
) {
stream.skipToEnd();
return 'comment';
}
stream.skipToEnd();
return lineStyle;
};
const parseMultiLine = function() {
anchorOptPos = null;
const line = lineFromLineBuffer();
if ( reDirective.test(line) ) {
lineStyle = 'directive';
return;
}
if ( reComment1.test(line) ) {
lineStyle = 'comment';
return;
}
if ( line.indexOf('#') !== -1 ) {
if ( reExt.test(line) ) {
return parseExtFilter();
}
if ( reComment2.test(line) ) {
lineStyle = 'comment';
return;
}
}
if ( reNet.test(line) ) {
return parseNetFilter();
}
lineStyle = null;
return '';
};
return {
startState: function() {
},
token: function(stream) {
if ( iLine === lines.length || stream.string !== lines[iLine] ) {
iLine = 0;
if ( stream.sol() ) {
parser.analyze(stream.string);
parser.analyzeExtra(stream.string);
parserSlot = 0;
netOptionValueMode = false;
}
if ( iLine === 0 ) {
if ( lines.length > 1 ) {
lines.length = 1;
}
let line = stream.string;
lines[0] = line;
if ( line.endsWith(' \\') ) {
do {
line = stream.lookAhead(lines.length);
if (
line === undefined ||
line.startsWith(' ') === false
) { break; }
lines.push(line);
} while ( line.endsWith(' \\') );
}
parseMultiLine();
let style = colorSpan(stream);
if ( (parser.flavorBits & parser.BITFlavorError) !== 0 ) {
style += ' line-background-error';
}
const style = highlight(stream);
if ( stream.eol() ) {
iLine += 1;
}
return style;
style = style.trim();
return style !== '' ? style : null;
},
};
});

View File

@ -135,7 +135,10 @@ const fromNetFilter = async function(rawFilter) {
const µb = µBlock;
const writer = new µb.CompiledLineIO.Writer();
if ( µb.staticNetFilteringEngine.compile(rawFilter, writer) === false ) {
const parser = new vAPI.StaticFilteringParser();
parser.analyze(rawFilter);
if ( µb.staticNetFilteringEngine.compile(parser, writer) === false ) {
return;
}

View File

@ -52,7 +52,6 @@
µBlock.staticExtFilteringEngine = (( ) => {
const µb = µBlock;
const reHasUnicode = /[^\x00-\x7F]/;
const reParseRegexLiteral = /^\/(.+)\/([imu]+)?$/;
const emptyArray = [];
const parsed = {
@ -142,23 +141,16 @@
: `${selector}:style(${style})`;
};
const hostnamesFromPrefix = function(s) {
const hostnamesFromPrefix = function(parser) {
const hostnames = [];
const hasUnicode = reHasUnicode.test(s);
let beg = 0;
while ( beg < s.length ) {
let end = s.indexOf(',', beg);
if ( end === -1 ) { end = s.length; }
let hostname = s.slice(beg, end).trim();
if ( hostname.length !== 0 ) {
if ( hasUnicode ) {
hostname = hostname.charCodeAt(0) === 0x7E /* '~' */
? '~' + punycode.toASCII(hostname.slice(1))
: punycode.toASCII(hostname);
}
hostnames.push(hostname);
const hasUnicode = parser.optionHasUnicode();
for ( let { hn, not } of parser.options() ) {
hn = hn.trim();
if ( hn.length === 0 ) { continue; }
if ( hasUnicode ) {
hn = punycode.toASCII(hn);
}
beg = end + 1;
hostnames.push(not ? `~${hn}` : hn);
}
return hostnames;
};
@ -844,34 +836,16 @@
return entryPoint;
})();
api.compile = function(raw, writer) {
let lpos = raw.indexOf('#');
if ( lpos === -1 ) { return false; }
let rpos = lpos + 1;
if ( raw.charCodeAt(rpos) !== 0x23 /* '#' */ ) {
rpos = raw.indexOf('#', rpos + 1);
if ( rpos === -1 ) { return false; }
}
api.compile = function(parser, writer) {
if ( parser.category !== parser.CATStaticExtFilter ) { return false; }
// https://github.com/AdguardTeam/AdguardFilters/commit/4fe02d73cee6
// AdGuard also uses `$?` to force inline-based style rather than
// stylesheet-based style.
// Coarse-check that the anchor is valid.
// `##`: l === 1
// `#@#`, `#$#`, `#%#`, `#?#`: l === 2
// `#@$#`, `#@%#`, `#@?#`, `#$?#`: l === 3
// `#@$?#`: l === 4
const anchorLen = rpos - lpos;
if ( anchorLen > 4 ) { return false; }
if (
anchorLen > 1 &&
/^@?(?:\$\??|%|\?)?$/.test(raw.slice(lpos + 1, rpos)) === false
) {
return false;
// Adguard's scriptlet injection: not supported.
if ( (parser.flavorBits & parser.BITFlavorUnsupported) !== 0 ) {
return true;
}
// Extract the selector.
let suffix = raw.slice(rpos + 1).trim();
let suffix = parser.strFromSpan(parser.patternSpan);
if ( suffix.length === 0 ) { return false; }
parsed.suffix = suffix;
@ -882,29 +856,21 @@
// We have an Adguard/ABP cosmetic filter if and only if the
// character is `$`, `%` or `?`, otherwise it's not a cosmetic
// filter.
let cCode = raw.charCodeAt(rpos - 1);
if ( cCode !== 0x23 /* '#' */ && cCode !== 0x40 /* '@' */ ) {
// Adguard's scriptlet injection: not supported.
if ( cCode === 0x25 /* '%' */ ) { return true; }
if ( cCode === 0x3F /* '?' */ && anchorLen > 2 ) {
cCode = raw.charCodeAt(rpos - 2);
}
// Adguard's style injection: translate to uBO's format.
if ( cCode === 0x24 /* '$' */ ) {
suffix = translateAdguardCSSInjectionFilter(suffix);
if ( suffix === '' ) { return true; }
parsed.suffix = suffix;
}
// Adguard's style injection: translate to uBO's format.
if ( (parser.flavorBits & parser.BITFlavorExtStyle) !== 0 ) {
suffix = translateAdguardCSSInjectionFilter(suffix);
if ( suffix === '' ) { return true; }
parsed.suffix = suffix;
}
// Exception filter?
parsed.exception = raw.charCodeAt(lpos + 1) === 0x40 /* '@' */;
parsed.exception = parser.isException();
// Extract the hostname(s), punycode if required.
if ( lpos === 0 ) {
parsed.hostnames = emptyArray;
if ( parser.hasOptions() ) {
parsed.hostnames = hostnamesFromPrefix(parser);
} else {
parsed.hostnames = hostnamesFromPrefix(raw.slice(0, lpos));
parsed.hostnames = emptyArray;
}
// Backward compatibility with deprecated syntax.

File diff suppressed because it is too large Load Diff

View File

@ -84,13 +84,16 @@ const typeNameToTypeValue = {
const otherTypeBitValue = typeNameToTypeValue.other;
// Convert a request type name into its single-bit mask.
// The type value is read from the upper bits (>>> 4) of the matching
// typeNameToTypeValue entry; subtracting 1 maps type value 1 to bit 0.
const bitFromType = type => {
    const typeValue = typeNameToTypeValue[type] >>> 4;
    return 1 << (typeValue - 1);
};
// Bitmap with one bit set for every network request type, computed as:
//   1. bring origin to 0 (from 4 -- see typeNameToTypeValue) by
//      right-shifting the value of the `other` type;
//   2. left-shift 1 by the above-calculated value;
//   3. subtract 1 to set all type bits below that position.
// The result covers the bits of all types whose value is no greater than
// that of `other`.
const allNetworkTypesBits =
(1 << (otherTypeBitValue >>> 4)) - 1;
const allTypesBits =
allNetworkTypesBits |
1 << (typeNameToTypeValue['popup'] >>> 4) - 1 |
@ -127,42 +130,6 @@ const typeValueToTypeName = {
23: 'unsupported',
};
// Lookup table from the type names accepted as filter options -- including
// short aliases such as `css`, `doc`, `frame`, `ghide`, `shide` and `xhr` --
// to normalized request type names. `genericblock` and `webrtc` map to
// 'unsupported'.
// https://github.com/gorhill/uBlock/issues/1493
// Transpose `ping` into `other` for now.
// NOTE(review): the line above looks stale -- the table below maps both
// `ping` and `beacon` to 'ping', not 'other'. Confirm and update.
const toNormalizedType = {
'all': 'all',
'beacon': 'ping',
'cname': 'cname',
'css': 'stylesheet',
'data': 'data',
'doc': 'main_frame',
'document': 'main_frame',
'font': 'font',
'frame': 'sub_frame',
'genericblock': 'unsupported',
'generichide': 'generichide',
'ghide': 'generichide',
'image': 'image',
'inline-font': 'inline-font',
'inline-script': 'inline-script',
'media': 'media',
'object': 'object',
'object-subrequest': 'object',
'other': 'other',
'ping': 'ping',
'popunder': 'popunder',
'popup': 'popup',
'script': 'script',
'specifichide': 'specifichide',
'shide': 'specifichide',
'stylesheet': 'stylesheet',
'subdocument': 'sub_frame',
'xhr': 'xmlhttprequest',
'xmlhttprequest': 'xmlhttprequest',
'webrtc': 'unsupported',
'websocket': 'websocket',
};
// Extract the request type value from category bits: drop the four
// low-order bits, then keep the next five bits.
const typeValueFromCatBits = catBits => {
    const shifted = catBits >>> 4;
    return shifted & 0b11111;
};
/******************************************************************************/
@ -409,7 +376,7 @@ const filterPattern = {
units.push(FilterRegex.compile(parsed));
return;
}
const pattern = parsed.f;
const pattern = parsed.pattern;
if ( pattern === '*' ) {
units.push(FilterTrue.compile());
return;
@ -439,27 +406,27 @@ const filterPattern = {
hasCaretCombo ? parsed.firstCaretPos : parsed.firstWildcardPos
);
if ( parsed.tokenBeg < parsed.firstWildcardPos ) {
parsed.f = sleft;
parsed.pattern = sleft;
units.push(FilterPatternPlain.compile(parsed));
parsed.f = sright;
parsed.pattern = sright;
units.push(FilterPatternRight.compile(parsed, hasCaretCombo));
return;
}
// parsed.tokenBeg > parsed.firstWildcardPos
parsed.f = sright;
parsed.pattern = sright;
parsed.tokenBeg -= parsed.firstWildcardPos + 1;
units.push(FilterPatternPlain.compile(parsed));
parsed.f = sleft;
parsed.pattern = sleft;
units.push(FilterPatternLeft.compile(parsed, hasCaretCombo));
},
compileGeneric: function(parsed, units) {
const pattern = parsed.f;
const pattern = parsed.pattern;
// Optimize special case: plain pattern with trailing caret
if (
parsed.firstWildcardPos === -1 &&
parsed.firstCaretPos === (pattern.length - 1)
) {
parsed.f = pattern.slice(0, -1);
parsed.pattern = pattern.slice(0, -1);
units.push(FilterPatternPlain.compile(parsed));
units.push(FilterTrailingSeparator.compile());
return;
@ -479,10 +446,10 @@ const filterPattern = {
// if ( c === 0x2A /* '*' */ || c === 0x5E /* '^' */ ) { break; }
// right += 1;
//}
//parsed.f = pattern.slice(left, right);
//parsed.pattern = pattern.slice(left, right);
//parsed.tokenBeg -= left;
//units.push(FilterPatternPlain.compile(parsed));
//parsed.f = pattern;
//parsed.pattern = pattern;
units.push(FilterPatternGeneric.compile(parsed));
},
};
@ -565,7 +532,7 @@ const FilterPatternPlain = class {
}
static compile(details) {
return [ FilterPatternPlain.fid, details.f, details.tokenBeg ];
return [ FilterPatternPlain.fid, details.pattern, details.tokenBeg ];
}
static fromCompiled(args) {
@ -678,7 +645,7 @@ const FilterPatternLeft = class {
static compile(details, ex) {
return [
ex ? FilterPatternLeftEx.fid : FilterPatternLeft.fid,
details.f
details.pattern
];
}
@ -762,7 +729,7 @@ const FilterPatternRight = class {
static compile(details, ex) {
return [
ex ? FilterPatternRightEx.fid : FilterPatternRight.fid,
details.f
details.pattern
];
}
@ -853,7 +820,7 @@ const FilterPatternGeneric = class {
static compile(details) {
const anchor = details.anchor;
details.anchor = 0;
return [ FilterPatternGeneric.fid, details.f, anchor ];
return [ FilterPatternGeneric.fid, details.pattern, anchor ];
}
static fromCompiled(args) {
@ -1115,7 +1082,7 @@ const FilterRegex = class {
}
static compile(details) {
return [ FilterRegex.fid, details.f ];
return [ FilterRegex.fid, details.pattern ];
}
static fromCompiled(args) {
@ -2101,25 +2068,42 @@ const FILTER_SEQUENCES_MIN = filterSequenceWritePtr;
/******************************************************************************/
const FilterParser = class {
constructor() {
constructor(parser) {
this.cantWebsocket = vAPI.cantWebsocket;
this.domainOpt = '';
this.noTokenHash = urlTokenizer.noTokenHash;
this.reBadDomainOptChars = /[+?^${}()[\]\\]/;
this.reHostnameRule1 = /^\w[\w.-]*[a-z]$/i;
this.reHostnameRule2 = /^\w[\w.-]*[a-z]\^?$/i;
this.reCanTrimCarets1 = /^[^*]*$/;
this.reCanTrimCarets2 = /^\^?[^^]+[^^][^^]+\^?$/;
this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
this.reHasUnicode = /[^\x00-\x7F]/;
this.reWebsocketAny = /^ws[s*]?(?::\/?\/?)?\*?$/;
this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
this.reGoodToken = /[%0-9a-z]{1,}/g;
this.reSeparator = /[\/^]/;
this.reRegexToken = /[%0-9A-Za-z]{2,}/g;
this.reRegexTokenAbort = /[([]/;
this.reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
this.reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
this.reGoodToken = /[%0-9a-z]{1,}/g;
this.tokenIdToNormalizedType = new Map([
[ parser.OPTTokenCname, bitFromType('cname') ],
[ parser.OPTTokenCss, bitFromType('stylesheet') ],
[ parser.OPTTokenDoc, bitFromType('main_frame') ],
[ parser.OPTTokenFont, bitFromType('font') ],
[ parser.OPTTokenFrame, bitFromType('sub_frame') ],
[ parser.OPTTokenGenericblock, bitFromType('unsupported') ],
[ parser.OPTTokenGhide, bitFromType('generichide') ],
[ parser.OPTTokenImage, bitFromType('image') ],
[ parser.OPTTokenInlineFont, bitFromType('inline-font') ],
[ parser.OPTTokenInlineScript, bitFromType('inline-script') ],
[ parser.OPTTokenMedia, bitFromType('media') ],
[ parser.OPTTokenObject, bitFromType('object') ],
[ parser.OPTTokenOther, bitFromType('other') ],
[ parser.OPTTokenPing, bitFromType('ping') ],
[ parser.OPTTokenPopunder, bitFromType('popunder') ],
[ parser.OPTTokenPopup, bitFromType('popup') ],
[ parser.OPTTokenScript, bitFromType('script') ],
[ parser.OPTTokenShide, bitFromType('specifichide') ],
[ parser.OPTTokenXhr, bitFromType('xmlhttprequest') ],
[ parser.OPTTokenWebrtc, bitFromType('unsupported') ],
[ parser.OPTTokenWebsocket, bitFromType('websocket') ],
]);
// These top 100 "bad tokens" are collated using the "miss" histogram
// from tokenHistograms(). The "score" is their occurrence among the
// 200K+ URLs used in the benchmark and executed against default
@ -2224,7 +2208,7 @@ const FilterParser = class {
[ 'scripts',1446 ],
[ 'twitter',1440 ],
[ 'crop',1431 ],
[ 'new',1412]
[ 'new',1412],
]);
this.maxTokenLen = urlTokenizer.MAX_TOKEN_LENGTH;
this.reset();
@ -2244,16 +2228,14 @@ const FilterParser = class {
this.dataType = undefined;
this.data = undefined;
this.invalid = false;
this.f = '';
this.pattern = '';
this.firstParty = false;
this.thirdParty = false;
this.party = AnyParty;
this.fopts = '';
this.domainOpt = '';
this.denyallow = '';
this.isPureHostname = false;
this.isRegex = false;
this.raw = '';
this.redirect = 0;
this.token = '*';
this.tokenHash = this.noTokenHash;
@ -2278,16 +2260,12 @@ const FilterParser = class {
return '';
}
bitFromType(type) {
return 1 << ((typeNameToTypeValue[type] >>> 4) - 1);
}
// https://github.com/chrisaljoudi/uBlock/issues/589
// Be ready to handle multiple negated types
parseTypeOption(raw, not) {
const typeBit = raw !== 'all'
? this.bitFromType(toNormalizedType[raw])
parseTypeOption(id, not) {
const typeBit = id !== -1
? this.tokenIdToNormalizedType.get(id)
: allTypesBits;
if ( not ) {
this.notTypes |= typeBit;
@ -2309,8 +2287,8 @@ const FilterParser = class {
}
}
parseHostnameList(s) {
if ( this.reHasUnicode.test(s) ) {
parseHostnameList(parser, s) {
if ( parser.optionHasUnicode() ) {
const hostnames = s.split('|');
let i = hostnames.length;
while ( i-- ) {
@ -2320,106 +2298,74 @@ const FilterParser = class {
}
s = hostnames.join('|');
}
// TODO: revisit
if ( this.reBadDomainOptChars.test(s) ) { return ''; }
return s;
}
parseOptions(s) {
this.fopts = s;
for ( let opt of s.split(/\s*,\s*/) ) {
const not = opt.startsWith('~');
if ( not ) {
opt = opt.slice(1);
}
if ( opt === 'third-party' || opt === '3p' ) {
parseOptions(parser) {
for ( let { id, val, not, bad } of parser.options() ) {
if ( bad ) { return false; }
switch ( id ) {
case parser.OPTToken3p:
this.parsePartyOption(false, not);
continue;
}
if ( opt === 'first-party' || opt === '1p' ) {
break;
case parser.OPTToken1p:
this.parsePartyOption(true, not);
continue;
}
if ( toNormalizedType.hasOwnProperty(opt) ) {
this.parseTypeOption(opt, not);
continue;
}
// https://github.com/gorhill/uBlock/issues/2294
// Detect and discard filter if domain option contains nonsensical
// characters.
if ( opt.startsWith('domain=') ) {
this.domainOpt = this.parseHostnameList(opt.slice(7));
if ( this.domainOpt === '' ) {
this.unsupported = true;
break;
}
continue;
}
if ( opt.startsWith('denyallow=') ) {
this.denyallow = this.parseHostnameList(opt.slice(10));
if ( this.denyallow === '' ) {
this.unsupported = true;
break;
}
continue;
}
if ( opt === 'important' ) {
this.important = Important;
continue;
}
if ( /^redirect(?:-rule)?=/.test(opt) ) {
if ( this.redirect !== 0 ) {
this.unsupported = true;
break;
}
this.redirect = opt.charCodeAt(8) === 0x3D /* '=' */ ? 1 : 2;
continue;
}
if (
opt.startsWith('csp=') &&
opt.length > 4 &&
this.reBadCSP.test(opt) === false
) {
this.parseTypeOption('data', not);
this.dataType = 'csp';
this.data = opt.slice(4).trim();
continue;
}
if ( opt === 'csp' && this.action === AllowAction ) {
this.parseTypeOption('data', not);
this.dataType = 'csp';
this.data = '';
continue;
}
// Used by Adguard:
// https://kb.adguard.com/en/general/how-to-create-your-own-ad-filters#empty-modifier
if ( opt === 'empty' || opt === 'mp4' ) {
if ( this.redirect !== 0 ) {
this.unsupported = true;
break;
}
this.redirect = 1;
continue;
}
break;
case parser.OPTTokenAll:
this.parseTypeOption(-1);
break;
// https://github.com/uBlockOrigin/uAssets/issues/192
if ( opt === 'badfilter' ) {
case parser.OPTTokenBadfilter:
this.badFilter = true;
continue;
}
break;
case parser.OPTTokenCsp:
this.typeBits = bitFromType('data');
this.dataType = 'csp';
if ( val !== undefined ) {
if ( this.reBadCSP.test(val) ) { return false; }
this.data = val;
} else if ( this.action === AllowAction ) {
this.data = '';
}
break;
// https://github.com/gorhill/uBlock/issues/2294
// Detect and discard filter if domain option contains nonsensical
// characters.
case parser.OPTTokenDomain:
this.domainOpt = this.parseHostnameList(parser, val);
if ( this.domainOpt === '' ) { return false; }
break;
case parser.OPTTokenDenyAllow:
this.denyallow = this.parseHostnameList(parser, val);
if ( this.denyallow === '' ) { return false; }
break;
// https://www.reddit.com/r/uBlockOrigin/comments/d6vxzj/
// Add support for `elemhide`. Rarely used but it happens.
if ( opt === 'elemhide' || opt === 'ehide' ) {
this.parseTypeOption('specifichide', not);
this.parseTypeOption('generichide', not);
continue;
case parser.OPTTokenEhide:
this.parseTypeOption(parser.OPTTokenShide, not);
this.parseTypeOption(parser.OPTTokenGhide, not);
break;
case parser.OPTTokenImportant:
this.important = Important;
break;
// Used by Adguard:
// https://kb.adguard.com/en/general/how-to-create-your-own-ad-filters#empty-modifier
case parser.OPTTokenEmpty:
case parser.OPTTokenMp4:
case parser.OPTTokenRedirect:
case parser.OPTTokenRedirectRule:
if ( this.redirect !== 0 ) { return false; }
this.redirect = id === parser.OPTTokenRedirectRule ? 2 : 1;
break;
default:
if ( this.tokenIdToNormalizedType.has(id) === false ) {
return false;
}
this.parseTypeOption(id, not);
break;
}
// Unrecognized filter option: ignore whole filter.
this.unsupported = true;
break;
}
// Redirect rules can't be exception filters.
if ( this.redirect !== 0 && this.action !== BlockAction ) {
this.unsupported = true;
}
// Negated network types? Toggle on all network type bits.
@ -2429,9 +2375,7 @@ const FilterParser = class {
}
if ( this.notTypes !== 0 ) {
this.typeBits &= ~this.notTypes;
if ( this.typeBits === 0 ) {
this.unsupported = true;
}
if ( this.typeBits === 0 ) { return false; }
}
// https://github.com/gorhill/uBlock/issues/2283
@ -2439,193 +2383,99 @@ const FilterParser = class {
// toggle off `unsupported` bit.
if ( this.typeBits & unsupportedTypeBit ) {
this.typeBits &= ~unsupportedTypeBit;
if ( this.typeBits === 0 ) {
this.unsupported = true;
}
if ( this.typeBits === 0 ) { return false; }
}
return true;
}
// TODO: use charCodeAt where possible.
parse(raw) {
parse(parser) {
// important!
this.reset();
let s = this.raw = raw.trim();
if ( s.length === 0 ) {
if ( parser.hasError() ) {
this.invalid = true;
return this;
}
// Filters which are a single alphanumeric character are discarded
// as unsupported.
if ( s.length === 1 && /[0-9a-z]/i.test(s) ) {
this.unsupported = true;
// Filters whose pattern is a single character other than `*` and which
// have no narrowing options are discarded as invalid.
if ( parser.patternIsDubious() ) {
this.invalid = true;
return this;
}
// plain hostname? (from HOSTS file)
if ( this.reHostnameRule1.test(s) ) {
this.f = s.toLowerCase();
this.isPureHostname = true;
this.anchor |= 0b100;
return this;
}
// element hiding filter?
let pos = s.indexOf('#');
if ( pos !== -1 ) {
const c = s.charAt(pos + 1);
if ( c === '#' || c === '@' ) {
console.error('static-net-filtering.js > unexpected cosmetic filters');
this.invalid = true;
return this;
}
}
// block or allow filter?
// Important: this must be executed before parsing options
if ( s.startsWith('@@') ) {
if ( parser.isException() ) {
this.action = AllowAction;
s = s.slice(2);
}
// options
// https://github.com/gorhill/uBlock/issues/842
// - ensure sure we are not dealing with a regex-based filter.
// - lookup the last occurrence of `$`.
if (
s.charCodeAt(0) !== 0x2F /* '/' */ ||
s.charCodeAt(s.length - 1) !== 0x2F /* '/' */
) {
pos = s.lastIndexOf('$');
if ( pos !== -1 ) {
// https://github.com/gorhill/uBlock/issues/952
// Discard Adguard-specific `$$` filters.
if ( s.indexOf('$$') !== -1 ) {
this.unsupported = true;
return this;
}
this.parseOptions(s.slice(pos + 1).trim());
if ( this.unsupported ) { return this; }
s = s.slice(0, pos);
}
}
this.isPureHostname = parser.patternIsPlainHostname();
// regex?
if (
s.length > 2 &&
s.charCodeAt(0) === 0x2F /* '/' */ &&
s.charCodeAt(s.length - 1) === 0x2F /* '/' */
) {
this.isRegex = true;
this.f = s.slice(1, -1);
// https://github.com/gorhill/uBlock/issues/1246
// If the filter is valid, use the corrected version of the
// source string -- this ensure reverse-lookup will work fine.
this.f = this.normalizeRegexSource(this.f);
if ( this.f === '' ) {
this.unsupported = true;
}
// Plain hostname? (from HOSTS file)
if ( this.isPureHostname && parser.hasOptions() === false ) {
this.pattern = parser.patternToLowercase();
this.anchor |= 0b100;
return this;
}
// hostname-anchored
if ( s.startsWith('||') ) {
this.anchor |= 0b100;
s = s.slice(2);
// convert hostname to punycode if needed
// https://github.com/gorhill/uBlock/issues/2599
if ( this.reHasUnicode.test(s) ) {
const matches = this.reIsolateHostname.exec(s);
if ( matches ) {
s = (matches[1] !== undefined ? matches[1] : '') +
punycode.toASCII(matches[2]) +
matches[3];
}
}
// https://github.com/chrisaljoudi/uBlock/issues/1096
if ( s.startsWith('^') ) {
this.unsupported = true;
return this;
}
// plain hostname? (from ABP filter list)
// https://github.com/gorhill/uBlock/issues/1757
// A filter can't be a pure-hostname one if there is a domain or
// csp option present.
if ( this.reHostnameRule2.test(s) ) {
if ( s.charCodeAt(s.length - 1) === 0x5E /* '^' */ ) {
s = s.slice(0, -1);
}
this.f = s.toLowerCase();
this.isPureHostname = true;
return this;
}
}
// left-anchored
else if ( s.startsWith('|') ) {
this.anchor |= 0x2;
s = s.slice(1);
}
// right-anchored
if ( s.endsWith('|') ) {
this.anchor |= 0x1;
s = s.slice(0, -1);
}
// https://github.com/gorhill/uBlock/issues/1669#issuecomment-224822448
// Remove pointless leading *.
// https://github.com/gorhill/uBlock/issues/3034
// We can remove anchoring if we need to match all at the start.
if ( s.startsWith('*') ) {
s = s.replace(/^\*+([^%0-9a-z])/i, '$1');
this.anchor &= ~0x6;
}
// Remove pointless trailing *
// https://github.com/gorhill/uBlock/issues/3034
// We can remove anchoring if we need to match all at the end.
if ( s.endsWith('*') ) {
s = s.replace(/([^%0-9a-z])\*+$/i, '$1');
this.anchor &= ~0x1;
}
// nothing left?
if ( s === '' ) {
s = '*';
}
// TODO: remove once redirect rules with `*/*` pattern are no longer
// used.
else if ( this.redirect !== 0 && s === '/' ) {
s = '*';
}
// https://github.com/gorhill/uBlock/issues/1047
// Hostname-anchored makes no sense if matching all requests.
if ( s === '*' ) {
this.anchor = 0;
}
this.firstWildcardPos = s.indexOf('*');
if ( this.firstWildcardPos !== -1 ) {
this.secondWildcardPos = s.indexOf('*', this.firstWildcardPos + 1);
}
this.firstCaretPos = s.indexOf('^');
if ( this.firstCaretPos !== -1 ) {
this.secondCaretPos = s.indexOf('^', this.firstCaretPos + 1);
}
if ( s.length > 1024 ) {
// options
if ( parser.hasOptions() && this.parseOptions(parser) === false ) {
this.unsupported = true;
return this;
}
this.f = s.toLowerCase();
// regex?
if ( parser.patternIsRegex() ) {
this.isRegex = true;
// https://github.com/gorhill/uBlock/issues/1246
// If the filter is valid, use the corrected version of the
// source string -- this ensure reverse-lookup will work fine.
this.pattern = this.normalizeRegexSource(parser.getPattern());
if ( this.pattern === '' ) {
this.unsupported = true;
}
return this;
}
let pattern;
if ( parser.patternIsMatchAll() ) {
pattern = '*';
} else {
pattern = parser.patternToLowercase();
}
if ( parser.patternIsLeftHostnameAnchored() ) {
this.anchor |= 0b100;
} else if ( parser.patternIsLeftAnchored() ) {
this.anchor |= 0b010;
}
if ( parser.patternIsRightAnchored() ) {
this.anchor |= 0b001;
}
if ( parser.patternHasWildcard() ) {
this.firstWildcardPos = pattern.indexOf('*');
if ( this.firstWildcardPos !== -1 ) {
this.secondWildcardPos =
pattern.indexOf('*', this.firstWildcardPos + 1);
}
}
if ( parser.patternHasCaret() ) {
this.firstCaretPos = pattern.indexOf('^');
if ( this.firstCaretPos !== -1 ) {
this.secondCaretPos =
pattern.indexOf('^', this.firstCaretPos + 1);
}
}
if ( pattern.length > 1024 ) {
this.unsupported = true;
return this;
}
this.pattern = pattern;
return this;
}
@ -2635,41 +2485,24 @@ const FilterParser = class {
// are not good. Avoid if possible. This has a significant positive
// impact on performance.
makeToken() {
makeToken(parser) {
if ( this.isRegex ) {
this.extractTokenFromRegex();
return;
return this.extractTokenFromRegex();
}
if ( this.f === '*' ) { return; }
const matches = this.findGoodToken();
if ( matches === null ) { return; }
this.token = matches[0];
const match = this.findGoodToken(parser);
if ( match === null ) { return; }
this.token = match.token;
this.tokenHash = urlTokenizer.tokenHashFromString(this.token);
this.tokenBeg = matches.index;
this.tokenBeg = match.pos;
}
findGoodToken() {
this.reGoodToken.lastIndex = 0;
const s = this.f;
// Note: a one-char token is better than a documented bad token.
findGoodToken(parser) {
let bestMatch = null;
let bestBadness = 0;
let match;
while ( (match = this.reGoodToken.exec(s)) !== null ) {
const token = match[0];
// https://github.com/gorhill/uBlock/issues/997
// Ignore token if preceded by wildcard.
const pos = match.index;
if (
pos !== 0 &&
s.charCodeAt(pos - 1) === 0x2A /* '*' */ ||
token.length < this.maxTokenLen &&
s.charCodeAt(pos + token.length) === 0x2A /* '*' */
) {
continue;
}
// A one-char token is better than a documented bad token.
const badness = token.length > 1
? this.badTokens.get(token) || 0
for ( const match of parser.patternTokens() ) {
const badness = match.token.length > 1
? this.badTokens.get(match.token) || 0
: 1;
if ( badness === 0 ) { return match; }
if ( bestBadness === 0 || badness < bestBadness ) {
@ -2685,7 +2518,7 @@ const FilterParser = class {
// a regex-based filter.
extractTokenFromRegex() {
this.reRegexToken.lastIndex = 0;
const s = this.f;
const s = this.pattern;
let matches;
while ( (matches = this.reRegexToken.exec(s)) !== null ) {
const prefix = s.slice(0, matches.index);
@ -2712,9 +2545,9 @@ const FilterParser = class {
this.dataType === undefined &&
this.denyallow === '' &&
this.domainOpt !== '' && (
this.f === '*' || (
this.pattern === '*' || (
this.anchor === 0b010 &&
/^(?:http[s*]?:(?:\/\/)?)$/.test(this.f)
/^(?:http[s*]?:(?:\/\/)?)$/.test(this.pattern)
)
) &&
this.domainOpt.indexOf('~') === -1;
@ -2778,15 +2611,15 @@ FilterParser.parse = (( ) => {
ttlTimer = vAPI.setTimeout(ttlProcess, 10007);
};
return s => {
return p => {
if ( parser === undefined ) {
parser = new FilterParser();
parser = new FilterParser(p);
}
last = Date.now();
if ( ttlTimer === undefined ) {
ttlTimer = vAPI.setTimeout(ttlProcess, 10007);
}
return parser.parse(s);
return parser.parse(p);
};
})();
@ -3072,10 +2905,10 @@ FilterContainer.prototype.fromSelfie = function(path) {
/******************************************************************************/
FilterContainer.prototype.compile = function(raw, writer) {
FilterContainer.prototype.compile = function(parser, writer) {
// ORDER OF TESTS IS IMPORTANT!
const parsed = FilterParser.parse(raw);
const parsed = FilterParser.parse(parser);
// Ignore non-static network filters
if ( parsed.invalid ) { return false; }
@ -3086,20 +2919,20 @@ FilterContainer.prototype.compile = function(raw, writer) {
µb.logger.writeOne({
realm: 'message',
type: 'error',
text: `Invalid network filter in ${who}: ${raw}`
text: `Invalid network filter in ${who}: ${parser.raw}`
});
return false;
}
// Redirect rule
if ( parsed.redirect !== 0 ) {
const result = this.compileRedirectRule(parsed, writer);
const result = this.compileRedirectRule(parser.raw, parsed.badFilter, writer);
if ( result === false ) {
const who = writer.properties.get('assetKey') || '?';
µb.logger.writeOne({
realm: 'message',
type: 'error',
text: `Invalid redirect rule in ${who}: ${raw}`
text: `Invalid redirect rule in ${who}: ${parser.raw}`
});
return false;
}
@ -3116,11 +2949,13 @@ FilterContainer.prototype.compile = function(raw, writer) {
parsed.dataType === undefined
) {
parsed.tokenHash = this.dotTokenHash;
this.compileToAtomicFilter(parsed, parsed.f, writer);
this.compileToAtomicFilter(parsed, parsed.pattern, writer);
return true;
}
parsed.makeToken();
if ( parser.patternIsMatchAll() === false ) {
parsed.makeToken(parser);
}
// Special pattern/option cases:
// - `*$domain=...`
@ -3131,9 +2966,9 @@ FilterContainer.prototype.compile = function(raw, writer) {
// are entries in the `domain=` option.
if ( parsed.isJustOrigin() ) {
const tokenHash = parsed.tokenHash;
if ( parsed.f === '*' || parsed.f.startsWith('http*') ) {
if ( parsed.pattern === '*' || parsed.pattern.startsWith('http*') ) {
parsed.tokenHash = this.anyTokenHash;
} else if /* 'https:' */ ( parsed.f.startsWith('https') ) {
} else if /* 'https:' */ ( parsed.pattern.startsWith('https') ) {
parsed.tokenHash = this.anyHTTPSTokenHash;
} else /* 'http:' */ {
parsed.tokenHash = this.anyHTTPTokenHash;
@ -3251,10 +3086,10 @@ FilterContainer.prototype.compileToAtomicFilter = function(
/******************************************************************************/
FilterContainer.prototype.compileRedirectRule = function(parsed, writer) {
const redirects = µb.redirectEngine.compileRuleFromStaticFilter(parsed.raw);
FilterContainer.prototype.compileRedirectRule = function(raw, badFilter, writer) {
const redirects = µb.redirectEngine.compileRuleFromStaticFilter(raw);
if ( Array.isArray(redirects) === false ) { return false; }
writer.select(parsed.badFilter ? 1 : 0);
writer.select(badFilter ? 1 : 0);
const type = typeNameToTypeValue.redirect;
for ( const redirect of redirects ) {
writer.push([ type, redirect ]);

View File

@ -799,60 +799,36 @@ self.addEventListener('hiddenSettingsChanged', ( ) => {
// https://adblockplus.org/en/filters
const staticNetFilteringEngine = this.staticNetFilteringEngine;
const staticExtFilteringEngine = this.staticExtFilteringEngine;
const reIsWhitespaceChar = /\s/;
const reMaybeLocalIp = /^[\d:f]/;
const reIsLocalhostRedirect = /\s+(?:0\.0\.0\.0|broadcasthost|localhost|local|ip6-\w+)\b/;
const reLocalIp = /^(?:(0\.0\.0\.)?0|127\.0\.0\.1|::1?|fe80::1%lo0)\s+/;
const lineIter = new this.LineIterator(this.processDirectives(rawText));
const parser = new vAPI.StaticFilteringParser();
parser.setMaxTokenLength(this.urlTokenizer.MAX_TOKEN_LENGTH);
while ( lineIter.eot() === false ) {
let line = lineIter.next().trim();
if ( line.length === 0 ) { continue; }
let line = lineIter.next();
while ( line.endsWith(' \\') ) {
if ( lineIter.peek(4) !== ' ' ) { break; }
line = line.slice(0, -2).trim() + lineIter.next().trim();
}
// Strip comments
const c = line.charAt(0);
if ( c === '!' || c === '[' ) { continue; }
parser.analyze(line);
// Parse or skip cosmetic filters
// All cosmetic filters are caught here
if ( staticExtFilteringEngine.compile(line, writer) ) { continue; }
if ( parser.shouldIgnore() ) { continue; }
// Whatever else is next can be assumed to not be a cosmetic filter
// Most comments start in first column
if ( c === '#' ) { continue; }
// Catch comments somewhere on the line
// Remove:
// ... #blah blah blah
// ... # blah blah blah
// Don't remove:
// ...#blah blah blah
// because some ABP filters use the `#` character (URL fragment)
const pos = line.indexOf('#');
if ( pos !== -1 && reIsWhitespaceChar.test(line.charAt(pos - 1)) ) {
line = line.slice(0, pos).trim();
if ( parser.category === parser.CATStaticExtFilter ) {
staticExtFilteringEngine.compile(parser, writer);
continue;
}
// https://github.com/gorhill/httpswitchboard/issues/15
// Ensure localhost et al. don't end up in the ubiquitous blacklist.
// With hosts files, we need to remove local IP redirection
if ( reMaybeLocalIp.test(c) ) {
// Ignore hosts file redirect configuration
// 127.0.0.1 localhost
// 255.255.255.255 broadcasthost
if ( reIsLocalhostRedirect.test(line) ) { continue; }
line = line.replace(reLocalIp, '').trim();
if ( parser.category !== parser.CATStaticNetFilter ) { continue; }
// https://github.com/gorhill/uBlock/issues/2599
// convert hostname to punycode if needed
if ( parser.patternHasUnicode() ) {
parser.toPunycode();
}
if ( line.length === 0 ) { continue; }
staticNetFilteringEngine.compile(line, writer);
staticNetFilteringEngine.compile(parser, writer);
}
return writer.toString();