Improve extraction of tokens from regex-based filters

Regex-based static network filters are those most likely to
cause performance degradation, and as such the best guard
against undue performance degradation caused by regex-based
filters is the ability to extract valid and good tokens
from regex patterns.

This commit introduces a complete regex parser so that the
static network filtering engine can now safely extract
tokens regardless of the complexity of the regex pattern.

The regex parser is a library imported from:
https://github.com/foo123/RegexAnalyzer

The syntax highlighter adds an underline to regex-based
filters as a visual aid to filter authors so as to avoid
mistakenly creating regex-based filters. This commit
further colors the underline as a warning when a regex-based
filter is found to be untokenizable.

Filter list authors are invited to spot these untokenizable
regex-based filters in their lists to verify that no
mistakes were made for those filters, causing them to be
untokenizable. For example, what appears to be a mistake:

    /^https?:\/\/.*\/sw.js?.[a-zA-Z0-9%]{50,}/

Though the mistake is minor, the regex-based filter above
is untokenizable as a result, and becomes tokenizable when
the `.` is properly escaped:

    /^https?:\/\/.*\/sw\.js?.[a-zA-Z0-9%]{50,}/

Filter list authors can use this search expression in the
asset viewer to find instances of regex-based filters:

    /^(@@)?\/[^\n]+\/(\$|$)/
This commit is contained in:
Raymond Hill 2020-12-26 08:52:42 -05:00
parent fdcb110feb
commit 426395aa03
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
9 changed files with 2300 additions and 25 deletions

View File

@ -49,6 +49,7 @@
<script src="lib/codemirror/addon/scroll/annotatescrollbar.js"></script>
<script src="lib/codemirror/addon/search/searchcursor.js"></script>
<script src="lib/codemirror/addon/selection/active-line.js"></script>
<script src="lib/regexanalyzer/regex.js"></script>
<script src="js/codemirror/search.js"></script>
<script src="js/codemirror/search-thread.js"></script>

View File

@ -39,6 +39,7 @@
<div class="li"><span><a href="https://github.com/rsms/inter" target="_blank">Inter font family</a> by <a href="https://github.com/rsms">Rasmus Andersson</a></span></div>
<div class="li"><span><a href="https://fontawesome.com/" target="_blank">FontAwesome font family</a> by <a href="https://github.com/davegandy">Dave Gandy</a></span></div>
<div class="li"><span><a href="https://github.com/Swatinem/diff" target="_blank">An implementation of Myers' diff algorithm</a> by <a href="https://github.com/Swatinem">Arpad Borsos</a></span></div>
<div class="li"><span><a href="https://github.com/foo123/RegexAnalyzer" target="_blank">Regular Expression Analyzer</a> by <a href="https://github.com/foo123">Nikos M.</a></span></div>
</div>
<hr>
<div id="dev">

View File

@ -33,6 +33,7 @@
<script src="lib/codemirror/addon/scroll/annotatescrollbar.js"></script>
<script src="lib/codemirror/addon/search/searchcursor.js"></script>
<script src="lib/codemirror/addon/selection/active-line.js"></script>
<script src="lib/regexanalyzer/regex.js"></script>
<script src="js/codemirror/search.js"></script>
<script src="js/codemirror/search-thread.js"></script>

View File

@ -9,6 +9,7 @@
<script src="lib/lz4/lz4-block-codec-any.js"></script>
<script src="lib/punycode.js"></script>
<script src="lib/publicsuffixlist/publicsuffixlist.js"></script>
<script src="lib/regexanalyzer/regex.js"></script>
<script src="js/webext.js"></script>
<script src="js/vapi.js"></script>
<script src="js/vapi-common.js"></script>

View File

@ -274,7 +274,9 @@ CodeMirror.defineMode('ubo-static-filtering', function() {
if ( parser.patternIsRegex() ) {
stream.pos = parser.slices[parser.optionsAnchorSpan.i+1];
parserSlot = parser.optionsAnchorSpan.i;
return 'variable notice';
return parser.patternIsTokenizable()
? 'variable notice'
: 'variable warning';
}
if ( (parser.slices[parserSlot] & (parser.BITAsterisk | parser.BITCaret)) !== 0 ) {
stream.pos += parser.slices[parserSlot+2];

View File

@ -1003,6 +1003,18 @@ const Parser = class {
return (this.flavorBits & BITFlavorNetRegex) !== 0;
}
patternIsTokenizable() {
// TODO: not necessarily true, this needs more work.
if ( this.patternIsRegex === false ) { return true; }
const s = Parser.tokenizableStrFromRegex(this.getNetPattern());
try {
return /(?<![\x01%0-9A-Za-z]|^)[%0-9A-Za-z]{7,}/.test(s) ||
/(?<![\x01%0-9A-Za-z]|^)[%0-9A-Za-z]{1,6}(?![\x01%0-9A-Za-z]|$)/.test(s);
} catch(ex) {
}
return true;
}
// Returns whatever `hasBits()` reports for `this.patternBits` against
// `BITAsterisk` -- presumably whether the pattern contains a `*` wildcard
// (`hasBits` is defined elsewhere; confirm its exact semantics there).
patternHasWildcard() {
return hasBits(this.patternBits, BITAsterisk);
}
@ -2748,6 +2760,109 @@ const ExtOptionsIterator = class {
/******************************************************************************/
// Depends on:
// https://github.com/foo123/RegexAnalyzer
Parser.tokenizableStrFromRegex = (( ) => {
    // Converts a regex source string into a "tokenizable string": literal
    // parts of the regex are kept as is, while every other construct is
    // replaced by marker characters. In the output, `\x01` is treated the
    // same as a token character (`%`, digit, ASCII letter) by the edge
    // tests below, while `\x00` is not.
    //
    // The regex is parsed with `self.Regex.Analyzer`; an empty string is
    // returned when that library is unavailable or when parsing throws.

    const reTokenCharAtStart = /^[\x01%0-9A-Za-z]/;
    const reTokenCharAtEnd = /[\x01%0-9A-Za-z]$/;

    // 1 when `s` starts/ends with a token character or `\x01`, 0 otherwise.
    const startsTokenLike = s => reTokenCharAtStart.test(s) ? 1 : 0;
    const endsTokenLike = s => reTokenCharAtEnd.test(s) ? 1 : 0;

    // Two-character summary of a sub-pattern: the first character describes
    // its leading edge, the second its trailing edge (each `\x00` or `\x01`).
    const edgeSummary = (head, tail) => String.fromCharCode(head, tail);

    // Shared by alternations and character groups: an edge is flagged as
    // soon as any one of the members has a token-like character there.
    const summarizeMembers = node => {
        const members = node.val;
        let head = 0;
        let tail = 0;
        for ( let i = 0; i < members.length; i++ ) {
            const piece = toTokenizableString(members[i]);
            if ( head === 0 ) { head = startsTokenLike(piece); }
            if ( tail === 0 ) { tail = endsTokenLike(piece); }
            if ( head === 1 && tail === 1 ) { break; }
        }
        return edgeSummary(head, tail);
    };

    // One converter per node type id found in the parse tree.
    const converters = new Map([
        // T_SEQUENCE, 'Sequence': concatenation of all converted children.
        [ 1, node => {
            const children = node.val;
            let out = '';
            let i = 0;
            while ( i < children.length ) {
                out += toTokenizableString(children[i]);
                i += 1;
            }
            return out;
        } ],
        // T_ALTERNATION, 'Alternation'
        [ 2, summarizeMembers ],
        // T_CHARGROUP, 'CharacterGroup'
        [ 8, summarizeMembers ],
        // T_GROUP, 'Group': negative lookarounds collapse to a single
        // `\x01`, any other group is converted through its content.
        [ 4, node => {
            const isNegativeLookaround =
                node.flags.NegativeLookAhead === 1 ||
                node.flags.NegativeLookBehind === 1;
            return isNegativeLookaround
                ? '\x01'
                : toTokenizableString(node.val);
        } ],
        // T_QUANTIFIER, 'Quantifier': an optional sub-pattern with no
        // token-like edge vanishes, anything else becomes an edge summary.
        [ 16, node => {
            const inner = toTokenizableString(node.val);
            const head = startsTokenLike(inner);
            const tail = endsTokenLike(inner);
            const canVanish = node.flags.min === 0 && head === 0 && tail === 0;
            return canVanish ? '' : edgeSummary(head, tail);
        } ],
        // T_HEXCHAR, 'HexChar': the character for the hexadecimal value
        // found after the first character of `val`.
        [ 64, node => {
            const code = parseInt(node.val.slice(1), 16);
            return String.fromCharCode(code);
        } ],
        // T_SPECIAL, 'Special': start/end/word-boundary anchors become
        // `\x00`, every other special becomes `\x01`.
        [ 128, node => {
            const flags = node.flags;
            const isAnchor =
                flags.MatchEnd === 1 ||
                flags.MatchStart === 1 ||
                flags.MatchWordBoundary === 1;
            return isAnchor ? '\x00' : '\x01';
        } ],
        // T_CHARS, 'Characters': `\x01` as soon as one listed character is
        // token-like, `\x00` when none is.
        [ 256, node => {
            const chars = node.val;
            let i = 0;
            while ( i < chars.length ) {
                if ( startsTokenLike(chars[i]) === 1 ) { return '\x01'; }
                i += 1;
            }
            return '\x00';
        } ],
        // T_CHARRANGE, 'CharacterRange': ranges are assumed to always
        // involve token-related characters.
        [ 512, ( ) => '\x01' ],
        // T_STRING, 'String': literal text is kept verbatim.
        [ 1024, node => node.val ],
        // T_COMMENT, 'Comment': contributes nothing.
        [ 2048, ( ) => '' ],
    ]);

    // Node types without a converter are reported as `\x01`.
    const toTokenizableString = node => {
        const convert = converters.get(node.type);
        return convert === undefined ? '\x01' : convert(node);
    };

    return function(reStr) {
        const hasAnalyzer =
            self.Regex instanceof Object &&
            self.Regex.Analyzer instanceof Object;
        if ( hasAnalyzer === false ) { return ''; }
        let result = '';
        try {
            result = toTokenizableString(
                self.Regex.Analyzer(reStr, false).tree()
            );
        } catch(ex) {
            // Parsing/conversion failure: report an empty string.
            result = '';
        }
        return result;
    };
})();
/******************************************************************************/
if ( typeof vAPI === 'object' && vAPI !== null ) {
vAPI.StaticFilteringParser = Parser;
} else {

View File

@ -2622,15 +2622,9 @@ const FilterParser = class {
if ( other !== undefined ) {
return Object.assign(this, other);
}
this.cantWebsocket = vAPI.cantWebsocket;
this.noTokenHash = urlTokenizer.noTokenHash;
this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
this.reToken = /[%0-9A-Za-z]+/g;
this.reRegexTokenAbort = /[\(\)\[\]]/;
this.reRegexBadPrefix = /(^|[^\\]\.|\\[%SDWsdw]|[^\\][()*+?[\\\]{}])$/;
this.reRegexBadSuffix = /^([^\\]\.|\\[%SDWsdw]|[()*+?[\]{}]|$)/;
this.reGoodToken = /[%0-9a-z]{1,}/g;
this.domainOptList = [];
this.tokenIdToNormalizedType = new Map([
[ parser.OPTTokenCname, bitFromType('cname') ],
@ -3175,32 +3169,22 @@ const FilterParser = class {
// not `bads`.
extractTokenFromRegex() {
this.reToken.lastIndex = 0;
const pattern = this.pattern;
const pattern =
vAPI.StaticFilteringParser.tokenizableStrFromRegex(this.pattern);
let bestToken;
let bestBadness = 0x7FFFFFFF;
for (;;) {
const matches = this.reToken.exec(pattern);
if ( matches === null ) { break; }
let token = matches[0];
let prefix = pattern.slice(0, matches.index);
let suffix = pattern.slice(this.reToken.lastIndex);
if (
this.reRegexTokenAbort.test(prefix) &&
this.reRegexTokenAbort.test(suffix)
) {
const { 0: token, index } = matches;
if ( index === 0 || pattern.charAt(index - 1) === '\x01' ) {
continue;
}
if ( token.charCodeAt(0) === 0x62 /* 'b' */ ) {
const match = /\\+$/.exec(prefix);
if ( match !== null && (match[0].length & 1) !== 0 ) {
prefix += 'b';
token = token.slice(1);
}
}
const { lastIndex } = this.reToken;
if (
this.reRegexBadPrefix.test(prefix) || (
token.length < this.maxTokenLen &&
this.reRegexBadSuffix.test(suffix)
token.length < this.maxTokenLen && (
lastIndex === pattern.length ||
pattern.charAt(lastIndex) === '\x01'
)
) {
continue;

View File

@ -0,0 +1,14 @@
https://github.com/foo123/RegexAnalyzer/issues/1#issuecomment-750039255
> The (implied) license is as free as it can get. You can modify it and use
> it anywhere you want if it suits you.
>
> An attribution to original author would be appreciated but even this is not
> mandatory.
>
> Copy Left
References:
- https://en.wikipedia.org/wiki/Copyleft
- http://gplv3.fsf.org/wiki/index.php/Compatible_licenses

File diff suppressed because it is too large Load Diff