mirror of https://github.com/gorhill/uBlock.git
Improve extraction of tokens from regex-based filters
Regex-based static network filters are those most likely to cause performance degradation, and as such the best guard against undue performance degradation caused by regex-based filters is the ability to extract valid and good tokens from regex patterns. This commit introduces a complete regex parser so that the static network filtering engine can now safely extract tokens regardless of the complexity of the regex pattern. The regex parser is a library imported from: https://github.com/foo123/RegexAnalyzer The syntax highlighter adds an underline to regex-based filters as a visual aid to filter authors so as to avoid mistakenly creating regex-based filters. This commit further colors the underline as a warning when a regex-based filter is found to be untokenizable. Filter list authors are invited to spot these untokenizable regex-based filters in their lists to verify that no mistakes were made for those filters, causing them to be untokenizable. For example, what appears to be a mistake: /^https?:\/\/.*\/sw.js?.[a-zA-Z0-9%]{50,}/ Though the mistake is minor, the regex-based filter above is untokenizable as a result, and becomes tokenizable when the `.` is properly escaped: /^https?:\/\/.*\/sw\.js?.[a-zA-Z0-9%]{50,}/ Filter list authors can use this search expression in the asset viewer to find instances of regex-based filters: /^(@@)?\/[^\n]+\/(\$|$)/
This commit is contained in:
parent
fdcb110feb
commit
426395aa03
|
@ -49,6 +49,7 @@
|
|||
<script src="lib/codemirror/addon/scroll/annotatescrollbar.js"></script>
|
||||
<script src="lib/codemirror/addon/search/searchcursor.js"></script>
|
||||
<script src="lib/codemirror/addon/selection/active-line.js"></script>
|
||||
<script src="lib/regexanalyzer/regex.js"></script>
|
||||
|
||||
<script src="js/codemirror/search.js"></script>
|
||||
<script src="js/codemirror/search-thread.js"></script>
|
||||
|
|
|
@ -39,6 +39,7 @@
|
|||
<div class="li"><span><a href="https://github.com/rsms/inter" target="_blank">Inter font family</a> by <a href="https://github.com/rsms">Rasmus Andersson</a></span></div>
|
||||
<div class="li"><span><a href="https://fontawesome.com/" target="_blank">FontAwesome font family</a> by <a href="https://github.com/davegandy">Dave Gandy</a></span></div>
|
||||
<div class="li"><span><a href="https://github.com/Swatinem/diff" target="_blank">An implementation of Myers' diff algorithm</a> by <a href="https://github.com/Swatinem">Arpad Borsos</a></span></div>
|
||||
<div class="li"><span><a href="https://github.com/foo123/RegexAnalyzer" target="_blank">Regular Expression Analyzer</a> by <a href="https://github.com/foo123">Nikos M.</a></span></div>
|
||||
</div>
|
||||
<hr>
|
||||
<div id="dev">
|
||||
|
|
|
@ -33,6 +33,7 @@
|
|||
<script src="lib/codemirror/addon/scroll/annotatescrollbar.js"></script>
|
||||
<script src="lib/codemirror/addon/search/searchcursor.js"></script>
|
||||
<script src="lib/codemirror/addon/selection/active-line.js"></script>
|
||||
<script src="lib/regexanalyzer/regex.js"></script>
|
||||
|
||||
<script src="js/codemirror/search.js"></script>
|
||||
<script src="js/codemirror/search-thread.js"></script>
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
<script src="lib/lz4/lz4-block-codec-any.js"></script>
|
||||
<script src="lib/punycode.js"></script>
|
||||
<script src="lib/publicsuffixlist/publicsuffixlist.js"></script>
|
||||
<script src="lib/regexanalyzer/regex.js"></script>
|
||||
<script src="js/webext.js"></script>
|
||||
<script src="js/vapi.js"></script>
|
||||
<script src="js/vapi-common.js"></script>
|
||||
|
|
|
@ -274,7 +274,9 @@ CodeMirror.defineMode('ubo-static-filtering', function() {
|
|||
if ( parser.patternIsRegex() ) {
|
||||
stream.pos = parser.slices[parser.optionsAnchorSpan.i+1];
|
||||
parserSlot = parser.optionsAnchorSpan.i;
|
||||
return 'variable notice';
|
||||
return parser.patternIsTokenizable()
|
||||
? 'variable notice'
|
||||
: 'variable warning';
|
||||
}
|
||||
if ( (parser.slices[parserSlot] & (parser.BITAsterisk | parser.BITCaret)) !== 0 ) {
|
||||
stream.pos += parser.slices[parserSlot+2];
|
||||
|
|
|
@ -1003,6 +1003,18 @@ const Parser = class {
|
|||
return (this.flavorBits & BITFlavorNetRegex) !== 0;
|
||||
}
|
||||
|
||||
patternIsTokenizable() {
|
||||
// TODO: not necessarily true, this needs more work.
|
||||
if ( this.patternIsRegex === false ) { return true; }
|
||||
const s = Parser.tokenizableStrFromRegex(this.getNetPattern());
|
||||
try {
|
||||
return /(?<![\x01%0-9A-Za-z]|^)[%0-9A-Za-z]{7,}/.test(s) ||
|
||||
/(?<![\x01%0-9A-Za-z]|^)[%0-9A-Za-z]{1,6}(?![\x01%0-9A-Za-z]|$)/.test(s);
|
||||
} catch(ex) {
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
patternHasWildcard() {
|
||||
return hasBits(this.patternBits, BITAsterisk);
|
||||
}
|
||||
|
@ -2748,6 +2760,109 @@ const ExtOptionsIterator = class {
|
|||
|
||||
/******************************************************************************/
|
||||
|
||||
// Depends on:
// https://github.com/foo123/RegexAnalyzer

Parser.tokenizableStrFromRegex = (( ) => {
    // Translate a regex AST (as produced by the RegexAnalyzer library)
    // into a plain string usable by the token extractor. In the derived
    // string, '\x01' stands for "some token-eligible character(s)",
    // '\x00' for a known token boundary, and literal string parts are
    // kept as-is.

    const reEligibleFirst = /^[\x01%0-9A-Za-z]/;
    const reEligibleLast = /[\x01%0-9A-Za-z]$/;

    // 1 when the string starts with a token-eligible character, else 0.
    const firstCharCodeClass = s => reEligibleFirst.test(s) ? 1 : 0;

    // 1 when the string ends with a token-eligible character, else 0.
    const lastCharCodeClass = s => reEligibleLast.test(s) ? 1 : 0;

    const toTokenizableString = node => {
        switch ( node.type ) {
        // T_SEQUENCE, 'Sequence': concatenation of all children.
        case 1: {
            let out = '';
            for ( const child of node.val ) {
                out += toTokenizableString(child);
            }
            return out;
        }
        // T_ALTERNATION, 'Alternation' / T_CHARGROUP, 'CharacterGroup':
        // branches may differ in content, so only first/last character
        // eligibility is derived, as a two-character string.
        case 2:
        case 8: {
            let head = 0;
            let tail = 0;
            for ( const child of node.val ) {
                const str = toTokenizableString(child);
                if ( head === 0 ) { head = firstCharCodeClass(str); }
                if ( tail === 0 ) { tail = lastCharCodeClass(str); }
                if ( head === 1 && tail === 1 ) { break; }
            }
            return String.fromCharCode(head, tail);
        }
        // T_GROUP, 'Group': negative look-arounds match nothing concrete.
        case 4: {
            const { flags } = node;
            if ( flags.NegativeLookAhead === 1 ) { return '\x01'; }
            if ( flags.NegativeLookBehind === 1 ) { return '\x01'; }
            return toTokenizableString(node.val);
        }
        // T_QUANTIFIER, 'Quantifier': an optional, non-eligible part
        // contributes nothing at all.
        case 16: {
            const str = toTokenizableString(node.val);
            const head = firstCharCodeClass(str);
            const tail = lastCharCodeClass(str);
            if ( node.flags.min === 0 && head === 0 && tail === 0 ) {
                return '';
            }
            return String.fromCharCode(head, tail);
        }
        // T_HEXCHAR, 'HexChar': decode into the actual character.
        case 64:
            return String.fromCharCode(parseInt(node.val.slice(1), 16));
        // T_SPECIAL, 'Special': anchors and word boundaries are known
        // token boundaries; anything else is treated as eligible.
        case 128: {
            const { flags } = node;
            if ( flags.MatchEnd === 1 ) { return '\x00'; }
            if ( flags.MatchStart === 1 ) { return '\x00'; }
            if ( flags.MatchWordBoundary === 1 ) { return '\x00'; }
            return '\x01';
        }
        // T_CHARS, 'Characters': eligible if any member is eligible.
        case 256: {
            for ( const ch of node.val ) {
                if ( firstCharCodeClass(ch) === 1 ) { return '\x01'; }
            }
            return '\x00';
        }
        // T_CHARRANGE, 'CharacterRange': ranges are assumed to always
        // involve token-related characters.
        case 512:
            return '\x01';
        // T_STRING, 'String': literal text, usable as-is.
        case 1024:
            return node.val;
        // T_COMMENT, 'Comment': contributes nothing.
        case 2048:
            return '';
        default:
            break;
        }
        // Unknown node type: assume token-eligible content.
        return '\x01';
    };

    return function(reStr) {
        // Bail out when the RegexAnalyzer library is not present.
        if (
            self.Regex instanceof Object === false ||
            self.Regex.Analyzer instanceof Object === false
        ) {
            return '';
        }
        // The analyzer may throw on exotic patterns; treat that as
        // "no tokenizable string can be derived".
        try {
            return toTokenizableString(self.Regex.Analyzer(reStr, false).tree());
        } catch(ex) {
        }
        return '';
    };
})();
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
if ( typeof vAPI === 'object' && vAPI !== null ) {
|
||||
vAPI.StaticFilteringParser = Parser;
|
||||
} else {
|
||||
|
|
|
@ -2622,15 +2622,9 @@ const FilterParser = class {
|
|||
if ( other !== undefined ) {
|
||||
return Object.assign(this, other);
|
||||
}
|
||||
this.cantWebsocket = vAPI.cantWebsocket;
|
||||
this.noTokenHash = urlTokenizer.noTokenHash;
|
||||
this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
|
||||
this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
|
||||
this.reToken = /[%0-9A-Za-z]+/g;
|
||||
this.reRegexTokenAbort = /[\(\)\[\]]/;
|
||||
this.reRegexBadPrefix = /(^|[^\\]\.|\\[%SDWsdw]|[^\\][()*+?[\\\]{}])$/;
|
||||
this.reRegexBadSuffix = /^([^\\]\.|\\[%SDWsdw]|[()*+?[\]{}]|$)/;
|
||||
this.reGoodToken = /[%0-9a-z]{1,}/g;
|
||||
this.domainOptList = [];
|
||||
this.tokenIdToNormalizedType = new Map([
|
||||
[ parser.OPTTokenCname, bitFromType('cname') ],
|
||||
|
@ -3175,32 +3169,22 @@ const FilterParser = class {
|
|||
// not `bads`.
|
||||
extractTokenFromRegex() {
|
||||
this.reToken.lastIndex = 0;
|
||||
const pattern = this.pattern;
|
||||
const pattern =
|
||||
vAPI.StaticFilteringParser.tokenizableStrFromRegex(this.pattern);
|
||||
let bestToken;
|
||||
let bestBadness = 0x7FFFFFFF;
|
||||
for (;;) {
|
||||
const matches = this.reToken.exec(pattern);
|
||||
if ( matches === null ) { break; }
|
||||
let token = matches[0];
|
||||
let prefix = pattern.slice(0, matches.index);
|
||||
let suffix = pattern.slice(this.reToken.lastIndex);
|
||||
if (
|
||||
this.reRegexTokenAbort.test(prefix) &&
|
||||
this.reRegexTokenAbort.test(suffix)
|
||||
) {
|
||||
const { 0: token, index } = matches;
|
||||
if ( index === 0 || pattern.charAt(index - 1) === '\x01' ) {
|
||||
continue;
|
||||
}
|
||||
if ( token.charCodeAt(0) === 0x62 /* 'b' */ ) {
|
||||
const match = /\\+$/.exec(prefix);
|
||||
if ( match !== null && (match[0].length & 1) !== 0 ) {
|
||||
prefix += 'b';
|
||||
token = token.slice(1);
|
||||
}
|
||||
}
|
||||
const { lastIndex } = this.reToken;
|
||||
if (
|
||||
this.reRegexBadPrefix.test(prefix) || (
|
||||
token.length < this.maxTokenLen &&
|
||||
this.reRegexBadSuffix.test(suffix)
|
||||
token.length < this.maxTokenLen && (
|
||||
lastIndex === pattern.length ||
|
||||
pattern.charAt(lastIndex) === '\x01'
|
||||
)
|
||||
) {
|
||||
continue;
|
||||
|
|
|
@ -0,0 +1,14 @@
|
|||
https://github.com/foo123/RegexAnalyzer/issues/1#issuecomment-750039255
|
||||
|
||||
> The (implied) license is as free as it can get. You can modify it and use
|
||||
> it anywhere you want if it suits you.
|
||||
>
|
||||
> An attribution to original author would be appreciated but even this is not
|
||||
> mandatory.
|
||||
>
|
||||
> Copy Left
|
||||
|
||||
References:
|
||||
|
||||
- https://en.wikipedia.org/wiki/Copyleft
|
||||
- http://gplv3.fsf.org/wiki/index.php/Compatible_licenses
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue