mirror of https://github.com/gorhill/uBlock.git
Improve extraction of tokens from regexes
Fixed flawed extraction of tokens from regexes with optional sequences, i.e. when the quantifier could be zero.

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/2367

Ignore look-around sequences, as suggested, when normalizing a regex into a tokenizable string.

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/2368

Fix the regex analyzer throwing on a trailing `-` in a character class sequence.

Related issue:
- https://github.com/AdguardTeam/AdguardFilters/pull/134630
This commit is contained in:
parent
560c1fe888
commit
d51b7e082b
|
@ -3009,11 +3009,11 @@ Parser.utils = Parser.prototype.utils = (( ) => {
|
||||||
|
|
||||||
class regex {
|
class regex {
|
||||||
static firstCharCodeClass(s) {
|
static firstCharCodeClass(s) {
|
||||||
return /^[\x01%0-9A-Za-z]/.test(s) ? 1 : 0;
|
return /^[\x01\x03%0-9A-Za-z]/.test(s) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static lastCharCodeClass(s) {
|
static lastCharCodeClass(s) {
|
||||||
return /[\x01%0-9A-Za-z]$/.test(s) ? 1 : 0;
|
return /[\x01\x03%0-9A-Za-z]$/.test(s) ? 1 : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static tokenizableStrFromNode(node) {
|
static tokenizableStrFromNode(node) {
|
||||||
|
@ -3042,18 +3042,24 @@ Parser.utils = Parser.prototype.utils = (( ) => {
|
||||||
return String.fromCharCode(firstChar, lastChar);
|
return String.fromCharCode(firstChar, lastChar);
|
||||||
}
|
}
|
||||||
case 4: /* T_GROUP, 'Group' */ {
|
case 4: /* T_GROUP, 'Group' */ {
|
||||||
if ( node.flags.NegativeLookAhead === 1 ) { return '\x01'; }
|
if (
|
||||||
if ( node.flags.NegativeLookBehind === 1 ) { return '\x01'; }
|
node.flags.LookAhead === 1 ||
|
||||||
|
node.flags.NegativeLookAhead === 1 ||
|
||||||
|
node.flags.LookBehind === 1 ||
|
||||||
|
node.flags.NegativeLookBehind === 1
|
||||||
|
) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
return this.tokenizableStrFromNode(node.val);
|
return this.tokenizableStrFromNode(node.val);
|
||||||
}
|
}
|
||||||
case 16: /* T_QUANTIFIER, 'Quantifier' */ {
|
case 16: /* T_QUANTIFIER, 'Quantifier' */ {
|
||||||
const s = this.tokenizableStrFromNode(node.val);
|
const s = this.tokenizableStrFromNode(node.val);
|
||||||
const first = this.firstCharCodeClass(s);
|
const first = this.firstCharCodeClass(s);
|
||||||
const last = this.lastCharCodeClass(s);
|
const last = this.lastCharCodeClass(s);
|
||||||
if ( node.flags.min === 0 && first === 0 && last === 0 ) {
|
if ( node.flags.min !== 0 ) {
|
||||||
return '';
|
return String.fromCharCode(first, last);
|
||||||
}
|
}
|
||||||
return String.fromCharCode(first, last);
|
return String.fromCharCode(first+2, last+2);
|
||||||
}
|
}
|
||||||
case 64: /* T_HEXCHAR, 'HexChar' */ {
|
case 64: /* T_HEXCHAR, 'HexChar' */ {
|
||||||
return String.fromCharCode(parseInt(node.val.slice(1), 16));
|
return String.fromCharCode(parseInt(node.val.slice(1), 16));
|
||||||
|
@ -3142,13 +3148,33 @@ Parser.utils = Parser.prototype.utils = (( ) => {
|
||||||
|
|
||||||
static toTokenizableStr(reStr) {
|
static toTokenizableStr(reStr) {
|
||||||
if ( regexAnalyzer === null ) { return ''; }
|
if ( regexAnalyzer === null ) { return ''; }
|
||||||
|
let s = '';
|
||||||
try {
|
try {
|
||||||
return this.tokenizableStrFromNode(
|
s = this.tokenizableStrFromNode(
|
||||||
regexAnalyzer(reStr, false).tree()
|
regexAnalyzer(reStr, false).tree()
|
||||||
);
|
);
|
||||||
} catch(ex) {
|
} catch(ex) {
|
||||||
}
|
}
|
||||||
return '';
|
// Process optional sequences
|
||||||
|
const reOptional = /[\x02\x03]+/g;
|
||||||
|
for (;;) {
|
||||||
|
const match = reOptional.exec(s);
|
||||||
|
if ( match === null ) { break; }
|
||||||
|
const left = s.slice(0, match.index);
|
||||||
|
const middle = match[0];
|
||||||
|
const right = s.slice(match.index + middle.length);
|
||||||
|
s = left;
|
||||||
|
s += this.firstCharCodeClass(right) === 1 ||
|
||||||
|
this.firstCharCodeClass(middle) === 1
|
||||||
|
? '\x01'
|
||||||
|
: '\x00';
|
||||||
|
s += this.lastCharCodeClass(left) === 1 ||
|
||||||
|
this.lastCharCodeClass(middle) === 1
|
||||||
|
? '\x01'
|
||||||
|
: '\x00';
|
||||||
|
s += right;
|
||||||
|
}
|
||||||
|
return s;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1019,6 +1019,12 @@ var rnd = function( a, b ){ return Math.round((b-a)*Math.random()+a); },
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ( isRange && escaped === false && ']' === ch )
|
||||||
|
{
|
||||||
|
isRange = false;
|
||||||
|
chars.push('-');
|
||||||
|
}
|
||||||
|
|
||||||
if ( isRange )
|
if ( isRange )
|
||||||
{
|
{
|
||||||
if ( chars.length )
|
if ( chars.length )
|
||||||
|
|
Loading…
Reference in New Issue