Improve validation of hostnames in `domain=` and `denyallow=` options

Related issue:
- https://github.com/uBlockOrigin/uBlock-issues/issues/1249

For "exotic" hostname values (those which do not match a plain ASCII
hostname pattern), the browser's own URL API is ultimately used to
validate and normalize them.
This commit is contained in:
Raymond Hill 2020-09-18 10:23:02 -04:00
parent 7e906b33c5
commit 3f299ef623
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
2 changed files with 102 additions and 97 deletions

View File

@ -108,6 +108,10 @@ const Parser = class {
this.reHostsSource = /^[^\x00-\x24\x26-\x29\x2B\x2C\x2F\x3A-\x40\x5B-\x5E\x60\x7B-\x7F]+$/;
this.reUnicodeChar = /[^\x00-\x7F]/;
this.reUnicodeChars = /[^\x00-\x7F]/g;
this.reHostnameLabel = /[^.]+/g;
this.rePlainHostname = /^(?:[\w-]+\.)*[a-z]+$/;
this.rePlainEntity = /^(?:[\w-]+\.)+\*$/;
this.reEntity = /^[^*]+\.\*$/;
this.punycoder = new URL(self.location);
this.selectorCompiler = new this.SelectorCompiler(this);
// TODO: reuse for network filtering analysis
@ -313,7 +317,7 @@ const Parser = class {
analyzeExtExtra() {
if ( this.hasOptions() ) {
const { i, len } = this.optionsSpan;
this.analyzeDomainList(i, i + len, BITComma, 0b11);
this.analyzeDomainList(i, i + len, BITComma, 0b1110);
}
if ( hasBits(this.flavorBits, BITFlavorUnsupported) ) {
this.markSpan(this.patternSpan, BITError);
@ -668,66 +672,62 @@ const Parser = class {
}
}
// bits:
// 0: can use entity-based hostnames
// 1: can use single wildcard
analyzeDomain(from, to, optionBits) {
const { slices } = this;
let len = to - from;
if ( len === 0 ) { return false; }
const not = hasBits(slices[from], BITTilde);
if ( not ) {
if ( (optionBits & 0b01) === 0 || slices[from+2] > 1 ) { return false; }
from += 3;
len -= 3;
analyzeDomain(from, to, modeBits) {
if ( to === from ) { return false; }
return this.normalizeHostnameValue(
this.strFromSlices(from, to - 3),
modeBits
) !== undefined;
}
// Ultimately, let the browser API do the hostname normalization, after
// making some other trivial checks.
//
// modeBits:
// 0: can use wildcard at any position
// 1: can use entity-based hostnames
// 2: can use single wildcard
// 3: can be negated
//
// A bare `return;` (i.e. `undefined`) is used to signal a rejected value.
// Values matching one of the "plain" patterns are returned as-is (`s`);
// anything else is returned as normalized by the URL object held in
// `this.punycoder`.
normalizeHostnameValue(s, modeBits = 0b0000) {
// A leading '~' marks a negated value, accepted only when bit 3 is set.
const not = s.charCodeAt(0) === 0x7E /* '~' */;
if ( not && (modeBits & 0b1000) === 0 ) { return; }
let hn = not === false ? s : s.slice(1);
// Fast path: a plain hostname is returned unchanged, without going
// through the browser API.
if ( this.rePlainHostname.test(hn) ) { return s; }
const hasWildcard = hn.lastIndexOf('*') !== -1;
if ( hasWildcard ) {
// No mode bit set at all: wildcards are not accepted in any form.
if ( modeBits === 0 ) { return; }
// A lone '*' requires bit 2 and cannot be negated.
if ( hn.length === 1 ) {
if ( not || (modeBits & 0b0100) === 0 ) { return; }
return s;
}
if ( (modeBits & 0b0010) !== 0 ) {
if ( this.rePlainEntity.test(hn) ) { return s; }
if ( this.reEntity.test(hn) === false ) { return; }
} else if ( (modeBits & 0b0001) === 0 ) {
return;
}
// Each '*' is swapped for a placeholder before the value is handed to
// the URL object; the placeholder is turned back into '*' afterwards.
hn = hn.replace(/\*/g, '__asterisk__');
}
// Reset the hostname to a known sentinel first: if the assignment in the
// `try` block leaves the hostname untouched, the `hn === '_'` test below
// catches it and the value is rejected.
// NOTE(review): relies on the URL hostname setter ignoring values it
// cannot parse rather than throwing -- confirm against the URL spec.
this.punycoder.hostname = '_';
try {
this.punycoder.hostname = hn;
hn = this.punycoder.hostname;
} catch (_) {
return;
}
if ( hn === '_' || hn === '' ) { return; }
if ( hasWildcard ) {
hn = this.punycoder.hostname.replace(/__asterisk__/g, '*');
}
if ( len === 0 ) { return false; }
// One slice only, check for single asterisk
if (
len === 3 &&
not === false &&
(optionBits & 0b10) !== 0 &&
hasBits(slices[from], BITAsterisk)
(modeBits & 0b0001) === 0 && (
hn.charCodeAt(0) === 0x2E /* '.' */ ||
hn.charCodeAt(hn.length - 1) === 0x2E /* '.' */
)
) {
return slices[from+2] === 1;
return;
}
// First slice must be regex-equivalent of `\w`
if ( hasNoBits(slices[from], BITRegexWord | BITUnicode) ) { return false; }
// Last slice
if ( len > 3 ) {
const last = to - 3;
if ( hasBits(slices[last], BITAsterisk) ) {
if (
(optionBits & 0b01) === 0 ||
len < 9 ||
slices[last+2] > 1 ||
hasNoBits(slices[last-3], BITPeriod)
) {
return false;
}
} else if ( hasNoBits(slices[to-3], BITAlphaNum | BITUnicode) ) {
return false;
}
}
// Middle slices
if ( len > 6 ) {
for ( let i = from + 3; i < to - 3; i += 3 ) {
const bits = slices[i];
if ( hasNoBits(bits, BITHostname) ) { return false; }
if ( hasBits(bits, BITPeriod) && slices[i+2] > 1 ) {
return false;
}
if (
hasBits(bits, BITDash) && (
hasNoBits(slices[i-3], BITRegexWord | BITUnicode) ||
hasNoBits(slices[i+3], BITRegexWord | BITUnicode)
)
) {
return false;
}
}
}
return true;
// Put back the '~' prefix which was stripped from `hn` at the top.
return not ? '~' + hn : hn;
}
slice(raw) {
@ -1081,6 +1081,8 @@ const Parser = class {
// Be ready to deal with non-punycode-able Unicode characters.
// https://github.com/uBlockOrigin/uBlock-issues/issues/772
// Encode Unicode characters beyond the hostname part.
// Prepend with '*' character to prevent the browser API from refusing to
// punycode -- this occurs when the extracted label starts with a dash.
toASCII(dryrun = false) {
if ( this.patternHasUnicode() === false ) { return true; }
const { i, len } = this.patternSpan;
@ -1090,16 +1092,14 @@ const Parser = class {
// Punycode hostname part of the pattern.
if ( patternIsRegex === false ) {
const match = this.reHostname.exec(pattern);
if ( match === null ) { return true; }
try {
this.punycoder.hostname = match[0].replace(/\*/g, '__asterisk__');
} catch(ex) {
return false;
if ( match !== null ) {
const hn = match[0].replace(this.reHostnameLabel, s => {
if ( this.reUnicodeChar.test(s) === false ) { return s; }
if ( s.charCodeAt(0) === 0x2D /* '-' */ ) { s = '*' + s; }
return this.normalizeHostnameValue(s, 0b0001) || s;
});
pattern = hn + pattern.slice(match.index + match[0].length);
}
const hn = this.punycoder.hostname;
if ( hn === '' ) { return false; }
const punycoded = hn.replace(/__asterisk__/g, '*');
pattern = punycoded + pattern.slice(match.index + match[0].length);
}
// Percent-encode remaining Unicode characters.
if ( this.reUnicodeChar.test(pattern) ) {
@ -1755,7 +1755,6 @@ const BITError = 1 << 31;
const BITAll = 0xFFFFFFFF;
const BITAlphaNum = BITNum | BITAlpha;
const BITRegexWord = BITAlphaNum | BITUnderscore;
const BITHostname = BITNum | BITAlpha | BITUppercase | BITDash | BITPeriod | BITUnderscore | BITUnicode;
const BITPatternToken = BITNum | BITAlpha | BITPercent;
const BITLineComment = BITExclamation | BITHash | BITSquareBracket;
@ -2226,7 +2225,7 @@ const NetOptionsIterator = class {
if ( this.interactive && hasBits(descriptor, OPTDomainList) ) {
this.parser.analyzeDomainList(
lval + 3, i, BITPipe,
(descriptor & 0xFF) === OPTTokenDomain ? 0b01 : 0b00
(descriptor & 0xFF) === OPTTokenDomain ? 0b1010 : 0b0000
);
}
} else {

View File

@ -20,7 +20,6 @@
*/
/* jshint bitwise: false */
/* global punycode */
'use strict';
@ -1120,12 +1119,12 @@ const filterOrigin = (( ) => {
this.trieContainer = new µb.HNTrieContainer();
}
compile(domainOpt, prepend, units) {
compile(domainOptList, prepend, units) {
const hostnameHits = [];
const hostnameMisses = [];
const entityHits = [];
const entityMisses = [];
for ( const s of FilterParser.domainOptIterator(domainOpt) ) {
for ( const s of domainOptList ) {
const len = s.length;
const beg = len > 1 && s.charCodeAt(0) === 0x7E ? 1 : 0;
const end = len > 2 &&
@ -1770,7 +1769,7 @@ const FilterDenyAllow = class {
}
// Build the compiled form of this filter unit: a two-element array made of
// the class's `fid` followed by the deny-allow option value read from
// `details`.
static compile(details) {
return [ FilterDenyAllow.fid, details.denyallow ];
return [ FilterDenyAllow.fid, details.denyallowOpt ];
}
static fromCompiled(args) {
@ -2074,17 +2073,15 @@ const FILTER_SEQUENCES_MIN = filterSequenceWritePtr;
const FilterParser = class {
constructor(parser) {
this.cantWebsocket = vAPI.cantWebsocket;
this.domainOpt = '';
this.noTokenHash = urlTokenizer.noTokenHash;
this.reBadDomainOptChars = /[+?^${}()[\]\\]/;
this.reIsolateHostname = /^(\*?\.)?([^\x00-\x24\x26-\x2C\x2F\x3A-\x5E\x60\x7B-\x7F]+)(.*)/;
this.reHasUnicode = /[^\x00-\x7F]/;
this.reBadCSP = /(?:=|;)\s*report-(?:to|uri)\b/;
this.reRegexToken = /[%0-9A-Za-z]{2,}/g;
this.reRegexTokenAbort = /[([]/;
this.reRegexBadPrefix = /(^|[^\\]\.|[*?{}\\])$/;
this.reRegexBadSuffix = /^([^\\]\.|\\[dw]|[([{}?*.]|$)/;
this.reGoodToken = /[%0-9a-z]{1,}/g;
this.domainOptList = [];
this.tokenIdToNormalizedType = new Map([
[ parser.OPTTokenCname, bitFromType('cname') ],
[ parser.OPTTokenCss, bitFromType('stylesheet') ],
@ -2237,7 +2234,7 @@ const FilterParser = class {
this.thirdParty = false;
this.party = AnyParty;
this.domainOpt = '';
this.denyallow = '';
this.denyallowOpt = '';
this.isPureHostname = false;
this.isRegex = false;
this.redirect = 0;
@ -2291,20 +2288,24 @@ const FilterParser = class {
}
}
parseHostnameList(parser, s) {
if ( parser.optionHasUnicode() ) {
const hostnames = s.split('|');
let i = hostnames.length;
while ( i-- ) {
if ( this.reHasUnicode.test(hostnames[i]) ) {
hostnames[i] = punycode.toASCII(hostnames[i]);
}
parseHostnameList(parser, s, modeBits, out = []) {
let beg = 0;
let slen = s.length;
let i = 0;
while ( beg < slen ) {
let end = s.indexOf('|', beg);
if ( end === -1 ) { end = slen; }
const hn = parser.normalizeHostnameValue(
s.slice(beg, end),
modeBits
);
if ( hn !== undefined ) {
out[i] = hn; i += 1;
}
s = hostnames.join('|');
beg = end + 1;
}
// TODO: revisit
if ( this.reBadDomainOptChars.test(s) ) { return ''; }
return s;
out.length = i;
return i === 1 ? out[0] : out.join('|');
}
parseOptions(parser) {
@ -2337,12 +2338,17 @@ const FilterParser = class {
// Detect and discard filter if domain option contains nonsensical
// characters.
case parser.OPTTokenDomain:
this.domainOpt = this.parseHostnameList(parser, val);
this.domainOpt = this.parseHostnameList(
parser,
val,
0b1010,
this.domainOptList
);
if ( this.domainOpt === '' ) { return false; }
break;
case parser.OPTTokenDenyAllow:
this.denyallow = this.parseHostnameList(parser, val);
if ( this.denyallow === '' ) { return false; }
this.denyallowOpt = this.parseHostnameList(parser, val, 0b0000);
if ( this.denyallowOpt === '' ) { return false; }
break;
// https://www.reddit.com/r/uBlockOrigin/comments/d6vxzj/
// Add support for `elemhide`. Rarely used but it happens.
@ -2559,7 +2565,7 @@ const FilterParser = class {
isJustOrigin() {
return this.isRegex === false &&
this.dataType === undefined &&
this.denyallow === '' &&
this.denyallowOpt === '' &&
this.domainOpt !== '' && (
this.pattern === '*' || (
this.anchor === 0b010 &&
@ -2961,7 +2967,7 @@ FilterContainer.prototype.compile = function(parser, writer) {
if (
parsed.isPureHostname &&
parsed.domainOpt === '' &&
parsed.denyallow === '' &&
parsed.denyallowOpt === '' &&
parsed.dataType === undefined
) {
parsed.tokenHash = this.dotTokenHash;
@ -2990,7 +2996,7 @@ FilterContainer.prototype.compile = function(parser, writer) {
parsed.tokenHash = this.anyHTTPTokenHash;
}
const entities = [];
for ( const hn of FilterParser.domainOptIterator(parsed.domainOpt) ) {
for ( const hn of parsed.domainOptList ) {
if ( parsed.domainIsEntity(hn) === false ) {
this.compileToAtomicFilter(parsed, hn, writer);
} else {
@ -3004,7 +3010,7 @@ FilterContainer.prototype.compile = function(parser, writer) {
const units = [];
filterPattern.compile(parsed, units);
if ( leftAnchored ) { units.push(FilterAnchorLeft.compile()); }
filterOrigin.compile(entity, true, units);
filterOrigin.compile([ entity ], true, units);
this.compileToAtomicFilter(
parsed, FilterCompositeAll.compile(units), writer
);
@ -3034,14 +3040,14 @@ FilterContainer.prototype.compile = function(parser, writer) {
// Origin
if ( parsed.domainOpt !== '' ) {
filterOrigin.compile(
parsed.domainOpt,
parsed.domainOptList,
units.length !== 0 && filterClasses[units[0][0]].isSlow === true,
units
);
}
// Deny-allow
if ( parsed.denyallow !== '' ) {
if ( parsed.denyallowOpt !== '' ) {
units.push(FilterDenyAllow.compile(parsed));
}