Assume UTF-8 when no encoding can be looked up.

This makes HTML filtering and the `replace=` filter option less
likely to be skipped by uBO: previously, the response body filterer
required an encoding to be explicitly declared before it would act
on the response body.

UTF-8 usage is currently reported as ~98.2%:
https://w3techs.com/technologies/history_overview/character_encoding
This commit is contained in:
Raymond Hill 2024-03-05 11:11:42 -05:00
parent b95a1e987f
commit 63acdcbdeb
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
1 changed file with 10 additions and 6 deletions

View File

@@ -749,7 +749,7 @@ const bodyFilterer = (( ) => {
/* t */ if ( bytes[i+6] !== 0x74 ) { continue; }
break;
}
if ( (i - 40) >= 65536 ) { return; }
if ( (i + 40) >= 65536 ) { return; }
i += 8;
// find first alpha character
let j = -1;
@@ -827,13 +827,17 @@ const bodyFilterer = (( ) => {
}
if ( this.status !== 'finishedtransferringdata' ) { return; }
// If encoding is still unknown, try to extract from stream data
// If encoding is still unknown, try to extract from stream data.
// Just assume utf-8 if ultimately no encoding can be looked up.
if ( session.charset === undefined ) {
const charsetFound = charsetFromStream(session.buffer);
if ( charsetFound === undefined ) { return streamClose(session); }
const charsetUsed = textEncode.normalizeCharset(charsetFound);
if ( charsetUsed === undefined ) { return streamClose(session); }
session.charset = charsetUsed;
if ( charsetFound !== undefined ) {
const charsetUsed = textEncode.normalizeCharset(charsetFound);
if ( charsetUsed === undefined ) { return streamClose(session); }
session.charset = charsetUsed;
} else {
session.charset = 'utf-8';
}
}
while ( session.jobs.length !== 0 ) {