Assume UTF-8 when no encoding can be looked up.

This will make HTML filtering and the `replace=` filter option less likely to be bypassed by uBO, as the body response filterer previously required an encoding to be expressly declared before acting on the response body. UTF-8 usage is currently reported as ~98.2%: https://w3techs.com/technologies/history_overview/character_encoding
2024-03-05 11:11:42 -05:00 · 2024-03-05 11:11:42 -05:00 · 63acdcbdeb
parent b95a1e987f
commit 63acdcbdeb
1 changed file with 10 additions and 6 deletions
--- a/src/js/traffic.js
+++ b/src/js/traffic.js
@@ -749,7 +749,7 @@ const bodyFilterer = (( ) => {
            /* t */ if ( bytes[i+6] !== 0x74 ) { continue; }
            break;
        }
-        if ( (i - 40) >= 65536 ) { return; }
+        if ( (i + 40) >= 65536 ) { return; }
        i += 8;
        // find first alpha character
        let j = -1;
@@ -827,13 +827,17 @@ const bodyFilterer = (( ) => {
        }
        if ( this.status !== 'finishedtransferringdata' ) { return; }

-        // If encoding is still unknown, try to extract from stream data
+        // If encoding is still unknown, try to extract from stream data.
+        // Just assume utf-8 if ultimately no encoding can be looked up.
        if ( session.charset === undefined ) {
            const charsetFound = charsetFromStream(session.buffer);
-            if ( charsetFound === undefined ) { return streamClose(session); }
-            const charsetUsed = textEncode.normalizeCharset(charsetFound);
-            if ( charsetUsed === undefined ) { return streamClose(session); }
-            session.charset = charsetUsed;
+            if ( charsetFound !== undefined ) {
+                const charsetUsed = textEncode.normalizeCharset(charsetFound);
+                if ( charsetUsed === undefined ) { return streamClose(session); }
+                session.charset = charsetUsed;
+            } else {
+                session.charset = 'utf-8';
+            }
        }

        while ( session.jobs.length !== 0 ) {