Use a sequence of base 64 numbers to encode array buffers

The purpose of using a custom base128 encoder is to convert array buffers into strings, to allow a direct string-to-array-buffer conversion at load time (string => array buffer), whereas a JSON array would require an extra step (JSON array as string => JS array => array buffer). It turns out that the current use of a custom base128 encoding results in significantly larger selfie storage usage when converting array buffers into strings. Speculation: the browser possibly converts the strings to be saved into JSON strings internally. Since the custom base128 encoder is likely to cause the resulting string to contain a lot of unprintable ASCII characters, these will need to be escaped when converted to JSON -- and escaped characters occupy more space than non-escaped ones. Using a sequence of base 64 numbers means only printable characters will be present in the output string, hence no escaping is necessary. I have observed a significant reduction in storage usage for selfie purposes.
2019-04-20 09:06:54 -04:00 · 2019-04-20 09:06:54 -04:00 · fa83744b58
parent a0c4183cad
commit fa83744b58
6 changed files with 92 additions and 169 deletions
--- a/src/js/background.js
+++ b/src/js/background.js
@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
        // Read-only
        systemSettings: {
            compiledMagic: 10,  // Increase when compiled format changes
-            selfieMagic: 10     // Increase when selfie format changes
+            selfieMagic: 11     // Increase when selfie format changes
        },

        restoreBackupSettings: {
--- a/src/js/hntrie.js
+++ b/src/js/hntrie.js
@ -377,8 +377,8 @@ HNTrieContainer.prototype = {
        let byteLength = shouldDecode
            ? decoder.decodeSize(selfie)
            : selfie.length << 2;
+        if ( byteLength === 0 ) { return false; }
        byteLength = byteLength + HNTRIE_PAGE_SIZE-1 & ~(HNTRIE_PAGE_SIZE-1);
-        if ( byteLength === 0 ) { return; }
        if ( this.wasmMemory !== null ) {
            const pageCountBefore = this.buf.length >>> 16;
            const pageCountAfter = byteLength >>> 16;
@ -396,6 +396,7 @@ HNTrieContainer.prototype = {
        } else {
            this.buf32.set(selfie);
        }
+        return true;
    },

    //--------------------------------------------------------------------------
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@ -2246,15 +2246,15 @@ FilterContainer.prototype.toSelfie = function(path) {
    return Promise.all([
        µBlock.assets.put(
            `${path}/FilterHostnameDict.trieContainer`,
-            FilterHostnameDict.trieContainer.serialize(µBlock.base128)
+            FilterHostnameDict.trieContainer.serialize(µBlock.base64)
        ),
        µBlock.assets.put(
            `${path}/FilterOrigin.trieContainer`,
-            filterOrigin.trieContainer.serialize(µBlock.base128)
+            filterOrigin.trieContainer.serialize(µBlock.base64)
        ),
        µBlock.assets.put(
            `${path}/FilterBucket.trieContainer`,
-            FilterBucket.trieContainer.serialize(µBlock.base128)
+            FilterBucket.trieContainer.serialize(µBlock.base64)
        ),
        µBlock.assets.put(
            `${path}/main`,
@ -2276,27 +2276,24 @@ FilterContainer.prototype.toSelfie = function(path) {

 FilterContainer.prototype.fromSelfie = function(path) {
    return Promise.all([
-        µBlock.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details => {
+        µBlock.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details =>
            FilterHostnameDict.trieContainer.unserialize(
                details.content,
-                µBlock.base128
-            );
-            return true;
-        }),
-        µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details => {
+                µBlock.base64
+            )
+        ),
+        µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details =>
            filterOrigin.trieContainer.unserialize(
                details.content,
-                µBlock.base128
-            );
-            return true;
-        }),
-        µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details => {
+                µBlock.base64
+            )
+        ),
+        µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details =>
            FilterBucket.trieContainer.unserialize(
                details.content,
-                µBlock.base128
-            );
-            return true;
-        }),
+                µBlock.base64
+            )
+        ),
        µBlock.assets.get(`${path}/main`).then(details => {
            let selfie;
            try {
@ -2869,7 +2866,9 @@ FilterContainer.prototype.benchmark = function(action) {
            if ( expected !== undefined && r !== expected[i] ) {
                console.log('Mismatch with reference results:');
                console.log(`\tExpected ${expected[i]}, got ${r}:`);
-                console.log(`\turl=${fctxt.url} docOrigin=${fctxt.getDocOrigin()}`);
+                console.log(`\ttype=${fctxt.type}`);
+                console.log(`\turl=${fctxt.url}`);
+                console.log(`\tdocOrigin=${fctxt.getDocOrigin()}`);
            }
        }
        const t1 = self.performance.now();
--- a/src/js/storage.js
+++ b/src/js/storage.js
@ -1036,7 +1036,7 @@
    return this.assets.get(
        'compiled/' + this.pslAssetKey
    ).then(details =>
-        publicSuffixList.fromSelfie(details.content, µBlock.base128)
+        publicSuffixList.fromSelfie(details.content, µBlock.base64)
    ).catch(reason => {
        console.info(reason);
        return false;
@ -1054,7 +1054,7 @@
    publicSuffixList.parse(content, punycode.toASCII);
    this.assets.put(
        'compiled/' + this.pslAssetKey,
-        publicSuffixList.toSelfie(µBlock.base128)
+        publicSuffixList.toSelfie(µBlock.base64)
    );
 };

--- a/src/js/strie.js
+++ b/src/js/strie.js
@ -252,8 +252,8 @@ const STrieContainer = class {
        let byteLength = shouldDecode
            ? decoder.decodeSize(selfie)
            : selfie.length << 2;
+        if ( byteLength === 0 ) { return false; }
        byteLength = byteLength + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
-        if ( byteLength === 0 ) { return; }
        if ( byteLength > this.buf.length ) {
            this.buf = new Uint8Array(byteLength);
            this.buf32 = new Uint32Array(this.buf.buffer);
@ -263,6 +263,7 @@ const STrieContainer = class {
        } else {
            this.buf32.set(selfie);
        }
+        return true;
    }

    //--------------------------------------------------------------------------
--- a/src/js/utils.js
+++ b/src/js/utils.js
@ -530,7 +530,7 @@

 /******************************************************************************/

-// Custom base128 encoder/decoder
+// Custom base64 encoder/decoder
 //
 // TODO:
 //   Could expand the LZ4 codec API to be able to return UTF8-safe string
@ -541,163 +541,85 @@
 //   JSON string. The fallback can be removed once min supported version is
 //   above 59.

-µBlock.base128 = {
-    encode: function(arrbuf, arrlen) {
-        if (
-            vAPI.webextFlavor.soup.has('chromium') &&
-            vAPI.webextFlavor.major < 60
-        ) {
-            return this.encodeJSON(arrbuf);
-        }
-        return this.encodeBase128(arrbuf, arrlen);
-    },
-    encodeBase128: function(arrbuf, arrlen) {
-        const inbuf = new Uint8Array(arrbuf, 0, arrlen);
-        const inputLength = arrlen;
-        let _7cnt = Math.floor(inputLength / 7);
-        let outputLength = _7cnt * 8;
-        let _7rem = inputLength % 7;
-        if ( _7rem !== 0 ) {
-            outputLength += 1 + _7rem;
+µBlock.base64 = new (class {
+    constructor() {
+        this.valToDigit = new Uint8Array(64);
+        this.digitToVal = new Uint8Array(128);
+        const chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz@%";
+        for ( let i = 0, n = chars.length; i < n; i++ ) {
+            const c = chars.charCodeAt(i);
+            this.valToDigit[i] = c;
+            this.digitToVal[c] = i;
        }
+        this.magic = 'Base64_1';
+    }
+
+    encode(arrbuf, arrlen) {
+        const inputLength = arrlen >>> 2;
+        const inbuf = new Uint32Array(arrbuf, 0, inputLength);
+        const outputLength = this.magic.length + 7 + inputLength * 7;
        const outbuf = new Uint8Array(outputLength);
-        let msbits, v;
-        let i = 0, j = 0;
-        while ( _7cnt--  ) {
-            v = inbuf[i+0];
-            msbits  = (v & 0x80) >>> 7;
-            outbuf[j+1] = v & 0x7F;
-            v = inbuf[i+1];
-            msbits |= (v & 0x80) >>> 6;
-            outbuf[j+2] = v & 0x7F;
-            v = inbuf[i+2];
-            msbits |= (v & 0x80) >>> 5;
-            outbuf[j+3] = v & 0x7F;
-            v = inbuf[i+3];
-            msbits |= (v & 0x80) >>> 4;
-            outbuf[j+4] = v & 0x7F;
-            v = inbuf[i+4];
-            msbits |= (v & 0x80) >>> 3;
-            outbuf[j+5] = v & 0x7F;
-            v = inbuf[i+5];
-            msbits |= (v & 0x80) >>> 2;
-            outbuf[j+6] = v & 0x7F;
-            v = inbuf[i+6];
-            msbits |= (v & 0x80) >>> 1;
-            outbuf[j+7] = v & 0x7F;
-            outbuf[j+0] = msbits;
-            i += 7; j += 8;
+        let j = 0;
+        for ( let i = 0; i < this.magic.length; i++ ) {
+            outbuf[j++] = this.magic.charCodeAt(i);
        }
-        if ( _7rem > 0 ) {
-            msbits = 0;
-            for ( let ir = 0; ir < _7rem; ir++ ) {
-                v = inbuf[i+ir];
-                msbits |= (v & 0x80) >>> (7 - ir);
-                outbuf[j+ir+1] = v & 0x7F;
-            }
-            outbuf[j+0] = msbits;
+        let v = inputLength;
+        do {
+            outbuf[j++] = this.valToDigit[v & 0b111111];
+            v >>>= 6;
+        } while ( v !== 0 );
+        outbuf[j++] = 0x20 /* ' ' */;
+        for ( let i = 0; i < inputLength; i++ ) {
+            v = inbuf[i];
+            do {
+                outbuf[j++] = this.valToDigit[v & 0b111111];
+                v >>>= 6;
+            } while ( v !== 0 );
+            outbuf[j++] = 0x20 /* ' ' */;
        }
        const textDecoder = new TextDecoder();
-        return textDecoder.decode(outbuf);
-    },
-    encodeJSON: function(arrbuf) {
-        return JSON.stringify(Array.from(new Uint32Array(arrbuf)));
-    },
-    // TODO:
-    //   Surprisingly, there does not seem to be any performance gain when
-    //   first converting the input string into a Uint8Array through
-    //   TextEncoder. Investigate again to confirm original findings and
-    //   to find out whether results have changed. Not using TextEncoder()
-    //   to create an intermediate input buffer lower peak memory usage
-    //   at selfie load time.
-    //
-    //   const textEncoder = new TextEncoder();
-    //   const inbuf = textEncoder.encode(instr);
-    //   const inputLength = inbuf.byteLength;
-    decode: function(instr, arrbuf) {
-        if ( instr.length === 0 ) { return; }
-        if ( instr.charCodeAt(0) === 0x5B /* '[' */ ) {
-            const outbuf = this.decodeJSON(instr, arrbuf);
-            if ( outbuf !== undefined ) {
-                return outbuf;
-            }
+        return textDecoder.decode(new Uint8Array(outbuf.buffer, 0, j));
+    }
+
+    decode(instr, arrbuf) {
+        if ( instr.startsWith(this.magic) === false ) {
+            throw new Error('Invalid µBlock.base64 encoding');
        }
-        if (
-            vAPI.webextFlavor.soup.has('chromium') &&
-            vAPI.webextFlavor.major < 60
-        ) {
-            throw new Error('Unexpected µBlock.base128 encoding');
-        }
-        return this.decodeBase128(instr, arrbuf);
-    },
-    decodeBase128: function(instr, arrbuf) {
        const inputLength = instr.length;
-        let _8cnt = inputLength >>> 3;
-        let outputLength = _8cnt * 7;
-        let _8rem = inputLength % 8;
-        if ( _8rem !== 0 ) {
-            outputLength += _8rem - 1;
-        }
        const outbuf = arrbuf instanceof ArrayBuffer === false
-            ? new Uint8Array(outputLength)
-            : new Uint8Array(arrbuf);
-        let msbits;
-        let i = 0, j = 0;
-        while ( _8cnt-- ) {
-            msbits = instr.charCodeAt(i+0);
-            outbuf[j+0] = msbits << 7 & 0x80 | instr.charCodeAt(i+1);
-            outbuf[j+1] = msbits << 6 & 0x80 | instr.charCodeAt(i+2);
-            outbuf[j+2] = msbits << 5 & 0x80 | instr.charCodeAt(i+3);
-            outbuf[j+3] = msbits << 4 & 0x80 | instr.charCodeAt(i+4);
-            outbuf[j+4] = msbits << 3 & 0x80 | instr.charCodeAt(i+5);
-            outbuf[j+5] = msbits << 2 & 0x80 | instr.charCodeAt(i+6);
-            outbuf[j+6] = msbits << 1 & 0x80 | instr.charCodeAt(i+7);
-            i += 8; j += 7;
+            ? new Uint32Array(this.decodeSize(instr))
+            : new Uint32Array(arrbuf);
+        let i = instr.indexOf(' ', this.magic.length) + 1;
+        if ( i === -1 ) {
+            throw new Error('Invalid µBlock.base64 encoding');
        }
-        if ( _8rem > 1 ) {
-            msbits = instr.charCodeAt(i+0);
-            for ( let ir = 1; ir < _8rem; ir++ ) {
-                outbuf[j+ir-1] = msbits << (8-ir) & 0x80 | instr.charCodeAt(i+ir);
+        let j = 0;
+        for (;;) {
+            if ( i === inputLength ) { break; }
+            let v = 0, l = 0;
+            for (;;) {
+                const c = instr.charCodeAt(i++);
+                if ( c === 0x20 /* ' ' */ ) { break; }
+                v += this.digitToVal[c] << l;
+                l += 6;
            }
+            outbuf[j++] = v;
        }
        return outbuf;
-    },
-    decodeJSON: function(instr, arrbuf) {
-        let buf;
-        try {
-            buf = JSON.parse(instr);
-        } catch (ex) {
+    }
+
+    decodeSize(instr) {
+        if ( instr.startsWith(this.magic) === false ) { return 0; }
+        let v = 0, l = 0, i = this.magic.length;
+        for (;;) {
+            const c = instr.charCodeAt(i++);
+            if ( c === 0x20 /* ' ' */ ) { break; }
+            v += this.digitToVal[c] << l;
+            l += 6;
        }
-        if ( Array.isArray(buf) === false ) { return; }
-        const outbuf = arrbuf instanceof ArrayBuffer === false
-            ? new Uint32Array(buf.length << 2)
-            : new Uint32Array(arrbuf);
-        outbuf.set(buf);
-        return new Uint8Array(outbuf.buffer);
-    },
-    decodeSize: function(instr) {
-        if ( instr.length === 0 ) { return 0; }
-        if ( instr.charCodeAt(0) === 0x5B /* '[' */ ) {
-            let buf;
-            try {
-                buf = JSON.parse(instr);
-            } catch (ex) {
-            }
-            if ( Array.isArray(buf) ) {
-                return buf.length << 2;
-            }
-        }
-        if (
-            vAPI.webextFlavor.soup.has('chromium') &&
-            vAPI.webextFlavor.major < 60
-        ) {
-            throw new Error('Unexpected µBlock.base128 encoding');
-        }
-        const size = (instr.length >>> 3) * 7;
-        const rem = instr.length & 7;
-        return rem === 0 ? size : size + rem - 1;
-    },
-};
+        return v << 2;
+    }
+})();

 /******************************************************************************/