Use a sequence of base 64 numbers to encode array buffers

The purpose of using a custom base128 encoder is to
convert array buffers into strings, to allow a direct
string-to-array buffer conversion at load time:

  string => array buffer

Whereas a JSON array would require an extra step:

  JSON array as string => JS array => array buffer

Turns out that the current use of a custom base128 encoding
results in a significantly larger selfie storage usage when
converting array buffers into strings.

Speculation: possibly the browser converts the strings to
be saved into JSON strings internally. Since the custom base128
encoder is likely to cause the resulting string to contain
a lot of unprintable ASCII characters, these will need to
be escaped when converted to JSON -- escaped characters
occupy more space than non-escaped ones.

Using a sequence of base 64 numbers means only printable
characters will be present in the output string, hence no
escaping is necessary. I have observed a significant
reduction in storage usage for selfie purposes.
This commit is contained in:
Raymond Hill 2019-04-20 09:06:54 -04:00
parent a0c4183cad
commit fa83744b58
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
6 changed files with 92 additions and 169 deletions

View File

@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
// Read-only
systemSettings: {
compiledMagic: 10, // Increase when compiled format changes
selfieMagic: 10 // Increase when selfie format changes
selfieMagic: 11 // Increase when selfie format changes
},
restoreBackupSettings: {

View File

@ -377,8 +377,8 @@ HNTrieContainer.prototype = {
let byteLength = shouldDecode
? decoder.decodeSize(selfie)
: selfie.length << 2;
if ( byteLength === 0 ) { return false; }
byteLength = byteLength + HNTRIE_PAGE_SIZE-1 & ~(HNTRIE_PAGE_SIZE-1);
if ( byteLength === 0 ) { return; }
if ( this.wasmMemory !== null ) {
const pageCountBefore = this.buf.length >>> 16;
const pageCountAfter = byteLength >>> 16;
@ -396,6 +396,7 @@ HNTrieContainer.prototype = {
} else {
this.buf32.set(selfie);
}
return true;
},
//--------------------------------------------------------------------------

View File

@ -2246,15 +2246,15 @@ FilterContainer.prototype.toSelfie = function(path) {
return Promise.all([
µBlock.assets.put(
`${path}/FilterHostnameDict.trieContainer`,
FilterHostnameDict.trieContainer.serialize(µBlock.base128)
FilterHostnameDict.trieContainer.serialize(µBlock.base64)
),
µBlock.assets.put(
`${path}/FilterOrigin.trieContainer`,
filterOrigin.trieContainer.serialize(µBlock.base128)
filterOrigin.trieContainer.serialize(µBlock.base64)
),
µBlock.assets.put(
`${path}/FilterBucket.trieContainer`,
FilterBucket.trieContainer.serialize(µBlock.base128)
FilterBucket.trieContainer.serialize(µBlock.base64)
),
µBlock.assets.put(
`${path}/main`,
@ -2276,27 +2276,24 @@ FilterContainer.prototype.toSelfie = function(path) {
FilterContainer.prototype.fromSelfie = function(path) {
return Promise.all([
µBlock.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details => {
µBlock.assets.get(`${path}/FilterHostnameDict.trieContainer`).then(details =>
FilterHostnameDict.trieContainer.unserialize(
details.content,
µBlock.base128
);
return true;
}),
µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details => {
µBlock.base64
)
),
µBlock.assets.get(`${path}/FilterOrigin.trieContainer`).then(details =>
filterOrigin.trieContainer.unserialize(
details.content,
µBlock.base128
);
return true;
}),
µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details => {
µBlock.base64
)
),
µBlock.assets.get(`${path}/FilterBucket.trieContainer`).then(details =>
FilterBucket.trieContainer.unserialize(
details.content,
µBlock.base128
);
return true;
}),
µBlock.base64
)
),
µBlock.assets.get(`${path}/main`).then(details => {
let selfie;
try {
@ -2869,7 +2866,9 @@ FilterContainer.prototype.benchmark = function(action) {
if ( expected !== undefined && r !== expected[i] ) {
console.log('Mismatch with reference results:');
console.log(`\tExpected ${expected[i]}, got ${r}:`);
console.log(`\turl=${fctxt.url} docOrigin=${fctxt.getDocOrigin()}`);
console.log(`\ttype=${fctxt.type}`);
console.log(`\turl=${fctxt.url}`);
console.log(`\tdocOrigin=${fctxt.getDocOrigin()}`);
}
}
const t1 = self.performance.now();

View File

@ -1036,7 +1036,7 @@
return this.assets.get(
'compiled/' + this.pslAssetKey
).then(details =>
publicSuffixList.fromSelfie(details.content, µBlock.base128)
publicSuffixList.fromSelfie(details.content, µBlock.base64)
).catch(reason => {
console.info(reason);
return false;
@ -1054,7 +1054,7 @@
publicSuffixList.parse(content, punycode.toASCII);
this.assets.put(
'compiled/' + this.pslAssetKey,
publicSuffixList.toSelfie(µBlock.base128)
publicSuffixList.toSelfie(µBlock.base64)
);
};

View File

@ -252,8 +252,8 @@ const STrieContainer = class {
let byteLength = shouldDecode
? decoder.decodeSize(selfie)
: selfie.length << 2;
if ( byteLength === 0 ) { return false; }
byteLength = byteLength + STRIE_PAGE_SIZE-1 & ~(STRIE_PAGE_SIZE-1);
if ( byteLength === 0 ) { return; }
if ( byteLength > this.buf.length ) {
this.buf = new Uint8Array(byteLength);
this.buf32 = new Uint32Array(this.buf.buffer);
@ -263,6 +263,7 @@ const STrieContainer = class {
} else {
this.buf32.set(selfie);
}
return true;
}
//--------------------------------------------------------------------------

View File

@ -530,7 +530,7 @@
/******************************************************************************/
// Custom base128 encoder/decoder
// Custom base64 encoder/decoder
//
// TODO:
// Could expand the LZ4 codec API to be able to return UTF8-safe string
@ -541,163 +541,85 @@
// JSON string. The fallback can be removed once min supported version is
// above 59.
// NOTE(review): this span is a rendered diff, not runnable code. It
// interleaves, without +/- markers, lines of the base128 object literal
// and lines of the base64 class, so braces do not pair up as written.
// Based on the commit description above, the base128 lines are presumably
// the removed side and the base64 lines the added side.
µBlock.base128 = {
encode: function(arrbuf, arrlen) {
if (
vAPI.webextFlavor.soup.has('chromium') &&
vAPI.webextFlavor.major < 60
) {
return this.encodeJSON(arrbuf);
}
return this.encodeBase128(arrbuf, arrlen);
},
encodeBase128: function(arrbuf, arrlen) {
const inbuf = new Uint8Array(arrbuf, 0, arrlen);
const inputLength = arrlen;
let _7cnt = Math.floor(inputLength / 7);
let outputLength = _7cnt * 8;
let _7rem = inputLength % 7;
if ( _7rem !== 0 ) {
outputLength += 1 + _7rem;
// Codec which serializes 32-bit words as printable text, one
// space-terminated number per word.
µBlock.base64 = new (class {
// Builds the two lookup tables: value (0-63) to character code, and
// character code (0-127) to value.
constructor() {
this.valToDigit = new Uint8Array(64);
this.digitToVal = new Uint8Array(128);
const chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz@%";
for ( let i = 0, n = chars.length; i < n; i++ ) {
const c = chars.charCodeAt(i);
this.valToDigit[i] = c;
this.digitToVal[c] = i;
}
this.magic = 'Base64_1';
}
// encode(arrbuf, arrlen): reads arrlen >>> 2 Uint32 words from arrbuf and
// returns a string made of the magic prefix, then the word count, then each
// word. Every number is emitted least-significant 6-bit digit first and is
// terminated by a space.
encode(arrbuf, arrlen) {
const inputLength = arrlen >>> 2;
const inbuf = new Uint32Array(arrbuf, 0, inputLength);
// Worst case per number is 6 digits plus 1 space, hence 7 bytes each.
const outputLength = this.magic.length + 7 + inputLength * 7;
const outbuf = new Uint8Array(outputLength);
let msbits, v;
let i = 0, j = 0;
while ( _7cnt-- ) {
v = inbuf[i+0];
msbits = (v & 0x80) >>> 7;
outbuf[j+1] = v & 0x7F;
v = inbuf[i+1];
msbits |= (v & 0x80) >>> 6;
outbuf[j+2] = v & 0x7F;
v = inbuf[i+2];
msbits |= (v & 0x80) >>> 5;
outbuf[j+3] = v & 0x7F;
v = inbuf[i+3];
msbits |= (v & 0x80) >>> 4;
outbuf[j+4] = v & 0x7F;
v = inbuf[i+4];
msbits |= (v & 0x80) >>> 3;
outbuf[j+5] = v & 0x7F;
v = inbuf[i+5];
msbits |= (v & 0x80) >>> 2;
outbuf[j+6] = v & 0x7F;
v = inbuf[i+6];
msbits |= (v & 0x80) >>> 1;
outbuf[j+7] = v & 0x7F;
outbuf[j+0] = msbits;
i += 7; j += 8;
let j = 0;
for ( let i = 0; i < this.magic.length; i++ ) {
outbuf[j++] = this.magic.charCodeAt(i);
}
if ( _7rem > 0 ) {
msbits = 0;
for ( let ir = 0; ir < _7rem; ir++ ) {
v = inbuf[i+ir];
msbits |= (v & 0x80) >>> (7 - ir);
outbuf[j+ir+1] = v & 0x7F;
}
outbuf[j+0] = msbits;
// Header: the number of 32-bit words which follow.
let v = inputLength;
do {
outbuf[j++] = this.valToDigit[v & 0b111111];
v >>>= 6;
} while ( v !== 0 );
outbuf[j++] = 0x20 /* ' ' */;
for ( let i = 0; i < inputLength; i++ ) {
v = inbuf[i];
do {
outbuf[j++] = this.valToDigit[v & 0b111111];
v >>>= 6;
} while ( v !== 0 );
outbuf[j++] = 0x20 /* ' ' */;
}
const textDecoder = new TextDecoder();
return textDecoder.decode(outbuf);
},
encodeJSON: function(arrbuf) {
return JSON.stringify(Array.from(new Uint32Array(arrbuf)));
},
// TODO:
// Surprisingly, there does not seem to be any performance gain when
// first converting the input string into a Uint8Array through
// TextEncoder. Investigate again to confirm original findings and
// to find out whether results have changed. Not using TextEncoder()
// to create an intermediate input buffer lower peak memory usage
// at selfie load time.
//
// const textEncoder = new TextEncoder();
// const inbuf = textEncoder.encode(instr);
// const inputLength = inbuf.byteLength;
decode: function(instr, arrbuf) {
if ( instr.length === 0 ) { return; }
if ( instr.charCodeAt(0) === 0x5B /* '[' */ ) {
const outbuf = this.decodeJSON(instr, arrbuf);
if ( outbuf !== undefined ) {
return outbuf;
}
// Only the first j bytes of outbuf were written; decode just that part.
return textDecoder.decode(new Uint8Array(outbuf.buffer, 0, j));
}
// decode(instr, arrbuf): parses a string produced by encode() back into a
// Uint32Array, written into arrbuf when it is an ArrayBuffer, otherwise into
// a newly allocated array. Throws if instr does not start with the magic.
decode(instr, arrbuf) {
if ( instr.startsWith(this.magic) === false ) {
throw new Error('Invalid µBlock.base64 encoding');
}
if (
vAPI.webextFlavor.soup.has('chromium') &&
vAPI.webextFlavor.major < 60
) {
throw new Error('Unexpected µBlock.base128 encoding');
}
return this.decodeBase128(instr, arrbuf);
},
decodeBase128: function(instr, arrbuf) {
const inputLength = instr.length;
let _8cnt = inputLength >>> 3;
let outputLength = _8cnt * 7;
let _8rem = inputLength % 8;
if ( _8rem !== 0 ) {
outputLength += _8rem - 1;
}
const outbuf = arrbuf instanceof ArrayBuffer === false
? new Uint8Array(outputLength)
: new Uint8Array(arrbuf);
let msbits;
let i = 0, j = 0;
while ( _8cnt-- ) {
msbits = instr.charCodeAt(i+0);
outbuf[j+0] = msbits << 7 & 0x80 | instr.charCodeAt(i+1);
outbuf[j+1] = msbits << 6 & 0x80 | instr.charCodeAt(i+2);
outbuf[j+2] = msbits << 5 & 0x80 | instr.charCodeAt(i+3);
outbuf[j+3] = msbits << 4 & 0x80 | instr.charCodeAt(i+4);
outbuf[j+4] = msbits << 3 & 0x80 | instr.charCodeAt(i+5);
outbuf[j+5] = msbits << 2 & 0x80 | instr.charCodeAt(i+6);
outbuf[j+6] = msbits << 1 & 0x80 | instr.charCodeAt(i+7);
i += 8; j += 7;
// NOTE(review): decodeSize() returns a byte count (word count << 2), yet it
// is used here as a Uint32Array element count, so the fallback array holds
// four times more elements than were encoded.
? new Uint32Array(this.decodeSize(instr))
: new Uint32Array(arrbuf);
// Skip past the word count header to the first encoded word.
// NOTE(review): because of the + 1, i is 0 and never -1 when no space is
// found, so the guard below cannot trigger as written.
let i = instr.indexOf(' ', this.magic.length) + 1;
if ( i === -1 ) {
throw new Error('Invalid µBlock.base64 encoding');
}
if ( _8rem > 1 ) {
msbits = instr.charCodeAt(i+0);
for ( let ir = 1; ir < _8rem; ir++ ) {
outbuf[j+ir-1] = msbits << (8-ir) & 0x80 | instr.charCodeAt(i+ir);
let j = 0;
for (;;) {
if ( i === inputLength ) { break; }
let v = 0, l = 0;
// NOTE(review): this inner loop only stops on a space; charCodeAt() past the
// end of the string yields NaN, so input not ending with a space would
// presumably spin forever.
for (;;) {
const c = instr.charCodeAt(i++);
if ( c === 0x20 /* ' ' */ ) { break; }
// Digits are little-endian: each one contributes 6 more significant bits.
v += this.digitToVal[c] << l;
l += 6;
}
outbuf[j++] = v;
}
return outbuf;
},
decodeJSON: function(instr, arrbuf) {
let buf;
try {
buf = JSON.parse(instr);
} catch (ex) {
}
// decodeSize(instr): returns the decoded size in bytes, read from the word
// count header which follows the magic, or 0 when the magic is absent.
// NOTE(review): as in decode(), the digit loop only stops on a space, so a
// header with no terminating space would presumably never exit.
decodeSize(instr) {
if ( instr.startsWith(this.magic) === false ) { return 0; }
let v = 0, l = 0, i = this.magic.length;
for (;;) {
const c = instr.charCodeAt(i++);
if ( c === 0x20 /* ' ' */ ) { break; }
v += this.digitToVal[c] << l;
l += 6;
}
if ( Array.isArray(buf) === false ) { return; }
const outbuf = arrbuf instanceof ArrayBuffer === false
? new Uint32Array(buf.length << 2)
: new Uint32Array(arrbuf);
outbuf.set(buf);
return new Uint8Array(outbuf.buffer);
},
decodeSize: function(instr) {
if ( instr.length === 0 ) { return 0; }
if ( instr.charCodeAt(0) === 0x5B /* '[' */ ) {
let buf;
try {
buf = JSON.parse(instr);
} catch (ex) {
}
if ( Array.isArray(buf) ) {
return buf.length << 2;
}
}
if (
vAPI.webextFlavor.soup.has('chromium') &&
vAPI.webextFlavor.major < 60
) {
throw new Error('Unexpected µBlock.base128 encoding');
}
const size = (instr.length >>> 3) * 7;
const rem = instr.length & 7;
return rem === 0 ? size : size + rem - 1;
},
};
// Word count to byte count.
return v << 2;
}
})();
/******************************************************************************/