Increase resolution of known-token lookup table

Related commit:
- 69a43e07c4

Using 32 bits of the token hash rather than just the lower 16 bits does help discard more unknown tokens.

Using the default filter lists, the known-token lookup table is populated by 12,276 entries out of 65,536, which makes the case that, theoretically, there are a lot of possible tokens which can be discarded.

In practice, running the built-in staticNetFilteringEngine.benchmark() with default filter lists, I find that 1,518,929 tokens were skipped out of 4,441,891 extracted tokens, or 34%.
2019-04-27 08:18:01 -04:00 · 2019-04-27 08:18:01 -04:00 · 96dce22218
parent 60938451ab
commit 96dce22218
3 changed files with 13 additions and 10 deletions
--- a/src/js/background.js
+++ b/src/js/background.js
@@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
        // Read-only
        systemSettings: {
            compiledMagic: 12,  // Increase when compiled format changes
-            selfieMagic: 12     // Increase when selfie format changes
+            selfieMagic: 13     // Increase when selfie format changes
        },

        restoreBackupSettings: {
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@@ -2326,7 +2326,6 @@ FilterContainer.prototype.freeze = function() {
    const filterDataHolderId = FilterDataHolder.fid;
    const redirectTypeValue = typeNameToTypeValue.redirect;
    const unserialize = µb.CompiledLineIO.unserialize;
-    const knownTokens = this.urlTokenizer.knownTokens;

    for ( const line of this.goodFilters ) {
        if ( this.badFilters.has(line) ) {
@@ -2358,7 +2357,7 @@ FilterContainer.prototype.freeze = function() {
                entry.next = bucket;
            }
            this.dataFilters.set(tokenHash, entry);
-            knownTokens[tokenHash & 0xFFFF] = 1;
+            this.urlTokenizer.addKnownToken(tokenHash);
            continue;
        }

@@ -2405,7 +2404,7 @@ FilterContainer.prototype.freeze = function() {
            continue;
        }

-        knownTokens[tokenHash & 0xFFFF] = 1;
+        this.urlTokenizer.addKnownToken(tokenHash);

        if ( entry === undefined ) {
            bucket.set(tokenHash, filterFromCompiledData(fdata));
--- a/src/js/utils.js
+++ b/src/js/utils.js
@@ -81,11 +81,15 @@

    resetKnownTokens() {
        this.knownTokens.fill(0);
-        this.knownTokens[this.dotTokenHash & 0xFFFF] = 1;
-        this.knownTokens[this.anyTokenHash & 0xFFFF] = 1;
-        this.knownTokens[this.anyHTTPSTokenHash & 0xFFFF] = 1;
-        this.knownTokens[this.anyHTTPTokenHash & 0xFFFF] = 1;
-        this.knownTokens[this.noTokenHash & 0xFFFF] = 1;
+        this.addKnownToken(this.dotTokenHash);
+        this.addKnownToken(this.anyTokenHash);
+        this.addKnownToken(this.anyHTTPSTokenHash);
+        this.addKnownToken(this.anyHTTPTokenHash);
+        this.addKnownToken(this.noTokenHash);
+    }
+
+    addKnownToken(th) {
+        this.knownTokens[th & 0xFFFF ^ th >>> 16] = 1;
    }

    // Tokenize on demand.
@@ -172,7 +176,7 @@
                th = th * 64 + v;
                n += 1;
            }
-            if ( knownTokens[th & 0xFFFF] !== 0 ) {
+            if ( knownTokens[th & 0xFFFF ^ th >>> 16] !== 0 ) {
                tokens[j+0] = th;
                tokens[j+1] = ti;
                j += 2;