Do not store impossible-to-match filters in HNTrie

Consider the two following filters:

    example.com
    www.example.com

This commit makes it so that if the first filter is
already present in a given HNTrie, the second filter
will not be stored, since HNTrie will _always_
return the first filter as a match whenever the
hostname to match is example.com or any subdomain
of example.com.

The detection of such pointless filters is
virtually free when adding a hostname to an HNTrie
instance (given how data is stored in the trie), so
in practice no overhead is incurred to detect such
pointless filters.

The ability to ignore impossible-to-match filters
in HNTrie instances will _especially_ benefit those
using large hosts files.

Examples of how this helps using real configurations:

- Default lists:
  444 filters out of 100,382 were ignored as a result
  of this commit.

- Default lists + "Energized Ultimate Protection":
  283,669 filters out of 903,235 were ignored as a
  result of this commit.

Side note: There was no measurable difference between
the two configurations above in the performance of
the matching algorithm as reported by the built-in
benchmark tool.
This commit is contained in:
Raymond Hill 2019-04-29 13:15:16 -04:00
parent c4f9ae706a
commit adabb56dc9
No known key found for this signature in database
GPG Key ID: 25E1490B761470C2
4 changed files with 85 additions and 38 deletions

View File

@ -138,7 +138,7 @@ const µBlock = (function() { // jshint ignore:line
// Read-only
systemSettings: {
compiledMagic: 13, // Increase when compiled format changes
selfieMagic: 13 // Increase when selfie format changes
selfieMagic: 14 // Increase when selfie format changes
},
restoreBackupSettings: {

View File

@ -175,7 +175,8 @@ const HNTrieContainer = class {
matchesJS(iroot) {
const char0 = this.buf32[HNTRIE_CHAR0_SLOT];
let ineedle = this.buf[255];
let icell = iroot;
let icell = this.buf32[iroot+0];
if ( icell === 0 ) { return -1; }
for (;;) {
if ( ineedle === 0 ) { return -1; }
ineedle -= 1;
@ -238,12 +239,6 @@ const HNTrieContainer = class {
addJS(iroot) {
let lhnchar = this.buf[255];
if ( lhnchar === 0 ) { return 0; }
let icell = iroot;
// special case: first node in trie
if ( this.buf32[icell+2] === 0 ) {
this.buf32[icell+2] = this.addSegment(lhnchar);
return 1;
}
// grow buffer if needed
if (
(this.buf32[HNTRIE_CHAR0_SLOT] - this.buf32[HNTRIE_TRIE1_SLOT]) < 24 ||
@ -251,6 +246,12 @@ const HNTrieContainer = class {
) {
this.growBuf(24, 256);
}
let icell = this.buf32[iroot+0];
// special case: first node in trie
if ( icell === 0 ) {
this.buf32[iroot+0] = this.addCell(0, 0, this.addSegment(lhnchar));
return 1;
}
//
const char0 = this.buf32[HNTRIE_CHAR0_SLOT];
let inext;
@ -259,6 +260,9 @@ const HNTrieContainer = class {
const vseg = this.buf32[icell+2];
// skip boundary cells
if ( vseg === 0 ) {
// remainder is at label boundary? if yes, no need to add
// the rest since the shortest match is always reported
if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
icell = this.buf32[icell+1];
continue;
}
@ -303,6 +307,9 @@ const HNTrieContainer = class {
icell = inext;
continue;
}
// remainder is at label boundary? if yes, no need to add
// the rest since the shortest match is always reported
if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
// boundary cell + needle remainder
inext = this.addCell(0, 0, 0);
this.buf32[icell+1] = inext;
@ -550,7 +557,7 @@ HNTrieContainer.prototype.HNTrieRef = class {
}
add(hn) {
if ( this.container.setNeedle(hn).add(this.iroot) === 1 ) {
if ( this.container.setNeedle(hn).add(this.iroot) > 0 ) {
this.last = -1;
this.needle = '';
this.size += 1;
@ -560,7 +567,7 @@ HNTrieContainer.prototype.HNTrieRef = class {
}
addJS(hn) {
if ( this.container.setNeedle(hn).addJS(this.iroot) === 1 ) {
if ( this.container.setNeedle(hn).addJS(this.iroot) > 0 ) {
this.last = -1;
this.needle = '';
this.size += 1;
@ -570,7 +577,7 @@ HNTrieContainer.prototype.HNTrieRef = class {
}
addWASM(hn) {
if ( this.container.setNeedle(hn).addWASM(this.iroot) === 1 ) {
if ( this.container.setNeedle(hn).addWASM(this.iroot) > 0 ) {
this.last = -1;
this.needle = '';
this.size += 1;

Binary file not shown.

View File

@ -51,8 +51,9 @@
;; offset.
;;
(func (export "matches")
(param $icell i32) ;; offset to root cell of the trie
(param $iroot i32) ;; offset to root cell of the trie
(result i32) ;; result = match index, -1 = miss
(local $icell i32) ;; offset to the current cell
(local $char0 i32) ;; offset to first character data
(local $ineedle i32) ;; current needle offset
(local $c i32)
@ -64,15 +65,24 @@
i32.const 264 ;; start of char section is stored at addr 264
i32.load
set_local $char0
;; $icell is an index into an array of 32-bit values
get_local $icell
i32.const 2
i32.shl
set_local $icell
;; let ineedle = this.buf[255];
i32.const 255 ;; addr of needle is stored at addr 255
i32.load8_u
set_local $ineedle
;; let icell = this.buf32[iroot+0];
get_local $iroot
i32.const 2
i32.shl
i32.load
i32.const 2
i32.shl
tee_local $icell
;; if ( icell === 0 ) { return -1; }
i32.eqz
if
i32.const -1
return
end
;; for (;;) {
block $noSegment loop $nextSegment
;; if ( ineedle === 0 ) { return -1; }
@ -244,8 +254,9 @@
;; Add a new hostname to a trie which root cell is passed as argument.
;;
(func (export "add")
(param $icell i32) ;; index of root cell of the trie
(param $iroot i32) ;; index of root cell of the trie
(result i32) ;; result: 0 not added, 1 = added
(local $icell i32) ;; index of current cell in the trie
(local $lhnchar i32) ;; number of characters left to process in hostname
(local $char0 i32) ;; offset to start of character data section
(local $vseg i32) ;; integer value describing a segment
@ -264,24 +275,6 @@
i32.const 0
return
end
;; let icell = iroot;
get_local $icell
i32.const 2
i32.shl
tee_local $icell
;; if ( this.buf32[icell+2] === 0 ) {
i32.load offset=8
i32.eqz
if
;;this.buf32[icell+2] = this.addSegment(lhnchar);
;; return 1;
get_local $icell
get_local $lhnchar
call $addSegment
i32.store offset=8
i32.const 1
return
end
;; if (
;; (this.buf32[HNBIGTRIE_CHAR0_SLOT] - this.buf32[HNBIGTRIE_TRIE1_SLOT]) < 24 ||
;; (this.buf.length - this.buf32[HNBIGTRIE_CHAR1_SLOT]) < 256
@ -310,6 +303,30 @@
call $growBuf
end
end
;; let icell = this.buf32[iroot+0];
get_local $iroot
i32.const 2
i32.shl
tee_local $iroot
i32.load
i32.const 2
i32.shl
tee_local $icell
;; if ( this.buf32[icell+2] === 0 ) {
i32.eqz
if
;; this.buf32[iroot+0] = this.addCell(0, 0, this.addSegment(lhnchar));
;; return 1;
get_local $iroot
i32.const 0
i32.const 0
get_local $lhnchar
call $addSegment
call $addCell
i32.store
i32.const 1
return
end
;; const char0 = this.buf32[HNBIGTRIE_CHAR0_SLOT];
i32.const 264
i32.load
@ -323,6 +340,19 @@
;; if ( vseg === 0 ) {
i32.eqz
if
;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
get_local $lhnchar
i32.const -1
i32.add
i32.load8_u
i32.const 0x2E
i32.eq
if
i32.const -1
return
end
;; icell = this.buf32[icell+1];
;; continue;
get_local $icell
i32.load offset=4
i32.const 2
@ -463,13 +493,23 @@
else
;; if ( inext !== 0 ) {
get_local $inext
i32.eqz
if else
if
;; icell = inext;
get_local $inext
set_local $icell
br $nextSegment
end
;; if ( this.buf[lhnchar-1] === 0x2E /* '.' */ ) { return -1; }
get_local $lhnchar
i32.const -1
i32.add
i32.load8_u
i32.const 0x2E
i32.eq
if
i32.const -1
return
end
;; inext = this.addCell(0, 0, 0);
;; this.buf32[icell+1] = inext;
get_local $icell