Skip to content

Commit

Permalink
feat(lists): add support for wildcard lists using a custom Trie
Browse files Browse the repository at this point in the history
A couple other Trie implementations were tested but they use more
memory and are slower. See PR 0xERR0R#1233 for details.
  • Loading branch information
ThinkChaos committed Nov 12, 2023
1 parent bafa615 commit 56fd90d
Show file tree
Hide file tree
Showing 17 changed files with 892,083 additions and 22 deletions.
7 changes: 7 additions & 0 deletions cache/stringcache/in_memory_grouped_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ func NewInMemoryGroupedRegexCache() *InMemoryGroupedCache {
}
}

// NewInMemoryGroupedWildcardCache returns an empty grouped cache whose
// per-group caches are produced by newWildcardCacheFactory.
func NewInMemoryGroupedWildcardCache() *InMemoryGroupedCache {
	grouped := new(InMemoryGroupedCache)
	grouped.caches = map[string]stringCache{}
	grouped.factoryFn = newWildcardCacheFactory

	return grouped
}

func (c *InMemoryGroupedCache) ElementCount(group string) int {
c.lock.RLock()
cache, found := c.caches[group]
Expand Down
19 changes: 19 additions & 0 deletions cache/stringcache/in_memory_grouped_cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,25 @@ var _ = Describe("In-Memory grouped cache", func() {
Expect(cache.Contains("shouldalsomatchstring2", []string{"group1"})).Should(ConsistOf("group1"))
})
})
// Exercises the wildcard flavour of the grouped cache: only entries written
// as wildcards are expected to be accepted, everything else is rejected.
When("Wildcard grouped cache is used", func() {
BeforeEach(func() {
cache = stringcache.NewInMemoryGroupedWildcardCache()
factory = cache.Refresh("group1")

// A plain string and a regex-style entry contain no '*', so AddEntry is expected to report false.
Expect(factory.AddEntry("string1")).Should(BeFalse())
Expect(factory.AddEntry("/string2/")).Should(BeFalse())
// The only entry that is expected to be accepted.
Expect(factory.AddEntry("*.string3")).Should(BeTrue())
factory.Finish()
})

It("should ignore non-wildcard", func() {
// Only the single wildcard entry is expected to be counted.
Expect(cache.ElementCount("group1")).Should(BeNumerically("==", 1))
Expect(cache.Contains("string1", []string{"group1"})).Should(BeEmpty())
Expect(cache.Contains("string2", []string{"group1"})).Should(BeEmpty())
// The wildcard is expected to match its base domain as well as any subdomain of it.
Expect(cache.Contains("string3", []string{"group1"})).Should(ConsistOf("group1"))
Expect(cache.Contains("shouldalsomatch.string3", []string{"group1"})).Should(ConsistOf("group1"))
})
})
})

Describe("Cache refresh", func() {
Expand Down
68 changes: 66 additions & 2 deletions cache/stringcache/string_caches.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strings"

"github.com/0xERR0R/blocky/log"
"github.com/0xERR0R/blocky/trie"
)

type stringCache interface {
Expand Down Expand Up @@ -119,8 +120,6 @@ func (s *stringCacheFactory) create() stringCache {
cache[k] = strings.Join(v, "")
}

s.tmp = nil

return cache
}

Expand Down Expand Up @@ -183,3 +182,68 @@ func newRegexCacheFactory() cacheFactory {
cache: make(regexCache, 0),
}
}

// wildcardCache is the cache produced by wildcardCacheFactory.create.
//
// NOTE: create builds it with a positional literal, so the field order matters.
type wildcardCache struct {
// trie is queried by contains through HasParentOf.
trie trie.Trie
// cnt is the value reported by elementCount.
cnt int
}

// elementCount returns the entry count stored in the cache when it was created.
func (cache wildcardCache) elementCount() int {
return cache.cnt
}

// contains reports whether the trie holds a parent of domain, as decided by trie.HasParentOf.
//
// NOTE(review): the grouped cache test expects the wildcard's own base domain
// to match too; confirm against the trie package.
func (cache wildcardCache) contains(domain string) bool {
return cache.trie.HasParentOf(domain)
}

// wildcardCacheFactory accumulates wildcard entries and turns them into a wildcardCache.
type wildcardCacheFactory struct {
// trie receives every accepted entry in addEntry.
trie *trie.Trie
// cnt is the number of entries inserted into trie so far.
cnt int
}

// newWildcardCacheFactory returns a factory with an empty trie that splits
// keys using trie.SplitTLD.
func newWildcardCacheFactory() cacheFactory {
	factory := new(wildcardCacheFactory)
	factory.trie = trie.NewTrie(trie.SplitTLD)

	return factory
}

// addEntry tries to register entry as a wildcard.
//
// It returns false when entry contains no '*', meaning it is not a wildcard at
// all. It returns true in every other case: either the entry was inserted, or
// it is an unsupported wildcard that was logged and dropped.
func (r *wildcardCacheFactory) addEntry(entry string) bool {
	globCount := strings.Count(entry, "*")
	if globCount == 0 {
		return false
	}

	if !strings.HasPrefix(entry, "*.") || globCount > 1 {
		log.Log().Warnf("unsupported wildcard '%s': must start with '*.' and contain no other '*'", entry)

		return true // invalid but handled
	}

	domain := normalizeWildcard(entry)
	if domain == "" {
		// Entries such as "*." leave nothing to match on. Inserting them would
		// inflate the element count with an entry that names no domain.
		log.Log().Warnf("unsupported wildcard '%s': no domain left after the '*.' prefix", entry)

		return true // invalid but handled
	}

	r.trie.Insert(domain)
	r.cnt++

	return true
}

// count returns the number of entries inserted so far.
func (r *wildcardCacheFactory) count() int {
return r.cnt
}

// create builds the cache from the entries added so far.
//
// It returns nil when nothing was inserted; otherwise the cache receives a
// copy of the factory's trie value together with the entry count.
func (r *wildcardCacheFactory) create() stringCache {
	if r.cnt == 0 {
		return nil
	}

	built := wildcardCache{trie: *r.trie, cnt: r.cnt}

	return built
}

// normalizeWildcard turns a wildcard entry into the key stored in the trie:
// the entry is passed through normalizeEntry, leading '*' characters and
// surrounding dots are stripped, and the result is lowercased.
func normalizeWildcard(domain string) string {
	withoutGlob := strings.TrimLeft(normalizeEntry(domain), "*")
	withoutDots := strings.Trim(withoutGlob, ".")

	return strings.ToLower(withoutDots)
}
233 changes: 216 additions & 17 deletions cache/stringcache/string_caches_benchmark_test.go
Original file line number Diff line number Diff line change
@@ -1,46 +1,245 @@
package stringcache

import (
"math/rand"
"context"
"fmt"
"math"
"os"
"regexp"
"runtime"
"runtime/debug"
"strings"
"testing"

"github.com/0xERR0R/blocky/lists/parsers"
"github.com/0xERR0R/blocky/log"
)

func BenchmarkStringCache(b *testing.B) {
testdata := createTestdata(10_000)
var (
	// useRealLists selects where the wildcard benchmark data comes from.
	//
	// When true, the String and Wildcard benchmarks are built from two
	// different versions of the same list. The data is therefore not strictly
	// identical, but this mirrors real use: caches are built from different
	// sources and then queried with the same domains.
	//
	// Set it to `false` to derive the wildcard data from the plain list so
	// both benchmarks use the exact same data. Results should stay similar,
	// with memory use changing the most.
	useRealLists = true

	// Benchmark inputs, populated by init.
	stringTestData, wildcardTestData, regexTestData []string

	// baseMemStats is the baseline that heap usage metrics are measured against.
	baseMemStats runtime.MemStats
)

// init loads the benchmark inputs once for the whole package.
func init() { //nolint:gochecknoinits
	// If you update either list, make sure both are the same list version (see file header).
	stringTestData = loadTestdata("../../helpertest/data/oisd-big-plain.txt")

	if !useRealLists {
		// Derive the wildcard data from the plain list so both use the same domains.
		wildcardTestData = make([]string, 0, len(stringTestData))

		for i := range stringTestData {
			wildcardTestData = append(wildcardTestData, "*."+stringTestData[i])
		}
	} else {
		// This domain is in the plain list but not in the wildcard one:
		// add it so `benchmarkCache` doesn't fail.
		wildcardTestData = append(
			loadTestdata("../../helpertest/data/oisd-big-wildcard.txt"),
			"*.btest.oisd.nl",
		)
	}

	// The OISD regex list is the wildcard list in a different format,
	// so it is generated instead of loaded.
	regexTestData = make([]string, 0, len(wildcardTestData))

	for _, wildcard := range wildcardTestData {
		escaped := regexp.QuoteMeta(strings.TrimPrefix(wildcard, "*."))

		// Produces e.g. /^(.*\.)?subdomain\.example\.com$/
		regexTestData = append(regexTestData, fmt.Sprintf(`/^(.*\.)?%s$/`, escaped))
	}
}

// --- Cache Building ---
//
// Most memory efficient: Wildcard (blocky/trie) because of peak
// Fastest: Wildcard (blocky/trie)
//
//nolint:lll
// BenchmarkRegexFactory-8 1 1 232 170 998 ns/op 430.60 fact_heap_MB 430.60 peak_heap_MB 1 792 669 136 B/op 9 826 987 allocs/op
// BenchmarkStringFactory-8 7 159 934 992 ns/op 11.79 fact_heap_MB 26.91 peak_heap_MB 67 613 644 B/op 1 305 allocs/op
// BenchmarkWildcardFactory-8 18 60 091 687 ns/op 16.61 fact_heap_MB 16.61 peak_heap_MB 26 733 498 B/op 92 213 allocs/op

// BenchmarkRegexFactory benchmarks building a cache with newRegexCacheFactory.
func BenchmarkRegexFactory(b *testing.B) {
benchmarkRegexFactory(b, newRegexCacheFactory)
}

// BenchmarkStringFactory benchmarks building a cache with newStringCacheFactory.
func BenchmarkStringFactory(b *testing.B) {
benchmarkStringFactory(b, newStringCacheFactory)
}

// BenchmarkWildcardFactory benchmarks building a cache with newWildcardCacheFactory.
func BenchmarkWildcardFactory(b *testing.B) {
benchmarkWildcardFactory(b, newWildcardCacheFactory)
}

// benchmarkRegexFactory runs benchmarkFactory with regexTestData as input for the given factory constructor.
func benchmarkRegexFactory(b *testing.B, newFactory func() cacheFactory) {
benchmarkFactory(b, regexTestData, newFactory)
}

// benchmarkStringFactory runs benchmarkFactory with stringTestData as input for the given factory constructor.
func benchmarkStringFactory(b *testing.B, newFactory func() cacheFactory) {
benchmarkFactory(b, stringTestData, newFactory)
}

// benchmarkWildcardFactory runs benchmarkFactory with wildcardTestData as input for the given factory constructor.
func benchmarkWildcardFactory(b *testing.B, newFactory func() cacheFactory) {
benchmarkFactory(b, wildcardTestData, newFactory)
}

func benchmarkFactory(b *testing.B, data []string, newFactory func() cacheFactory) {
baseMemStats = readMemStats()

b.ReportAllocs()

var (
factory cacheFactory
cache stringCache
)

for i := 0; i < b.N; i++ {
factory := newStringCacheFactory()
factory = newFactory()

for _, s := range testdata {
for _, s := range data {
if !factory.addEntry(s) {
b.Fatalf("cache didn't insert value: %s", s)
}
}

factory.create()
cache = factory.create()
}

b.StopTimer()
reportMemUsage(b, "peak", factory, cache)
reportMemUsage(b, "fact", factory) // cache will be GC'd
}

// --- Cache Querying ---
//
// Most memory efficient: String (map)
// Fastest: Wildcard (blocky/trie)
//
//nolint:lll
// BenchmarkStringCache-8 6 204 754 798 ns/op 15.11 cache_heap_MB 0 B/op 0 allocs/op
// BenchmarkWildcardCache-8 14 76 186 334 ns/op 16.61 cache_heap_MB 0 B/op 0 allocs/op

// Regex search is too slow to even complete
// func BenchmarkRegexCache(b *testing.B) {
// benchmarkRegexCache(b, newRegexCacheFactory)
// }

// BenchmarkStringCache benchmarks querying a cache built with newStringCacheFactory.
func BenchmarkStringCache(b *testing.B) {
benchmarkStringCache(b, newStringCacheFactory)
}

// BenchmarkWildcardCache benchmarks querying a cache built with newWildcardCacheFactory.
func BenchmarkWildcardCache(b *testing.B) {
benchmarkWildcardCache(b, newWildcardCacheFactory)
}

// func benchmarkRegexCache(b *testing.B, newFactory func() cacheFactory) {
// benchmarkCache(b, regexTestData, newFactory)
// }

// benchmarkStringCache runs benchmarkCache with a cache built from stringTestData.
func benchmarkStringCache(b *testing.B, newFactory func() cacheFactory) {
benchmarkCache(b, stringTestData, newFactory)
}

// benchmarkWildcardCache runs benchmarkCache with a cache built from wildcardTestData.
func benchmarkWildcardCache(b *testing.B, newFactory func() cacheFactory) {
benchmarkCache(b, wildcardTestData, newFactory)
}

// benchmarkCache measures lookups against a cache built from data by a
// factory obtained from newFactory.
//
// Building the cache happens before the timer is reset, so only the queries
// are timed. The heap used with the cache alive is reported as the "cache"
// metric.
func benchmarkCache(b *testing.B, data []string, newFactory func() cacheFactory) {
	baseMemStats = readMemStats()

	builder := newFactory()
	for idx := range data {
		builder.addEntry(data[idx])
	}

	cache := builder.create()

	b.ReportAllocs()
	b.ResetTimer()

	for remaining := b.N; remaining > 0; remaining-- {
		// Queries always come from the plain list:
		// - wildcards and regexes need a plain string query
		// - every benchmark performs the same number of queries
		for _, query := range stringTestData {
			if cache.contains(query) {
				continue
			}

			b.Fatalf("cache is missing value from stringTestData: %s", query)
		}
	}

	b.StopTimer()
	reportMemUsage(b, "cache", cache)
}

func randString(n int) string {
const charPool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-."
// ---

// readMemStats returns the runtime memory statistics, read after forcing a
// garbage collection and asking the runtime to return freed memory to the OS,
// so that unreachable objects do not inflate the numbers.
func readMemStats() (res runtime.MemStats) {
	runtime.GC()
	debug.FreeOSMemory()

	runtime.ReadMemStats(&res)

	return res
}

for i := range b {
b[i] = charPool[rand.Intn(len(charPool))]
// reportMemUsage reports how much the heap grew since baseMemStats was taken,
// as the "<prefix>_heap_MB" benchmark metric.
//
// toKeepAllocated lists the values that must still be allocated while the heap
// is measured, so they are included in the reported number.
func reportMemUsage(b *testing.B, prefix string, toKeepAllocated ...any) {
	m := readMemStats()

	// HeapAlloc is unsigned: report 0 rather than a wrapped-around huge value
	// when the heap ends up smaller than the baseline.
	var grown uint64
	if m.HeapAlloc > baseMemStats.HeapAlloc {
		grown = m.HeapAlloc - baseMemStats.HeapAlloc
	}

	b.ReportMetric(toMB(grown), prefix+"_heap_MB")

	// Forces Go to keep the values allocated until after the measurement above.
	// You can tell it works because factory benchmarks have different values for both calls.
	runtime.KeepAlive(toKeepAllocated)
}

// toMB converts a byte count to megabytes (1 MB = 1024 KB = 1024*1024 bytes).
// The value is rounded to the nearest whole KB before the final division.
func toMB(b uint64) float64 {
	const bytesInKB = float64(1024)

	kb := float64(b) / bytesInKB

	return math.Round(kb) / 1024
}

func createTestdata(count int) []string {
var result []string
func loadTestdata(path string) (res []string) {
f, err := os.Open(path)
if err != nil {
panic(err)
}
defer f.Close()

p := parsers.AllowErrors(parsers.Hosts(f), parsers.NoErrorLimit)
p.OnErr(func(err error) {
log.Log().Warnf("could not parse line in %s: %s", path, err)
})

err = parsers.ForEach[*parsers.HostsIterator](context.Background(), p, func(hosts *parsers.HostsIterator) error {
return hosts.ForEach(func(host string) error {
res = append(res, host)

for i := 0; i < count; i++ {
result = append(result, randString(8+rand.Intn(20)))
return nil
})
})
if err != nil {
panic(err)
}

return result
return res
}
Loading

0 comments on commit 56fd90d

Please sign in to comment.