move elastic sync to workers instead of threads, parallel elastic delete sync, reimplement partial elastic sync
This commit is contained in:
parent
d16eaf614e
commit
e9db83f09b
|
@ -5,6 +5,7 @@ import (
|
||||||
"crazyfs/Workers"
|
"crazyfs/Workers"
|
||||||
"crazyfs/api/helpers"
|
"crazyfs/api/helpers"
|
||||||
"crazyfs/config"
|
"crazyfs/config"
|
||||||
|
"crazyfs/elastic"
|
||||||
"crypto/sha256"
|
"crypto/sha256"
|
||||||
"crypto/subtle"
|
"crypto/subtle"
|
||||||
"net/http"
|
"net/http"
|
||||||
|
@ -37,6 +38,13 @@ func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request) {
|
||||||
"size": Workers.Queue.GetQueueSize(),
|
"size": Workers.Queue.GetQueueSize(),
|
||||||
},
|
},
|
||||||
"initialCrawlElapsed": config.InitialCrawlElapsed,
|
"initialCrawlElapsed": config.InitialCrawlElapsed,
|
||||||
|
"elastic": map[string]interface{}{
|
||||||
|
"busy": elastic.BusyWorkers,
|
||||||
|
"alive": config.GetConfig().ElasticsearchSyncThreads,
|
||||||
|
"queue": map[string]interface{}{
|
||||||
|
"size": elastic.Queue.GetQueueSize(),
|
||||||
|
},
|
||||||
|
},
|
||||||
}
|
}
|
||||||
w.Header().Set("Cache-Control", "no-store")
|
w.Header().Set("Cache-Control", "no-store")
|
||||||
w.Header().Set("Content-Type", "application/json")
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
|
|
@ -5,6 +5,7 @@ import (
|
||||||
"crazyfs/SharedCache"
|
"crazyfs/SharedCache"
|
||||||
"crazyfs/Workers"
|
"crazyfs/Workers"
|
||||||
"crazyfs/config"
|
"crazyfs/config"
|
||||||
|
"crazyfs/elastic"
|
||||||
"sync"
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
@ -51,7 +52,14 @@ func startCrawl(wg *sync.WaitGroup, crawlerChan chan struct{}) {
|
||||||
func logCacheStatus(msg string, ticker *time.Ticker, logFn func(format string, args ...interface{})) {
|
func logCacheStatus(msg string, ticker *time.Ticker, logFn func(format string, args ...interface{})) {
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
logFn("%s - %d/%d items in the cache. Busy Workers: %d. Jobs queued: %d. Running crawls: %d.",
|
if !config.GetConfig().ElasticsearchSyncEnable {
|
||||||
msg, len(SharedCache.Cache.Keys()), config.GetConfig().CacheSize, Workers.BusyWorkers, Workers.Queue.GetQueueSize(), DirectoryCrawler.GetTotalActiveCrawls())
|
logStr := "%s - %d/%d items in the cache. Busy Workers: %d. Jobs queued: %d. Running crawls: %d."
|
||||||
|
logFn(logStr,
|
||||||
|
msg, len(SharedCache.Cache.Keys()), config.GetConfig().CacheSize, Workers.BusyWorkers, Workers.Queue.GetQueueSize(), DirectoryCrawler.GetTotalActiveCrawls())
|
||||||
|
} else {
|
||||||
|
logStr := "%s - %d/%d items in the cache. Busy Workers: %d. Jobs queued: %d. Running crawls: %d. Busy Elastic sync workers: %d. Elastic sync queued: %d"
|
||||||
|
logFn(logStr,
|
||||||
|
msg, len(SharedCache.Cache.Keys()), config.GetConfig().CacheSize, Workers.BusyWorkers, Workers.Queue.GetQueueSize(), DirectoryCrawler.GetTotalActiveCrawls(), elastic.BusyWorkers, elastic.Queue.GetQueueSize())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -143,7 +143,7 @@ func main() {
|
||||||
elastic.ElasticClient = es
|
elastic.ElasticClient = es
|
||||||
|
|
||||||
if cfg.ElasticsearchSyncEnable && !cliArgs.disableElasticSync {
|
if cfg.ElasticsearchSyncEnable && !cliArgs.disableElasticSync {
|
||||||
go elastic.ElasticsearchThread()
|
go elastic.SyncThread()
|
||||||
log.Info("Started the background Elasticsearch sync thread.")
|
log.Info("Started the background Elasticsearch sync thread.")
|
||||||
} else {
|
} else {
|
||||||
log.Info("The background Elasticsearch sync thread is disabled.")
|
log.Info("The background Elasticsearch sync thread is disabled.")
|
||||||
|
|
|
@ -0,0 +1,49 @@
|
||||||
|
package elastic
|
||||||
|
|
||||||
|
import "sync"
|
||||||
|
|
||||||
|
// More or less like the other queue implementation.
|
||||||
|
|
||||||
|
type DeleteJob struct {
|
||||||
|
Key string
|
||||||
|
}
|
||||||
|
|
||||||
|
type DeleteJobQueue struct {
|
||||||
|
jobs []DeleteJob
|
||||||
|
mutex sync.Mutex
|
||||||
|
cond *sync.Cond
|
||||||
|
}
|
||||||
|
|
||||||
|
func NewJobQueue() *DeleteJobQueue {
|
||||||
|
q := &DeleteJobQueue{}
|
||||||
|
q.cond = sync.NewCond(&q.mutex)
|
||||||
|
return q
|
||||||
|
}
|
||||||
|
|
||||||
|
// AddJob adds a job to the queue and signals the workers so they know to pick it up.
|
||||||
|
func (q *DeleteJobQueue) AddJob(job DeleteJob) {
|
||||||
|
q.mutex.Lock()
|
||||||
|
q.jobs = append(q.jobs, job)
|
||||||
|
q.mutex.Unlock()
|
||||||
|
q.cond.Signal()
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetJob is how a worker pulls a job from the queue.
|
||||||
|
func (q *DeleteJobQueue) GetJob() DeleteJob {
|
||||||
|
q.mutex.Lock()
|
||||||
|
defer q.mutex.Unlock()
|
||||||
|
|
||||||
|
for len(q.jobs) == 0 {
|
||||||
|
q.cond.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
job := q.jobs[0]
|
||||||
|
q.jobs = q.jobs[1:]
|
||||||
|
|
||||||
|
return job
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetQueueSize returns the size of the queue.
|
||||||
|
func (q *DeleteJobQueue) GetQueueSize() int {
|
||||||
|
return len(q.jobs)
|
||||||
|
}
|
|
@ -0,0 +1,35 @@
|
||||||
|
package elastic
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crazyfs/SharedCache"
|
||||||
|
"crazyfs/config"
|
||||||
|
"sync/atomic"
|
||||||
|
)
|
||||||
|
|
||||||
|
// BusyWorkers is an atomic counter for the number of active Workers
|
||||||
|
var BusyWorkers int32
|
||||||
|
|
||||||
|
// InitializeWorkers starts the number of Workers defined by the config.
|
||||||
|
func InitializeWorkers() {
|
||||||
|
Queue = NewJobQueue()
|
||||||
|
for n := 1; n <= config.GetConfig().ElasticsearchSyncThreads; n++ {
|
||||||
|
go worker()
|
||||||
|
}
|
||||||
|
log.Debugf("ELASTIC - Started %d sync workers.", config.GetConfig().ElasticsearchSyncThreads)
|
||||||
|
}
|
||||||
|
|
||||||
|
// worker processes jobs forever.
|
||||||
|
func worker() {
|
||||||
|
for {
|
||||||
|
job := Queue.GetJob()
|
||||||
|
atomic.AddInt32(&BusyWorkers, 1)
|
||||||
|
|
||||||
|
if _, ok := SharedCache.Cache.Get(job.Key); !ok {
|
||||||
|
// If a key does not exist in the LRU cache, delete it from Elasticsearch
|
||||||
|
deleteFromElasticsearch(job.Key)
|
||||||
|
log.Debugf(`ELASTIC - Removed key "%s"`, job.Key)
|
||||||
|
}
|
||||||
|
|
||||||
|
atomic.AddInt32(&BusyWorkers, -1)
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,94 @@
|
||||||
|
package elastic
|
||||||
|
|
||||||
|
import (
|
||||||
|
"crazyfs/DirectoryCrawler"
|
||||||
|
"crazyfs/config"
|
||||||
|
"sync"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
var Queue *DeleteJobQueue
|
||||||
|
|
||||||
|
var syncLock sync.Mutex
|
||||||
|
|
||||||
|
func SyncThread() {
|
||||||
|
Queue = NewJobQueue()
|
||||||
|
InitializeWorkers()
|
||||||
|
|
||||||
|
createCrazyfsIndex()
|
||||||
|
|
||||||
|
// Test connection to Elastic.
|
||||||
|
esSize, err := getElasticSize()
|
||||||
|
if err != nil {
|
||||||
|
logElasticConnError(err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
log.Infof(`ELASTIC - index "%s" contains %d items.`, config.GetConfig().ElasticsearchIndex, esSize)
|
||||||
|
|
||||||
|
// Run a partial sync at startup, unless configured to run a full one.
|
||||||
|
syncElasticsearch(false)
|
||||||
|
|
||||||
|
ticker := time.NewTicker(time.Duration(config.GetConfig().ElasticsearchSyncInterval) * time.Second)
|
||||||
|
fullSyncTicker := time.NewTicker(time.Duration(config.GetConfig().ElasticsearchFullSyncInterval) * time.Second)
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
syncElasticsearch(false)
|
||||||
|
case <-fullSyncTicker.C:
|
||||||
|
syncElasticsearch(true)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: make this use workers instead of starting a million threads
|
||||||
|
// TODO: have the workers exit when the sync job is finished
|
||||||
|
func syncElasticsearch(doFullSync bool) {
|
||||||
|
// Only one sync at a time. Also helps to prevent races with the global variables.
|
||||||
|
syncLock.Lock()
|
||||||
|
|
||||||
|
var syncType string
|
||||||
|
if fullSync {
|
||||||
|
ElasticRefreshSyncRunning = true
|
||||||
|
syncType = "full refresh"
|
||||||
|
} else {
|
||||||
|
ElasticNewSyncRunning = true
|
||||||
|
syncType = "refresh"
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
log.Infof("ELASTIC - started a %s sync.", syncType)
|
||||||
|
start := time.Now()
|
||||||
|
|
||||||
|
startRemoveStaleItemsFromElasticsearch()
|
||||||
|
|
||||||
|
// Set global variables for the workers to read.
|
||||||
|
fullSync = doFullSync
|
||||||
|
var err error
|
||||||
|
existingKeys, err = getPathsFromIndex()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("ELASTIC - Error retrieving keys from Elasticsearch: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
dc := DirectoryCrawler.NewDirectoryCrawler()
|
||||||
|
err = dc.Crawl(config.GetConfig().RootDir, addToElasticsearch)
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("ELASTIC - crawl failed: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
duration := time.Since(start)
|
||||||
|
log.Infof("ELASTIC - %s sync finished in %s", syncType, duration)
|
||||||
|
syncLock.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func logElasticConnError(err error) {
|
||||||
|
log.Errorf("ELASTIC - Failed to read the index: %s", err)
|
||||||
|
LogElasticQuit()
|
||||||
|
}
|
||||||
|
|
||||||
|
func LogElasticQuit() {
|
||||||
|
log.Errorln("ELASTIC - background thread exiting, Elastic indexing and search will not be available.")
|
||||||
|
|
||||||
|
}
|
|
@ -1,62 +0,0 @@
|
||||||
package elastic
|
|
||||||
|
|
||||||
import (
|
|
||||||
"crazyfs/DirectoryCrawler"
|
|
||||||
"crazyfs/config"
|
|
||||||
"time"
|
|
||||||
)
|
|
||||||
|
|
||||||
func ElasticsearchThread() {
|
|
||||||
createCrazyfsIndex()
|
|
||||||
|
|
||||||
// Test connection to Elastic.
|
|
||||||
esSize, err := getElasticSize()
|
|
||||||
if err != nil {
|
|
||||||
logElasticConnError(err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
log.Infof(`ELASTIC - index "%s" contains %d items.`, config.GetConfig().ElasticsearchIndex, esSize)
|
|
||||||
|
|
||||||
// Run a partial sync at startup, unless configured to run a full one.
|
|
||||||
syncElasticsearch()
|
|
||||||
|
|
||||||
ticker := time.NewTicker(time.Duration(config.GetConfig().ElasticsearchSyncInterval) * time.Second)
|
|
||||||
|
|
||||||
for {
|
|
||||||
select {
|
|
||||||
case <-ticker.C:
|
|
||||||
syncElasticsearch()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: make this use workers instead of starting a million threads
|
|
||||||
// TODO: have the workers exit when the sync job is finished
|
|
||||||
func syncElasticsearch() {
|
|
||||||
log.Infof("ELASTIC - started syncing.")
|
|
||||||
start := time.Now()
|
|
||||||
|
|
||||||
dc := DirectoryCrawler.NewDirectoryCrawler()
|
|
||||||
err := dc.Crawl(config.GetConfig().RootDir, addToElasticsearch)
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("ELASTIC - crawl failed: %s", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: use workers for this
|
|
||||||
log.Debugln("ELASTIC - Checking for removed items...")
|
|
||||||
removeStaleItemsFromElasticsearch()
|
|
||||||
|
|
||||||
duration := time.Since(start)
|
|
||||||
log.Infof("ELASTIC - sync finished in %s", duration)
|
|
||||||
}
|
|
||||||
|
|
||||||
func logElasticConnError(err error) {
|
|
||||||
log.Errorf("ELASTIC - Failed to read the index: %s", err)
|
|
||||||
LogElasticQuit()
|
|
||||||
}
|
|
||||||
|
|
||||||
func LogElasticQuit() {
|
|
||||||
log.Errorln("ELASTIC - background thread exiting, Elastic indexing and search will not be available.")
|
|
||||||
|
|
||||||
}
|
|
|
@ -10,8 +10,17 @@ import (
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"github.com/elastic/go-elasticsearch/v8/esapi"
|
"github.com/elastic/go-elasticsearch/v8/esapi"
|
||||||
"os"
|
"os"
|
||||||
|
"slices"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// existingKeys is a global variable called by the Walker callback: addToElasticsearch().
|
||||||
|
// It is set only by syncElasticsearch() when a sync is started. Only one sync can run at a time.
|
||||||
|
// A global is needed since there is no way to pass variables like this to the workers.
|
||||||
|
var existingKeys []string
|
||||||
|
|
||||||
|
// fullSync is another global variable accessed by the workers and set by syncElasticsearch()
|
||||||
|
var fullSync bool
|
||||||
|
|
||||||
func addToElasticsearch(fullPath string, info os.FileInfo, incomingErr error) error {
|
func addToElasticsearch(fullPath string, info os.FileInfo, incomingErr error) error {
|
||||||
key := file.StripRootDir(fullPath)
|
key := file.StripRootDir(fullPath)
|
||||||
cacheItem, found := SharedCache.Cache.Get(key)
|
cacheItem, found := SharedCache.Cache.Get(key)
|
||||||
|
@ -19,9 +28,11 @@ func addToElasticsearch(fullPath string, info os.FileInfo, incomingErr error) er
|
||||||
log.Fatalf(`ELASTICSEARCH - Could not fetch item "%s" from the LRU cache!`, key)
|
log.Fatalf(`ELASTICSEARCH - Could not fetch item "%s" from the LRU cache!`, key)
|
||||||
} else {
|
} else {
|
||||||
if !shouldExclude(key, config.GetConfig().ElasticsearchExcludePatterns) {
|
if !shouldExclude(key, config.GetConfig().ElasticsearchExcludePatterns) {
|
||||||
preformAddToElasticsearch(cacheItem)
|
if fullSync {
|
||||||
} else {
|
preformAddToElasticsearch(cacheItem)
|
||||||
deleteFromElasticsearch(key) // clean up
|
} else if !slices.Contains(existingKeys, key) {
|
||||||
|
preformAddToElasticsearch(cacheItem)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|
|
@ -2,14 +2,12 @@ package elastic
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"crazyfs/SharedCache"
|
|
||||||
"crazyfs/config"
|
"crazyfs/config"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
"github.com/elastic/go-elasticsearch/v8/esapi"
|
"github.com/elastic/go-elasticsearch/v8/esapi"
|
||||||
"sync"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func removeStaleItemsFromElasticsearch() {
|
func startRemoveStaleItemsFromElasticsearch() {
|
||||||
// Retrieve all keys from Elasticsearch
|
// Retrieve all keys from Elasticsearch
|
||||||
keys, err := getPathsFromIndex()
|
keys, err := getPathsFromIndex()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -17,37 +15,12 @@ func removeStaleItemsFromElasticsearch() {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create a buffered channel as a semaphore
|
log.Debugln("ELASTIC - Checking for removed items...")
|
||||||
sem := make(chan struct{}, config.GetConfig().ElasticsearchSyncThreads)
|
|
||||||
|
|
||||||
// Create a wait group to wait for all goroutines to finish
|
// For each key in Elasticsearch, create a job to check (and remove it if the key no longer exists in the cache).
|
||||||
var wg sync.WaitGroup
|
|
||||||
|
|
||||||
// For each key in Elasticsearch, check if it exists in the LRU cache
|
|
||||||
for _, key := range keys {
|
for _, key := range keys {
|
||||||
// Increment the wait group counter
|
go Queue.AddJob(DeleteJob{Key: key})
|
||||||
wg.Add(1)
|
|
||||||
|
|
||||||
// Acquire a semaphore
|
|
||||||
sem <- struct{}{}
|
|
||||||
|
|
||||||
go func(key string) {
|
|
||||||
// Ensure the semaphore is released and the wait group counter is decremented when the goroutine finishes
|
|
||||||
defer func() {
|
|
||||||
<-sem
|
|
||||||
wg.Done()
|
|
||||||
}()
|
|
||||||
|
|
||||||
if _, ok := SharedCache.Cache.Get(key); !ok {
|
|
||||||
// If a key does not exist in the LRU cache, delete it from Elasticsearch
|
|
||||||
deleteFromElasticsearch(key)
|
|
||||||
log.Debugf(`ELASTIC - Removed key "%s"`, key)
|
|
||||||
}
|
|
||||||
}(key)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Wait for all goroutines to finish
|
|
||||||
wg.Wait()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func deleteFromElasticsearch(key string) {
|
func deleteFromElasticsearch(key string) {
|
||||||
|
|
|
@ -22,7 +22,7 @@ func getPathsFromIndex() ([]string, error) {
|
||||||
// This may take a bit if the index is very large, so avoid calling this.
|
// This may take a bit if the index is very large, so avoid calling this.
|
||||||
|
|
||||||
// Print a debug message so the user doesn't think we're frozen.
|
// Print a debug message so the user doesn't think we're frozen.
|
||||||
log.Debugln("Fetching indexed paths from Elasticsearch...")
|
log.Debugln("ELASTIC - Fetching indexed paths from Elasticsearch...")
|
||||||
|
|
||||||
var paths []string
|
var paths []string
|
||||||
var r map[string]interface{}
|
var r map[string]interface{}
|
||||||
|
|
1
todo.txt
1
todo.txt
|
@ -1,4 +1,3 @@
|
||||||
- Some way for the add to elastic callback to skip existing keys if not doing a full search
|
|
||||||
|
|
||||||
Later:
|
Later:
|
||||||
- Add a wildcard option to restricted_download_paths to block all sub-directories
|
- Add a wildcard option to restricted_download_paths to block all sub-directories
|
||||||
|
|
Loading…
Reference in New Issue