fix elastic workers, fix misc bugs
This commit is contained in:
parent
c0faaab3f8
commit
74206cd6e5
|
@ -42,7 +42,7 @@ func APIAdminCrawlsInfo(w http.ResponseWriter, r *http.Request) {
|
||||||
elasticWorkers = map[string]interface{}{
|
elasticWorkers = map[string]interface{}{
|
||||||
"busy": globals.ElasticCrawlers.BusyWorkers,
|
"busy": globals.ElasticCrawlers.BusyWorkers,
|
||||||
"alive": config.GetConfig().ElasticsearchSyncThreads,
|
"alive": config.GetConfig().ElasticsearchSyncThreads,
|
||||||
"queueSize": globals.ElasticCrawlers.Queue.GetQueueSize(),
|
"queueSize": globals.ElasticCrawlers.Queue.GetQueuedJobs(),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
elasticWorkers = map[string]interface{}{
|
elasticWorkers = map[string]interface{}{
|
||||||
|
@ -62,7 +62,7 @@ func APIAdminCrawlsInfo(w http.ResponseWriter, r *http.Request) {
|
||||||
"alive": config.GetConfig().DirectoryCrawlers,
|
"alive": config.GetConfig().DirectoryCrawlers,
|
||||||
},
|
},
|
||||||
"queue": map[string]interface{}{
|
"queue": map[string]interface{}{
|
||||||
"items": globals.DirectoryCrawlers.Queue.GetQueueSize(),
|
"items": globals.DirectoryCrawlers.Queue.GetQueuedJobs(),
|
||||||
},
|
},
|
||||||
"initialCrawlElapsed": config.InitialCrawlElapsed,
|
"initialCrawlElapsed": config.InitialCrawlElapsed,
|
||||||
"elastic": map[string]interface{}{
|
"elastic": map[string]interface{}{
|
||||||
|
|
|
@ -56,7 +56,7 @@ func logCacheStatus(msg string, ticker *time.Ticker, logFn func(format string, a
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
logStr := "%s - %d/%d items in the cache. Busy workers: %d. Jobs queued: %d. Running crawls: %d."
|
logStr := "%s - %d/%d items in the cache. Busy workers: %d. Jobs queued: %d. Running crawls: %d."
|
||||||
logFn(logStr,
|
logFn(logStr,
|
||||||
msg, len(sharedcache.Cache.Keys()), config.GetConfig().CacheSize, atomic.LoadInt32(&globals.DirectoryCrawlers.BusyWorkers), globals.DirectoryCrawlers.Queue.GetQueueSize(), directorycrawler.GetTotalActiveCrawls())
|
msg, len(sharedcache.Cache.Keys()), config.GetConfig().CacheSize, atomic.LoadInt32(&globals.DirectoryCrawlers.BusyWorkers), globals.DirectoryCrawlers.Queue.GetQueuedJobs(), directorycrawler.GetTotalActiveCrawls())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,6 +41,7 @@ type Config struct {
|
||||||
ElasticsearchExcludePatterns []string
|
ElasticsearchExcludePatterns []string
|
||||||
ElasticsearchFullSyncOnStart bool
|
ElasticsearchFullSyncOnStart bool
|
||||||
ElasticsearchDefaultQueryField string
|
ElasticsearchDefaultQueryField string
|
||||||
|
ElasticPrintChanges bool
|
||||||
HTTPRealIPHeader string
|
HTTPRealIPHeader string
|
||||||
HTTPNoMimeSniffHeader bool
|
HTTPNoMimeSniffHeader bool
|
||||||
HTTPAccessControlAllowOriginHeader string
|
HTTPAccessControlAllowOriginHeader string
|
||||||
|
@ -88,6 +89,7 @@ func SetConfig(configFile string) (*Config, error) {
|
||||||
viper.SetDefault("elasticsearch_full_sync_on_start", false)
|
viper.SetDefault("elasticsearch_full_sync_on_start", false)
|
||||||
viper.SetDefault("elasticsearch_query_fields", []string{"extension", "name", "path", "type", "size", "isDir"})
|
viper.SetDefault("elasticsearch_query_fields", []string{"extension", "name", "path", "type", "size", "isDir"})
|
||||||
viper.SetDefault("elasticsearch_default_query_field", "name")
|
viper.SetDefault("elasticsearch_default_query_field", "name")
|
||||||
|
viper.SetDefault("elasticsearch_print_changes", false)
|
||||||
viper.SetDefault("http_real_ip_header", "X-Forwarded-For")
|
viper.SetDefault("http_real_ip_header", "X-Forwarded-For")
|
||||||
viper.SetDefault("http_no_mime_sniff_header", false)
|
viper.SetDefault("http_no_mime_sniff_header", false)
|
||||||
viper.SetDefault("http_access_control_allow_origin_header", "*")
|
viper.SetDefault("http_access_control_allow_origin_header", "*")
|
||||||
|
@ -141,6 +143,7 @@ func SetConfig(configFile string) (*Config, error) {
|
||||||
ElasticsearchExcludePatterns: viper.GetStringSlice("elasticsearch_exclude_patterns"),
|
ElasticsearchExcludePatterns: viper.GetStringSlice("elasticsearch_exclude_patterns"),
|
||||||
ElasticsearchFullSyncOnStart: viper.GetBool("elasticsearch_full_sync_on_start"),
|
ElasticsearchFullSyncOnStart: viper.GetBool("elasticsearch_full_sync_on_start"),
|
||||||
ElasticsearchDefaultQueryField: viper.GetString("elasticsearch_default_query_field"),
|
ElasticsearchDefaultQueryField: viper.GetString("elasticsearch_default_query_field"),
|
||||||
|
ElasticPrintChanges: viper.GetBool("elasticsearch_print_changes"),
|
||||||
HTTPRealIPHeader: viper.GetString("http_real_ip_header"),
|
HTTPRealIPHeader: viper.GetString("http_real_ip_header"),
|
||||||
HTTPNoMimeSniffHeader: viper.GetBool("http_no_mime_sniff_header"),
|
HTTPNoMimeSniffHeader: viper.GetBool("http_no_mime_sniff_header"),
|
||||||
HTTPAccessControlAllowOriginHeader: viper.GetString("http_access_control_allow_origin_header"),
|
HTTPAccessControlAllowOriginHeader: viper.GetString("http_access_control_allow_origin_header"),
|
||||||
|
|
|
@ -116,11 +116,16 @@ func main() {
|
||||||
// Start the Elastic connection, so it can initialize while we're doing the initial crawl.
|
// Start the Elastic connection, so it can initialize while we're doing the initial crawl.
|
||||||
// If we fail to establish a connection to Elastic, don't kill the entire server. Instead, just disable Elastic.
|
// If we fail to establish a connection to Elastic, don't kill the entire server. Instead, just disable Elastic.
|
||||||
if cfg.ElasticsearchEnable && !cliArgs.disableElasticSync {
|
if cfg.ElasticsearchEnable && !cliArgs.disableElasticSync {
|
||||||
|
fmt.Println(config.GetConfig().ElasticsearchSyncThreads + 1)
|
||||||
esCfg := elasticsearch.Config{
|
esCfg := elasticsearch.Config{
|
||||||
Addresses: []string{
|
Addresses: []string{
|
||||||
cfg.ElasticsearchEndpoint,
|
cfg.ElasticsearchEndpoint,
|
||||||
},
|
},
|
||||||
APIKey: cfg.ElasticsearchAPIKey,
|
APIKey: cfg.ElasticsearchAPIKey,
|
||||||
|
//Transport: &http.Transport{
|
||||||
|
// MaxIdleConnsPerHost: config.GetConfig().ElasticsearchSyncThreads + 1,
|
||||||
|
// IdleConnTimeout: 30 * time.Second,
|
||||||
|
//},
|
||||||
}
|
}
|
||||||
es, err := elasticsearch.NewClient(esCfg)
|
es, err := elasticsearch.NewClient(esCfg)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -11,7 +11,7 @@ func InitializeDirectoryCrawlerWorkers() *globals.DcWorkers {
|
||||||
if globals.DirectoryCrawlers != nil {
|
if globals.DirectoryCrawlers != nil {
|
||||||
panic("DirectoryCrawlers has already been defined!")
|
panic("DirectoryCrawlers has already been defined!")
|
||||||
}
|
}
|
||||||
dcWorkers := workers.InitializeWorkers(directoryCrawlerWorker) // *workers.CrawlWorkers
|
dcWorkers := workers.InitializeWorkers(config.GetConfig().DirectoryCrawlers, directoryCrawlerWorker)
|
||||||
d := &globals.DcWorkers{}
|
d := &globals.DcWorkers{}
|
||||||
|
|
||||||
// Copy the fields given to us by InitializeWorkers() to the global object.
|
// Copy the fields given to us by InitializeWorkers() to the global object.
|
||||||
|
|
|
@ -5,7 +5,6 @@ import (
|
||||||
"crazyfs/globals"
|
"crazyfs/globals"
|
||||||
"crazyfs/sharedcache"
|
"crazyfs/sharedcache"
|
||||||
"crazyfs/workers"
|
"crazyfs/workers"
|
||||||
"fmt"
|
|
||||||
"sync"
|
"sync"
|
||||||
"sync/atomic"
|
"sync/atomic"
|
||||||
)
|
)
|
||||||
|
@ -13,14 +12,27 @@ import (
|
||||||
// aliveWorkers is used by syncElasticsearch to know when it is safe to erase the queue.
|
// aliveWorkers is used by syncElasticsearch to know when it is safe to erase the queue.
|
||||||
var aliveWorkers sync.WaitGroup
|
var aliveWorkers sync.WaitGroup
|
||||||
|
|
||||||
|
type CrawlerExtras struct {
|
||||||
|
Added int32
|
||||||
|
}
|
||||||
|
|
||||||
|
type JobExtras struct {
|
||||||
|
Task string
|
||||||
|
Key string
|
||||||
|
}
|
||||||
|
|
||||||
func InitializeElasticCrawlerWorkers() *globals.DcWorkers {
|
func InitializeElasticCrawlerWorkers() *globals.DcWorkers {
|
||||||
if globals.ElasticCrawlers != nil {
|
if globals.ElasticCrawlers != nil {
|
||||||
panic("ElasticCrawlers has already been defined!")
|
panic("ElasticCrawlers has already been defined!")
|
||||||
}
|
}
|
||||||
deleteWorkers := workers.InitializeWorkers(elasticDeleteWorker)
|
elWorkers := workers.InitializeWorkers(config.GetConfig().ElasticsearchSyncThreads, elasticDeleteWorker)
|
||||||
d := &globals.DcWorkers{}
|
d := &globals.DcWorkers{}
|
||||||
d.Queue = deleteWorkers.Queue
|
d.Queue = elWorkers.Queue
|
||||||
deleteWorkers.BusyWorkers = &d.BusyWorkers
|
extra := make(map[string]interface{})
|
||||||
|
var completed int32
|
||||||
|
extra["completed"] = completed
|
||||||
|
d.Extra = &CrawlerExtras{}
|
||||||
|
elWorkers.BusyWorkers = &d.BusyWorkers
|
||||||
globals.ElasticCrawlers = d
|
globals.ElasticCrawlers = d
|
||||||
log.Debugf("CRAWLERS - Started %d Elasticsearch sync workers.", config.GetConfig().ElasticsearchSyncThreads)
|
log.Debugf("CRAWLERS - Started %d Elasticsearch sync workers.", config.GetConfig().ElasticsearchSyncThreads)
|
||||||
return d
|
return d
|
||||||
|
@ -31,9 +43,7 @@ func elasticDeleteWorker(w *workers.CrawlWorkers) {
|
||||||
defer aliveWorkers.Done()
|
defer aliveWorkers.Done()
|
||||||
for {
|
for {
|
||||||
job := w.Queue.GetJob()
|
job := w.Queue.GetJob()
|
||||||
|
|
||||||
if job.Terminate {
|
if job.Terminate {
|
||||||
fmt.Println("delete worker stopping")
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,22 +56,27 @@ func elasticDeleteWorker(w *workers.CrawlWorkers) {
|
||||||
log.Warnf("ELCrawlWorker:Add - %s - %s", job.StartPath, err)
|
log.Warnf("ELCrawlWorker:Add - %s - %s", job.StartPath, err)
|
||||||
}
|
}
|
||||||
job.Walker.Wg.Done()
|
job.Walker.Wg.Done()
|
||||||
|
|
||||||
|
// Only increment the Added counter on the add path; delete jobs are not counted.
|
||||||
|
e := globals.ElasticCrawlers.Extra.(*CrawlerExtras)
|
||||||
|
atomic.AddInt32(&e.Added, 1)
|
||||||
} else {
|
} else {
|
||||||
e := *job.Extra
|
e := job.Extra.(JobExtras)
|
||||||
task := e["task"]
|
if e.Task == TASKDELETE {
|
||||||
if task == TASKDELETE {
|
|
||||||
if _, ok := sharedcache.Cache.Get(job.StartPath); !ok {
|
if _, ok := sharedcache.Cache.Get(job.StartPath); !ok {
|
||||||
// If a key in Elastic does not exist in the LRU cache, delete it from Elastic.
|
// If a key in Elastic does not exist in the LRU cache, delete it from Elastic.
|
||||||
key := e["key"].(string)
|
key := e.Key
|
||||||
err := DeleteFromElasticsearch(key)
|
err := DeleteFromElasticsearch(key)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Errorf(`ELCrawlWorker:Delete - Error deleting key "%s" - %s`, key, err)
|
log.Errorf(`ELCrawlWorker:Delete - Error deleting key "%s" - %s`, key, err)
|
||||||
} else {
|
} else {
|
||||||
log.Debugf(`ELCrawlWorker:Delete - Deleted path: "%s"`, job.StartPath)
|
if config.GetConfig().ElasticPrintChanges {
|
||||||
|
log.Debugf(`ELCrawlWorker:Delete - Deleted path: "%s"`, job.StartPath)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
panic(task)
|
panic(e.Task)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
atomic.AddInt32(w.BusyWorkers, -1)
|
atomic.AddInt32(w.BusyWorkers, -1)
|
||||||
|
|
|
@ -6,6 +6,7 @@ import (
|
||||||
"crazyfs/directorycrawler"
|
"crazyfs/directorycrawler"
|
||||||
"crazyfs/globals"
|
"crazyfs/globals"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -73,9 +74,22 @@ func syncElasticsearch(doFullSync bool) {
|
||||||
|
|
||||||
InitializeElasticCrawlerWorkers()
|
InitializeElasticCrawlerWorkers()
|
||||||
|
|
||||||
|
// Refresh the global variables for the workers.
|
||||||
|
var err error
|
||||||
|
globalPathsByKeyMutex.Lock()
|
||||||
|
globalKeysByPathMutex.Lock()
|
||||||
|
globalKeysByPath, globalPathsByKey, err = getPathsFromIndex(true, 100)
|
||||||
|
globalPathsByKeyMutex.Unlock()
|
||||||
|
globalKeysByPathMutex.Unlock()
|
||||||
|
if err != nil {
|
||||||
|
log.Errorf("ELASTIC - Error retrieving keys from Elasticsearch: %s", err)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
frozenIndexSize := len(globalKeysByPath)
|
||||||
|
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
ctx, cancel := context.WithCancel(context.Background())
|
||||||
defer cancel()
|
defer cancel()
|
||||||
ticker := time.NewTicker(60 * time.Second)
|
ticker := time.NewTicker(1 * time.Second)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
|
@ -85,33 +99,26 @@ func syncElasticsearch(doFullSync bool) {
|
||||||
return
|
return
|
||||||
case <-ticker.C:
|
case <-ticker.C:
|
||||||
elapsed := time.Since(start)
|
elapsed := time.Since(start)
|
||||||
logStr := "ELASTIC - Sync in progress. Elapsed: %s. Busy workers: %d. Jobs queued: %d"
|
e := globals.ElasticCrawlers.Extra.(*CrawlerExtras)
|
||||||
log.Debugf(logStr, elapsed, globals.ElasticCrawlers.BusyWorkers, globals.ElasticCrawlers.Queue.GetQueueSize())
|
logStr := "ELASTIC - Sync in progress. Completed: %d/%d, Elapsed: %s. Busy workers: %d. Jobs queued: %d"
|
||||||
|
log.Debugf(logStr, atomic.LoadInt32(&e.Added), frozenIndexSize, elapsed, atomic.LoadInt32(&globals.ElasticCrawlers.BusyWorkers), globals.ElasticCrawlers.Queue.GetQueuedJobs())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// Refresh the global variables for the workers.
|
|
||||||
var err error
|
|
||||||
globalPathsByKeyMutex.Lock()
|
|
||||||
globalKeysByPathMutex.Lock()
|
|
||||||
globalKeysByPath, globalPathsByKey, err = getPathsFromIndex()
|
|
||||||
globalPathsByKeyMutex.Unlock()
|
|
||||||
globalKeysByPathMutex.Unlock()
|
|
||||||
if err != nil {
|
|
||||||
log.Errorf("ELASTIC - Error retrieving keys from Elasticsearch: %s", err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
startRemoveStaleItemsFromElasticsearch()
|
startRemoveStaleItemsFromElasticsearch()
|
||||||
|
|
||||||
dc := directorycrawler.NewDirectoryCrawler(globals.ElasticCrawlers.Queue)
|
dc := directorycrawler.NewDirectoryCrawler(globals.ElasticCrawlers.Queue)
|
||||||
err = dc.Crawl(config.GetConfig().RootDir, addToElasticsearch)
|
err = dc.Crawl(config.GetConfig().RootDir, addToElasticsearch)
|
||||||
|
var crawlFailed bool
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
crawlFailed = true
|
||||||
log.Errorf("ELASTIC - Crawl failed: %s", err)
|
log.Errorf("ELASTIC - Crawl failed: %s", err)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
e := globals.ElasticCrawlers.Extra.(*CrawlerExtras)
|
||||||
|
addedItems := atomic.LoadInt32(&e.Added)
|
||||||
|
|
||||||
// Shut down the elastic sync workers once we've finished.
|
// Shut down the elastic sync workers once we've finished.
|
||||||
globals.ElasticCrawlers.Queue.Terminate()
|
globals.ElasticCrawlers.Queue.Terminate()
|
||||||
aliveWorkers.Wait()
|
aliveWorkers.Wait()
|
||||||
|
@ -123,8 +130,11 @@ func syncElasticsearch(doFullSync bool) {
|
||||||
globalPathsByKeyMutex.Unlock()
|
globalPathsByKeyMutex.Unlock()
|
||||||
globalKeysByPathMutex.Unlock()
|
globalKeysByPathMutex.Unlock()
|
||||||
|
|
||||||
duration := time.Since(start)
|
if !crawlFailed {
|
||||||
log.Infof("ELASTIC - %s sync finished in %s", syncType, duration)
|
duration := time.Since(start)
|
||||||
|
|
||||||
|
log.Infof("ELASTIC - %s sync finished in %s and added %d items.", syncType, duration, addedItems)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func logElasticConnError(err error) {
|
func logElasticConnError(err error) {
|
||||||
|
@ -133,14 +143,15 @@ func logElasticConnError(err error) {
|
||||||
|
|
||||||
// EnableElasticsearchConnection tests the connection to Elastic and enables the backend if it's successful.
|
// EnableElasticsearchConnection tests the connection to Elastic and enables the backend if it's successful.
|
||||||
func EnableElasticsearchConnection() {
|
func EnableElasticsearchConnection() {
|
||||||
esSize, err := getElasticSize()
|
_, _, err := getPathsFromIndex(false, 10) // query a very small sample
|
||||||
if err != nil || esSize == -1 {
|
if err != nil {
|
||||||
logElasticConnError(err)
|
logElasticConnError(err)
|
||||||
Enabled = false
|
Enabled = false
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
Enabled = true
|
Enabled = true
|
||||||
log.Infof(`ELASTIC - Connected to index "%s". Contains %d items.`, config.GetConfig().ElasticsearchIndex, esSize)
|
//http.DefaultTransport.(*http.Transport).MaxIdleConnsPerHost = config.GetConfig().ElasticsearchSyncThreads
|
||||||
|
log.Infof(`ELASTIC - Connected to index "%s".`, config.GetConfig().ElasticsearchIndex)
|
||||||
}
|
}
|
||||||
|
|
||||||
func LogElasticQuit() {
|
func LogElasticQuit() {
|
||||||
|
|
|
@ -11,6 +11,7 @@ import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/elastic/go-elasticsearch/v8/esapi"
|
"github.com/elastic/go-elasticsearch/v8/esapi"
|
||||||
|
"io"
|
||||||
"os"
|
"os"
|
||||||
"sync"
|
"sync"
|
||||||
)
|
)
|
||||||
|
@ -72,22 +73,30 @@ func performAddToElasticsearch(item *cacheitem.Item) error {
|
||||||
DocumentID: encodeToBase64(item.Path),
|
DocumentID: encodeToBase64(item.Path),
|
||||||
Body: bytes.NewReader(data),
|
Body: bytes.NewReader(data),
|
||||||
Refresh: "true",
|
Refresh: "true",
|
||||||
|
Timeout: 100,
|
||||||
}
|
}
|
||||||
res, err := req.Do(context.Background(), ElasticClient)
|
res, err := req.Do(context.Background(), ElasticClient)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
body, err := io.ReadAll(res.Body)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
res.Body.Close()
|
||||||
|
reader := bytes.NewReader(body)
|
||||||
|
|
||||||
if res.IsError() {
|
if res.IsError() {
|
||||||
var e map[string]interface{}
|
var e map[string]interface{}
|
||||||
if err := json.NewDecoder(res.Body).Decode(&e); err != nil {
|
if err := json.NewDecoder(reader).Decode(&e); err != nil {
|
||||||
return errors.New(fmt.Sprintf("Error parsing the response body: %s", err))
|
return errors.New(fmt.Sprintf("Error parsing the response body: %s", err))
|
||||||
}
|
}
|
||||||
return errors.New(fmt.Sprintf(`Error indexing document "%s" - Status code: %d - Response: %s`, item.Path, res.StatusCode, e))
|
return errors.New(fmt.Sprintf(`Error indexing document "%s" - Status code: %d - Response: %s`, item.Path, res.StatusCode, e))
|
||||||
}
|
}
|
||||||
|
|
||||||
log.Debugf(`ELASTIC:Add - Added: "%s"`, preparedItem.Path)
|
if config.GetConfig().ElasticPrintChanges {
|
||||||
|
log.Debugf(`ELASTIC:Add - Added: "%s"`, preparedItem.Path)
|
||||||
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package elastic
|
package elastic
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"context"
|
"context"
|
||||||
"crazyfs/config"
|
"crazyfs/config"
|
||||||
"crazyfs/globals"
|
"crazyfs/globals"
|
||||||
|
@ -9,6 +10,7 @@ import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"github.com/elastic/go-elasticsearch/v8/esapi"
|
"github.com/elastic/go-elasticsearch/v8/esapi"
|
||||||
|
"io"
|
||||||
)
|
)
|
||||||
|
|
||||||
type DeleteJob struct {
|
type DeleteJob struct {
|
||||||
|
@ -29,12 +31,11 @@ func startRemoveStaleItemsFromElasticsearch() {
|
||||||
for path, key := range globalPathsByKey {
|
for path, key := range globalPathsByKey {
|
||||||
job := queuedwalk.Job{
|
job := queuedwalk.Job{
|
||||||
StartPath: path,
|
StartPath: path,
|
||||||
|
Extra: JobExtras{
|
||||||
|
Task: TASKDELETE,
|
||||||
|
Key: key,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
extra := make(map[string]interface{})
|
|
||||||
extra["task"] = TASKDELETE
|
|
||||||
extra["key"] = key
|
|
||||||
job.Extra = &extra
|
|
||||||
|
|
||||||
globals.ElasticCrawlers.Queue.AddJob(job)
|
globals.ElasticCrawlers.Queue.AddJob(job)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -49,12 +50,17 @@ func DeleteFromElasticsearch(key string) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer res.Body.Close()
|
body, err := io.ReadAll(res.Body)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
res.Body.Close()
|
||||||
|
reader := bytes.NewReader(body)
|
||||||
|
|
||||||
// If we tried to delete a key that doesn't exist in Elastic, it will return an error.
|
// If we tried to delete a key that doesn't exist in Elastic, it will return an error.
|
||||||
if res.IsError() && res.StatusCode != 404 {
|
if res.IsError() {
|
||||||
var e map[string]interface{}
|
var e map[string]interface{}
|
||||||
if err := json.NewDecoder(res.Body).Decode(&e); err != nil {
|
if err := json.NewDecoder(reader).Decode(&e); err != nil {
|
||||||
text := fmt.Sprintf("failed to parse the response body: %s", err)
|
text := fmt.Sprintf("failed to parse the response body: %s", err)
|
||||||
return errors.New(text)
|
return errors.New(text)
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,14 +11,14 @@ import (
|
||||||
)
|
)
|
||||||
|
|
||||||
func getElasticSize() (int, error) {
|
func getElasticSize() (int, error) {
|
||||||
keysByPath, _, err := getPathsFromIndex()
|
keysByPath, _, err := getPathsFromIndex(true, 100)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return -1, err
|
return -1, err
|
||||||
}
|
}
|
||||||
return len(keysByPath), nil
|
return len(keysByPath), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func getPathsFromIndex() (map[string]string, map[string]string, error) {
|
func getPathsFromIndex(doScroll bool, withSize int) (map[string]string, map[string]string, error) {
|
||||||
// This may take a bit if the index is very large, so avoid calling this.
|
// This may take a bit if the index is very large, so avoid calling this.
|
||||||
|
|
||||||
// Print a debug message so the user doesn't think we're frozen.
|
// Print a debug message so the user doesn't think we're frozen.
|
||||||
|
@ -32,7 +32,8 @@ func getPathsFromIndex() (map[string]string, map[string]string, error) {
|
||||||
ElasticClient.Search.WithContext(context.Background()),
|
ElasticClient.Search.WithContext(context.Background()),
|
||||||
ElasticClient.Search.WithIndex(config.GetConfig().ElasticsearchIndex),
|
ElasticClient.Search.WithIndex(config.GetConfig().ElasticsearchIndex),
|
||||||
ElasticClient.Search.WithScroll(time.Minute),
|
ElasticClient.Search.WithScroll(time.Minute),
|
||||||
ElasticClient.Search.WithSize(1000),
|
ElasticClient.Search.WithSize(withSize),
|
||||||
|
ElasticClient.Search.WithSourceIncludes("path"), // Only return the 'path' field
|
||||||
)
|
)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
msg := fmt.Sprintf("Error getting response: %s", err)
|
msg := fmt.Sprintf("Error getting response: %s", err)
|
||||||
|
@ -70,6 +71,10 @@ func getPathsFromIndex() (map[string]string, map[string]string, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if !doScroll {
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
// Next scroll
|
// Next scroll
|
||||||
res, err = ElasticClient.Scroll(ElasticClient.Scroll.WithScrollID(scrollID), ElasticClient.Scroll.WithScroll(time.Minute))
|
res, err = ElasticClient.Scroll(ElasticClient.Scroll.WithScrollID(scrollID), ElasticClient.Scroll.WithScroll(time.Minute))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -10,4 +10,5 @@ var DirectoryCrawlers *DcWorkers
|
||||||
type DcWorkers struct {
|
type DcWorkers struct {
|
||||||
Queue *queuedwalk.JobQueue
|
Queue *queuedwalk.JobQueue
|
||||||
BusyWorkers int32
|
BusyWorkers int32
|
||||||
|
Extra interface{} // Used to store additional info.
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ import (
|
||||||
type Job struct {
|
type Job struct {
|
||||||
StartPath string
|
StartPath string
|
||||||
Walker *Walker // A pointer to the shared Walker object is passed as well.
|
Walker *Walker // A pointer to the shared Walker object is passed as well.
|
||||||
Extra *map[string]interface{}
|
Extra interface{}
|
||||||
Terminate bool
|
Terminate bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -48,6 +48,11 @@ func (q *JobQueue) AddJob(job Job) bool {
|
||||||
|
|
||||||
// GetJob is how a worker pulls a job from the queue.
|
// GetJob is how a worker pulls a job from the queue.
|
||||||
func (q *JobQueue) GetJob() Job {
|
func (q *JobQueue) GetJob() Job {
|
||||||
|
q.mutex.Lock()
|
||||||
|
defer q.mutex.Unlock()
|
||||||
|
for q.GetQueuedJobs() == 0 && !q.terminate {
|
||||||
|
q.cond.Wait()
|
||||||
|
}
|
||||||
if q.terminate {
|
if q.terminate {
|
||||||
// Return an empty job that tells the worker to quit.
|
// Return an empty job that tells the worker to quit.
|
||||||
return Job{
|
return Job{
|
||||||
|
@ -55,12 +60,6 @@ func (q *JobQueue) GetJob() Job {
|
||||||
Terminate: true,
|
Terminate: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
q.mutex.Lock()
|
|
||||||
defer q.mutex.Unlock()
|
|
||||||
for q.GetQueueSize() == 0 {
|
|
||||||
q.cond.Wait()
|
|
||||||
}
|
|
||||||
job, err := q.fifo.DequeueOrWaitForNextElement()
|
job, err := q.fifo.DequeueOrWaitForNextElement()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
|
@ -68,8 +67,8 @@ func (q *JobQueue) GetJob() Job {
|
||||||
return job.(Job)
|
return job.(Job)
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetQueueSize returns the size of the queue.
|
// GetQueuedJobs returns the number of jobs currently held in the queue.
|
||||||
func (q *JobQueue) GetQueueSize() int {
|
func (q *JobQueue) GetQueuedJobs() int {
|
||||||
return q.fifo.GetLen()
|
return q.fifo.GetLen()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -78,4 +77,5 @@ func (q *JobQueue) GetQueueSize() int {
|
||||||
func (q *JobQueue) Terminate() {
|
func (q *JobQueue) Terminate() {
|
||||||
q.terminate = true
|
q.terminate = true
|
||||||
q.fifo.Lock()
|
q.fifo.Lock()
|
||||||
|
q.cond.Broadcast()
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,6 @@
|
||||||
package workers
|
package workers
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"crazyfs/config"
|
|
||||||
"crazyfs/queuedwalk"
|
"crazyfs/queuedwalk"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -16,12 +15,12 @@ type CrawlWorkers struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
// InitializeWorkers starts the number of workers defined by the config.
|
// InitializeWorkers creates a new job queue and starts workerCount goroutines, each running workerFunc against it.
|
||||||
func InitializeWorkers(workerFunc CrawlWorkerFunc) *CrawlWorkers {
|
func InitializeWorkers(workerCount int, workerFunc CrawlWorkerFunc) *CrawlWorkers {
|
||||||
w := &CrawlWorkers{
|
w := &CrawlWorkers{
|
||||||
WorkerFunc: workerFunc,
|
WorkerFunc: workerFunc,
|
||||||
}
|
}
|
||||||
w.Queue = queuedwalk.NewJobQueue()
|
w.Queue = queuedwalk.NewJobQueue()
|
||||||
for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
|
for n := 1; n <= workerCount; n++ {
|
||||||
go w.WorkerFunc(w)
|
go w.WorkerFunc(w)
|
||||||
}
|
}
|
||||||
return w
|
return w
|
||||||
|
|
Loading…
Reference in New Issue