make workers global, fix worker setup, clean up

This commit is contained in:
Cyberes 2023-12-11 18:50:30 -07:00
parent 7078712bc3
commit 157f80a463
8 changed files with 96 additions and 107 deletions

View File

@ -33,7 +33,7 @@ func NewItem(fullPath string, info os.FileInfo) *Item {
// Ignore symlinks // Ignore symlinks
return nil return nil
} else { } else {
log.Warnf("NewItem - Path does not exist: %s", fullPath) log.Warnf("NewItem - StartPath does not exist: %s", fullPath)
return nil return nil
} }
} }
@ -65,7 +65,7 @@ func NewItem(fullPath string, info os.FileInfo) *Item {
} }
if os.IsNotExist(err) { if os.IsNotExist(err) {
log.Warnf("Path does not exist: %s", fullPath) log.Warnf("StartPath does not exist: %s", fullPath)
return nil return nil
} else if err != nil { } else if err != nil {
log.Warnf("Error detecting MIME type of file %s - %v", fullPath, err) log.Warnf("Error detecting MIME type of file %s - %v", fullPath, err)

View File

@ -3,7 +3,6 @@ package api
import ( import (
"crazyfs/CacheItem" "crazyfs/CacheItem"
"crazyfs/api/helpers" "crazyfs/api/helpers"
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config" "crazyfs/config"
"crazyfs/elastic" "crazyfs/elastic"
"crypto/sha256" "crypto/sha256"
@ -30,12 +29,11 @@ func AdminCacheInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cac
cacheLen := sharedCache.Len() cacheLen := sharedCache.Len()
response := map[string]interface{}{ response := map[string]interface{}{
"cache_size": cacheLen, "cachedItems": cacheLen,
"cache_max": config.GetConfig().CacheSize, "cacheMax": config.GetConfig().CacheSize,
"crawls_running": DirectoryCrawler.GetTotalActiveCrawls(), "recacheCrawlLimit": config.GetConfig().CacheRecacheCrawlerLimit,
"busy_workers": DirectoryCrawler.BusyWorkers, "newSyncRunning": elastic.ElasticRefreshSyncRunning,
"new_sync_running": elastic.ElasticRefreshSyncRunning, "refreshSyncRunning": elastic.ElasticRefreshSyncRunning,
"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
} }
w.Header().Set("Cache-Control", "no-store") w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json") w.Header().Set("Content-Type", "application/json")

View File

@ -27,8 +27,14 @@ func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Ca
return return
} else { } else {
response := map[string]interface{}{ response := map[string]interface{}{
"active": DirectoryCrawler.GetActiveCrawls(), "crawls": map[string]interface{}{
"finished": DirectoryCrawler.GetFinishedCrawls(), "active": DirectoryCrawler.GetActiveCrawls(),
"finished": DirectoryCrawler.GetFinishedCrawls(),
},
"workers": map[string]interface{}{
"busy": DirectoryCrawler.BusyWorkers,
"max": config.GetConfig().DirectoryCrawlers,
},
} }
w.Header().Set("Cache-Control", "no-store") w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json") w.Header().Set("Content-Type", "application/json")

View File

@ -13,6 +13,9 @@ import (
// WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally // WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally
var WorkerPool chan struct{} var WorkerPool chan struct{}
// Jobs is a global channel that all Walker instances submit jobs to.
// It is created by InitializeWorkers and drained by the worker goroutines
// started there; Walker.addJob is the producer side.
var Jobs chan WalkJob
// BusyWorkers is an atomic counter for the number of active workers // BusyWorkers is an atomic counter for the number of active workers
var BusyWorkers int32 var BusyWorkers int32
@ -20,15 +23,53 @@ var BusyWorkers int32
// to a walker function, does not point to a directory // to a walker function, does not point to a directory
var ErrNotDir = errors.New("not a directory") var ErrNotDir = errors.New("not a directory")
// WalkJob is a job that's passed to the workers.
// A worker handles it by calling Walker.processPath(StartPath) and then
// marking the job as done on that Walker's WaitGroup.
type WalkJob struct {
// StartPath is the path handed to Walker.processPath, which joins it onto the Walker's root.
StartPath string
// Walker is the Walker instance that owns this job.
Walker *Walker
}
// Walker is constructed for each Walk() function invocation // Walker is constructed for each Walk() function invocation
type Walker struct { type Walker struct {
wg sync.WaitGroup wg sync.WaitGroup
jobs chan string
root string root string
followSymlinks bool followSymlinks bool
walkFunc filepath.WalkFunc walkFunc filepath.WalkFunc
} }
// InitializeWorkers creates the package-level WorkerPool semaphore and the
// shared Jobs queue, then launches one worker goroutine for each directory
// crawler configured in config.GetConfig().DirectoryCrawlers.
//
// NOTE(review): the Jobs buffer capacity is taken from CacheSize, so a very
// large cache_size setting asks for an equally large channel buffer here -
// confirm this sizing is intended.
func InitializeWorkers() {
	crawlerCount := config.GetConfig().DirectoryCrawlers

	WorkerPool = make(chan struct{}, crawlerCount)
	Jobs = make(chan WalkJob, config.GetConfig().CacheSize)

	for started := 0; started < crawlerCount; started++ {
		go worker()
	}
	log.Debugf("Started %d directory crawler workers.", crawlerCount)
}
// worker consumes jobs from the package-level Jobs channel and returns only
// once that channel has been closed. Each job is processed while holding a
// WorkerPool slot and while counted in BusyWorkers.
func worker() {
	for job := range Jobs {
		// Take a semaphore slot and mark this worker as busy.
		WorkerPool <- struct{}{}
		atomic.AddInt32(&BusyWorkers, 1)

		// Failures are logged and otherwise ignored so the worker keeps running.
		if err := job.Walker.processPath(job.StartPath); err != nil {
			log.Warnf("worker - %s - %s", job.StartPath, err)
		}

		// Tell the owning Walker this job is finished, then give the slot
		// back and stop counting this worker as busy.
		job.Walker.wg.Done()
		<-WorkerPool
		atomic.AddInt32(&BusyWorkers, -1)
	}
}
// addJob registers one more pending job on the Walker's WaitGroup and hands
// the job to the shared Jobs queue.
//
// The send is non-blocking. Workers call addJob from inside processPath, so a
// blocking send on a full queue could leave every worker waiting to enqueue
// with nobody left to dequeue. When the queue cannot accept the job right now
// (it is full, or Jobs is still nil because InitializeWorkers has not run),
// the job is processed synchronously on the calling goroutine instead.
func (w *Walker) addJob(job WalkJob) {
	w.wg.Add(1)
	select {
	case Jobs <- job:
		// Queued; the worker that picks it up calls wg.Done.
	default:
		// Not queued, so this goroutine must balance the Add above itself.
		defer w.wg.Done()
		if err := job.Walker.processPath(job.StartPath); err != nil {
			log.Warnf("addJob - %s - %s", job.StartPath, err)
		}
	}
}
// the readDirNames function below was taken from the original // the readDirNames function below was taken from the original
// implementation (see https://golang.org/src/path/filepath/path.go) // implementation (see https://golang.org/src/path/filepath/path.go)
// but has sorting removed (sorting doesn't make sense // but has sorting removed (sorting doesn't make sense
@ -57,8 +98,8 @@ func readDirNames(dirname string) ([]string, error) {
// lstat is a wrapper for os.Lstat which accepts a path // lstat is a wrapper for os.Lstat which accepts a path
// relative to Walker.root and also follows symlinks // relative to Walker.root and also follows symlinks
func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) { func (w *Walker) lstat(relPath string) (info os.FileInfo, err error) {
path := filepath.Join(w.root, relpath) path := filepath.Join(w.root, relPath)
info, err = os.Lstat(path) info, err = os.Lstat(path)
if err != nil { if err != nil {
return nil, err return nil, err
@ -81,82 +122,51 @@ func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) {
// processPath processes one directory and adds // processPath processes one directory and adds
// its subdirectories to the queue for further processing // its subdirectories to the queue for further processing
func (w *Walker) processPath(relpath string) error { func (w *Walker) processPath(relPath string) error {
defer w.wg.Done() fullPath := filepath.Join(w.root, relPath)
names, err := readDirNames(fullPath)
path := filepath.Join(w.root, relpath)
names, err := readDirNames(path)
if err != nil { if err != nil {
log.Errorf("Walker - processPath - readDirNames - %s", err) log.Errorf("Walker - processPath - readDirNames - %s", err)
return err return err
} }
for _, name := range names { for _, name := range names {
subpath := filepath.Join(relpath, name) subPath := filepath.Join(relPath, name)
info, err := w.lstat(subpath) info, err := w.lstat(subPath)
if err != nil { if err != nil {
log.Warnf("processPath - %s - %s", relpath, err) log.Warnf("processPath - %s - %s", relPath, err)
continue continue
} }
if info == nil { if info == nil {
log.Warnf("processPath - %s - %s", relpath, err) log.Warnf("processPath - %s - %s", relPath, err)
continue continue
} }
subPathFull := filepath.Join(w.root, subPath)
err = w.walkFunc(filepath.Join(w.root, subpath), info, err) err = w.walkFunc(subPathFull, info, err)
if errors.Is(err, filepath.SkipDir) { if errors.Is(err, filepath.SkipDir) {
return nil return nil
} }
if info.Mode().IsDir() { if info.Mode().IsDir() {
w.addJob(subpath) w.addJob(WalkJob{
StartPath: subPath,
Walker: w,
})
} }
} }
return nil return nil
} }
// addJob increments the job counter
// and pushes the path to the jobs channel
func (w *Walker) addJob(path string) {
w.wg.Add(1)
select {
// try to push the job to the channel
case w.jobs <- path: // ok
default: // buffer overflow
// process job synchronously
err := w.processPath(path)
if err != nil {
log.Warnf("addJob - %s - %s", path, err)
}
}
}
// worker processes all the jobs until the jobs channel is explicitly closed
func (w *Walker) worker() {
for path := range w.jobs {
WorkerPool <- struct{}{} // acquire a worker
atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers
err := w.processPath(path)
if err != nil {
log.Warnf("worker - %s", err)
}
<-WorkerPool // release the worker when done
atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
}
}
// Walk recursively descends into subdirectories, calling walkFn for each file or directory // Walk recursively descends into subdirectories, calling walkFn for each file or directory
// in the tree, including the root directory. // in the tree, including the root directory.
func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error { func (w *Walker) Walk(relPath string, walkFn filepath.WalkFunc) error {
w.jobs = make(chan string, config.GetConfig().DirectoryCrawlers)
w.walkFunc = walkFn w.walkFunc = walkFn
info, err := w.lstat(relpath) fullPath := filepath.Join(w.root, relPath)
err = w.walkFunc(filepath.Join(w.root, relpath), info, err) info, err := w.lstat(relPath)
err = w.walkFunc(fullPath, info, err)
if errors.Is(err, filepath.SkipDir) { if errors.Is(err, filepath.SkipDir) {
return nil return nil
} }
@ -165,21 +175,18 @@ func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
} }
if info == nil { if info == nil {
return fmt.Errorf("broken symlink: %s", relpath) return fmt.Errorf("broken symlink: %s", relPath)
} }
if !info.Mode().IsDir() { if !info.Mode().IsDir() {
return ErrNotDir return ErrNotDir
} }
// Spawn workers w.addJob(WalkJob{
for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ { StartPath: relPath,
go w.worker() Walker: w,
} }) // add this path as a first job
w.wg.Wait() // wait till all paths are processed
w.addJob(relpath) // add this path as a first job
w.wg.Wait() // wait till all paths are processed
close(w.jobs) // signal workers to close
return nil return nil
} }

View File

@ -49,7 +49,7 @@ func (dc *DirectoryCrawler) processPath(fullPath string, info os.FileInfo) error
} }
} }
} else { } else {
// Path is a file // StartPath is a file
dc.AddCacheItem(fullPath, info) dc.AddCacheItem(fullPath, info)
} }
return nil return nil

View File

@ -59,8 +59,7 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) { func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) {
defer ticker.Stop() defer ticker.Stop()
for range ticker.C { for range ticker.C {
activeWorkers := int(DirectoryCrawler.BusyWorkers) logFn("%s - %d/%d items in the cache. Busy workers: %d, running crawls: %d",
runningCrawls := DirectoryCrawler.GetTotalActiveCrawls() msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, DirectoryCrawler.BusyWorkers, DirectoryCrawler.GetTotalActiveCrawls())
logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, runningCrawls)
} }
} }

View File

@ -14,11 +14,9 @@ type Config struct {
HTTPPort string HTTPPort string
CrawlModeCrawlInterval int CrawlModeCrawlInterval int
DirectoryCrawlers int DirectoryCrawlers int
CrawlWorkers int
CacheSize int CacheSize int
CacheTime int CacheTime int
CachePrintNew bool CachePrintNew bool
CachePrintChanges bool
InitialCrawl bool InitialCrawl bool
CacheRecacheCrawlerLimit int CacheRecacheCrawlerLimit int
CrawlerParseMIME bool CrawlerParseMIME bool
@ -31,7 +29,6 @@ type Config struct {
RestrictedDownloadPaths []string RestrictedDownloadPaths []string
ApiSearchMaxResults int ApiSearchMaxResults int
ApiSearchShowChildren bool ApiSearchShowChildren bool
WorkersJobQueueSize int
ElasticsearchEnable bool ElasticsearchEnable bool
ElasticsearchEndpoint string ElasticsearchEndpoint string
ElasticsearchSyncEnable bool ElasticsearchSyncEnable bool
@ -59,8 +56,7 @@ func SetConfig(configFile string) (*Config, error) {
viper.SetDefault("watch_interval", 1) viper.SetDefault("watch_interval", 1)
viper.SetDefault("watch_mode", "crawl") viper.SetDefault("watch_mode", "crawl")
viper.SetDefault("crawl_mode_crawl_interval", 3600) viper.SetDefault("crawl_mode_crawl_interval", 3600)
viper.SetDefault("directory_crawlers", 4) viper.SetDefault("directory_crawlers", 10)
viper.SetDefault("crawl_workers", 10)
viper.SetDefault("cache_size", 100000000) viper.SetDefault("cache_size", 100000000)
viper.SetDefault("cache_time", 30) viper.SetDefault("cache_time", 30)
viper.SetDefault("cache_print_new", false) viper.SetDefault("cache_print_new", false)
@ -110,24 +106,22 @@ func SetConfig(configFile string) (*Config, error) {
rootDir = "/" rootDir = "/"
} }
workersJobQueueSizeValue := viper.GetInt("crawler_worker_job_queue_size") //workersJobQueueSizeValue := viper.GetInt("crawler_worker_job_queue_size")
var workersJobQueueSize int //var workersJobQueueSize int
if workersJobQueueSizeValue == 0 { //if workersJobQueueSizeValue == 0 {
workersJobQueueSize = viper.GetInt("crawl_workers") * 100 // workersJobQueueSize = viper.GetInt("crawl_workers") * 100
} else { //} else {
workersJobQueueSize = workersJobQueueSizeValue // workersJobQueueSize = workersJobQueueSizeValue
} //}
config := &Config{ config := &Config{
RootDir: rootDir, RootDir: rootDir,
HTTPPort: viper.GetString("http_port"), HTTPPort: viper.GetString("http_port"),
CrawlModeCrawlInterval: viper.GetInt("crawl_mode_crawl_interval"), CrawlModeCrawlInterval: viper.GetInt("crawl_mode_crawl_interval"),
DirectoryCrawlers: viper.GetInt("crawl_mode_crawl_interval"), DirectoryCrawlers: viper.GetInt("directory_crawlers"),
CrawlWorkers: viper.GetInt("crawl_workers"),
CacheSize: viper.GetInt("cache_size"), CacheSize: viper.GetInt("cache_size"),
CacheTime: viper.GetInt("cache_time"), CacheTime: viper.GetInt("cache_time"),
CachePrintNew: viper.GetBool("cache_print_new"), CachePrintNew: viper.GetBool("cache_print_new"),
CachePrintChanges: viper.GetBool("cache_print_changes"),
InitialCrawl: viper.GetBool("initial_crawl"), InitialCrawl: viper.GetBool("initial_crawl"),
CacheRecacheCrawlerLimit: viper.GetInt("cache_recache_crawler_limit"), CacheRecacheCrawlerLimit: viper.GetInt("cache_recache_crawler_limit"),
CrawlerParseMIME: viper.GetBool("crawler_parse_mime"), CrawlerParseMIME: viper.GetBool("crawler_parse_mime"),
@ -140,7 +134,6 @@ func SetConfig(configFile string) (*Config, error) {
RestrictedDownloadPaths: restrictedPaths, RestrictedDownloadPaths: restrictedPaths,
ApiSearchMaxResults: viper.GetInt("api_search_max_results"), ApiSearchMaxResults: viper.GetInt("api_search_max_results"),
ApiSearchShowChildren: viper.GetBool("api_search_show_children"), ApiSearchShowChildren: viper.GetBool("api_search_show_children"),
WorkersJobQueueSize: workersJobQueueSize,
ElasticsearchEnable: viper.GetBool("elasticsearch_enable"), ElasticsearchEnable: viper.GetBool("elasticsearch_enable"),
ElasticsearchEndpoint: viper.GetString("elasticsearch_endpoint"), ElasticsearchEndpoint: viper.GetString("elasticsearch_endpoint"),
ElasticsearchSyncEnable: viper.GetBool("elasticsearch_sync_enable"), ElasticsearchSyncEnable: viper.GetBool("elasticsearch_sync_enable"),
@ -165,10 +158,6 @@ func SetConfig(configFile string) (*Config, error) {
return nil, errors.New("crawl_mode_crawl_interval must be more than 1") return nil, errors.New("crawl_mode_crawl_interval must be more than 1")
} }
if config.CrawlWorkers < 1 {
return nil, errors.New("crawl_workers must be more than 1")
}
if config.CacheSize < 1 { if config.CacheSize < 1 {
return nil, errors.New("crawl_workers must be more than 1") return nil, errors.New("crawl_workers must be more than 1")
} }

View File

@ -39,14 +39,6 @@ type cliConfig struct {
// TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch // TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch
func main() { func main() {
//fullPath := "/srv/chub-archive"
//RootDir := "/srv/chub-archive"
//
//fmt.Println(strings.HasPrefix(fullPath, RootDir))
////fmt.Println(fullPath != RootDir)
//
//return
cliArgs := parseArgs() cliArgs := parseArgs()
if cliArgs.help { if cliArgs.help {
flag.Usage() flag.Usage()
@ -101,9 +93,7 @@ func main() {
log.Infof("Elasticsearch enabled: %t", cfg.ElasticsearchEnable) log.Infof("Elasticsearch enabled: %t", cfg.ElasticsearchEnable)
// Init global variables DirectoryCrawler.InitializeWorkers()
//DirectoryCrawler.CrawlWorkerPool = DirectoryCrawler.NewWorkerPool(config.MaxWorkers)
DirectoryCrawler.WorkerPool = make(chan struct{}, cfg.CrawlWorkers)
cache.InitRecacheSemaphore(cfg.CacheRecacheCrawlerLimit) cache.InitRecacheSemaphore(cfg.CacheRecacheCrawlerLimit)
@ -166,7 +156,7 @@ func main() {
func parseArgs() cliConfig { func parseArgs() cliConfig {
var cliArgs cliConfig var cliArgs cliConfig
flag.StringVar(&cliArgs.configFile, "config", "", "Path to the config file") flag.StringVar(&cliArgs.configFile, "config", "", "StartPath to the config file")
flag.BoolVar(&cliArgs.initialCrawl, "initial-crawl", false, "Do an initial crawl to fill the cache") flag.BoolVar(&cliArgs.initialCrawl, "initial-crawl", false, "Do an initial crawl to fill the cache")
flag.BoolVar(&cliArgs.initialCrawl, "i", false, "Do an initial crawl to fill the cache") flag.BoolVar(&cliArgs.initialCrawl, "i", false, "Do an initial crawl to fill the cache")
flag.BoolVar(&cliArgs.debug, "d", false, "Enable debug mode") flag.BoolVar(&cliArgs.debug, "d", false, "Enable debug mode")