make workers global, fix worker setup, clean up

2023-12-11 18:50:30 -07:00 · 2023-12-11 18:50:30 -07:00 · 157f80a463
parent 7078712bc3
commit 157f80a463
8 changed files with 96 additions and 107 deletions
--- a/src/CacheItem/Item.go
+++ b/src/CacheItem/Item.go
@ -33,7 +33,7 @@ func NewItem(fullPath string, info os.FileInfo) *Item {
 			// Ignore symlinks
 			return nil
 		} else {
-			log.Warnf("NewItem - Path does not exist: %s", fullPath)
+			log.Warnf("NewItem - StartPath does not exist: %s", fullPath)
 			return nil
 		}
 	}
@ -65,7 +65,7 @@ func NewItem(fullPath string, info os.FileInfo) *Item {
 		}

 		if os.IsNotExist(err) {
-			log.Warnf("Path does not exist: %s", fullPath)
+			log.Warnf("StartPath does not exist: %s", fullPath)
 			return nil
 		} else if err != nil {
 			log.Warnf("Error detecting MIME type of file %s - %v", fullPath, err)
--- a/src/api/AdminCacheInfo.go
+++ b/src/api/AdminCacheInfo.go
@ -3,7 +3,6 @@ package api
 import (
 	"crazyfs/CacheItem"
 	"crazyfs/api/helpers"
-	"crazyfs/cache/DirectoryCrawler"
 	"crazyfs/config"
 	"crazyfs/elastic"
 	"crypto/sha256"
@ -30,12 +29,11 @@ func AdminCacheInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cac
 			cacheLen := sharedCache.Len()

 			response := map[string]interface{}{
-				"cache_size":           cacheLen,
-				"cache_max":            config.GetConfig().CacheSize,
-				"crawls_running":       DirectoryCrawler.GetTotalActiveCrawls(),
-				"busy_workers":         DirectoryCrawler.BusyWorkers,
-				"new_sync_running":     elastic.ElasticRefreshSyncRunning,
-				"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
+				"cachedItems":        cacheLen,
+				"cacheMax":           config.GetConfig().CacheSize,
+				"recacheCrawlLimit":  config.GetConfig().CacheRecacheCrawlerLimit,
+				"newSyncRunning":     elastic.ElasticRefreshSyncRunning,
+				"refreshSyncRunning": elastic.ElasticRefreshSyncRunning,
 			}
 			w.Header().Set("Cache-Control", "no-store")
 			w.Header().Set("Content-Type", "application/json")
--- a/src/api/AdminCrawlsInfo.go
+++ b/src/api/AdminCrawlsInfo.go
@ -27,8 +27,14 @@ func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Ca
 			return
 		} else {
 			response := map[string]interface{}{
-				"active":   DirectoryCrawler.GetActiveCrawls(),
-				"finished": DirectoryCrawler.GetFinishedCrawls(),
+				"crawls": map[string]interface{}{
+					"active":   DirectoryCrawler.GetActiveCrawls(),
+					"finished": DirectoryCrawler.GetFinishedCrawls(),
+				},
+				"workers": map[string]interface{}{
+					"busy": DirectoryCrawler.BusyWorkers,
+					"max":  config.GetConfig().DirectoryCrawlers,
+				},
 			}
 			w.Header().Set("Cache-Control", "no-store")
 			w.Header().Set("Content-Type", "application/json")
--- a/src/cache/DirectoryCrawler/Walker.go
+++ b/src/cache/DirectoryCrawler/Walker.go
@ -13,6 +13,9 @@ import (
 // WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally
 var WorkerPool chan struct{}

+// Jobs is a global channel that all Walker instances submit jobs to
+var Jobs chan WalkJob
+
 // BusyWorkers is an atomic counter for the number of active workers
 var BusyWorkers int32

@ -20,15 +23,53 @@ var BusyWorkers int32
 // to a walker function, does not point to a directory
 var ErrNotDir = errors.New("not a directory")

+// WalkJob is a job that's passed to the workers.
+type WalkJob struct {
+	StartPath string
+	Walker    *Walker
+}
+
 // Walker is constructed for each Walk() function invocation
 type Walker struct {
 	wg             sync.WaitGroup
-	jobs           chan string
 	root           string
 	followSymlinks bool
 	walkFunc       filepath.WalkFunc
 }

+// InitializeWorkers starts the number of workers defined by config.GetConfig().DirectoryCrawlers
+func InitializeWorkers() {
+	WorkerPool = make(chan struct{}, config.GetConfig().DirectoryCrawlers)
+	Jobs = make(chan WalkJob, config.GetConfig().CacheSize)
+	for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
+		go worker()
+	}
+	log.Debugf("Started %d directory crawler workers.", config.GetConfig().DirectoryCrawlers)
+}
+
+// worker processes all the jobs until the jobs channel is explicitly closed
+func worker() {
+	for job := range Jobs {
+		WorkerPool <- struct{}{}         // acquire a worker
+		atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers
+		err := job.Walker.processPath(job.StartPath)
+		if err != nil {
+			log.Warnf("worker - %s - %s", job.StartPath, err)
+		}
+
+		job.Walker.wg.Done()              // decrement the WaitGroup counter
+		<-WorkerPool                      // release the worker when done
+		atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
+	}
+}
+
+// addJob increments the job counter
+// and pushes the path to the jobs channel
+func (w *Walker) addJob(job WalkJob) {
+	w.wg.Add(1)
+	Jobs <- job
+}
+
 // the readDirNames function below was taken from the original
 // implementation (see https://golang.org/src/path/filepath/path.go)
 // but has sorting removed (sorting doesn't make sense
@ -57,8 +98,8 @@ func readDirNames(dirname string) ([]string, error) {

 // lstat is a wrapper for os.Lstat which accepts a path
 // relative to Walker.root and also follows symlinks
-func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) {
-	path := filepath.Join(w.root, relpath)
+func (w *Walker) lstat(relPath string) (info os.FileInfo, err error) {
+	path := filepath.Join(w.root, relPath)
 	info, err = os.Lstat(path)
 	if err != nil {
 		return nil, err
@ -81,82 +122,51 @@ func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) {

 // processPath processes one directory and adds
 // its subdirectories to the queue for further processing
-func (w *Walker) processPath(relpath string) error {
-	defer w.wg.Done()
-
-	path := filepath.Join(w.root, relpath)
-	names, err := readDirNames(path)
+func (w *Walker) processPath(relPath string) error {
+	fullPath := filepath.Join(w.root, relPath)
+	names, err := readDirNames(fullPath)
 	if err != nil {
 		log.Errorf("Walker - processPath - readDirNames - %s", err)
 		return err
 	}

 	for _, name := range names {
-		subpath := filepath.Join(relpath, name)
-		info, err := w.lstat(subpath)
+		subPath := filepath.Join(relPath, name)
+		info, err := w.lstat(subPath)

 		if err != nil {
-			log.Warnf("processPath - %s - %s", relpath, err)
+			log.Warnf("processPath - %s - %s", relPath, err)
 			continue
 		}

 		if info == nil {
-			log.Warnf("processPath - %s - %s", relpath, err)
+			log.Warnf("processPath - %s - %s", relPath, err)
 			continue
 		}
-
-		err = w.walkFunc(filepath.Join(w.root, subpath), info, err)
+		subPathFull := filepath.Join(w.root, subPath)
+		err = w.walkFunc(subPathFull, info, err)
 		if errors.Is(err, filepath.SkipDir) {
 			return nil
 		}

 		if info.Mode().IsDir() {
-			w.addJob(subpath)
+			w.addJob(WalkJob{
+				StartPath: subPath,
+				Walker:    w,
+			})
 		}
 	}
 	return nil
 }

-// addJob increments the job counter
-// and pushes the path to the jobs channel
-func (w *Walker) addJob(path string) {
-	w.wg.Add(1)
-	select {
-	// try to push the job to the channel
-	case w.jobs <- path: // ok
-	default: // buffer overflow
-		// process job synchronously
-		err := w.processPath(path)
-		if err != nil {
-			log.Warnf("addJob - %s - %s", path, err)
-		}
-	}
-}
-
-// worker processes all the jobs until the jobs channel is explicitly closed
-func (w *Walker) worker() {
-	for path := range w.jobs {
-		WorkerPool <- struct{}{}         // acquire a worker
-		atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers
-
-		err := w.processPath(path)
-		if err != nil {
-			log.Warnf("worker - %s", err)
-		}
-
-		<-WorkerPool                      // release the worker when done
-		atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
-	}
-}
-
 // Walk recursively descends into subdirectories, calling walkFn for each file or directory
 // in the tree, including the root directory.
-func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
-	w.jobs = make(chan string, config.GetConfig().DirectoryCrawlers)
+func (w *Walker) Walk(relPath string, walkFn filepath.WalkFunc) error {
 	w.walkFunc = walkFn

-	info, err := w.lstat(relpath)
-	err = w.walkFunc(filepath.Join(w.root, relpath), info, err)
+	fullPath := filepath.Join(w.root, relPath)
+	info, err := w.lstat(relPath)
+	err = w.walkFunc(fullPath, info, err)
 	if errors.Is(err, filepath.SkipDir) {
 		return nil
 	}
@ -165,21 +175,18 @@ func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
 	}

 	if info == nil {
-		return fmt.Errorf("broken symlink: %s", relpath)
+		return fmt.Errorf("broken symlink: %s", relPath)
 	}

 	if !info.Mode().IsDir() {
 		return ErrNotDir
 	}

-	// Spawn workers
-	for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
-		go w.worker()
-	}
-
-	w.addJob(relpath) // add this path as a first job
-	w.wg.Wait()       // wait till all paths are processed
-	close(w.jobs)     // signal workers to close
+	w.addJob(WalkJob{
+		StartPath: relPath,
+		Walker:    w,
+	}) // add this path as a first job
+	w.wg.Wait() // wait till all paths are processed
 	return nil
 }

--- a/src/cache/DirectoryCrawler/process.go
+++ b/src/cache/DirectoryCrawler/process.go
@ -49,7 +49,7 @@ func (dc *DirectoryCrawler) processPath(fullPath string, info os.FileInfo) error
 			}
 		}
 	} else {
-		// Path is a file
+		// StartPath is a file
 		dc.AddCacheItem(fullPath, info)
 	}
 	return nil
--- a/src/cache/crawler.go
+++ b/src/cache/crawler.go
@ -59,8 +59,7 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
 func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) {
 	defer ticker.Stop()
 	for range ticker.C {
-		activeWorkers := int(DirectoryCrawler.BusyWorkers)
-		runningCrawls := DirectoryCrawler.GetTotalActiveCrawls()
-		logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, runningCrawls)
+		logFn("%s - %d/%d items in the cache. Busy workers: %d, running crawls: %d",
+			msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, DirectoryCrawler.BusyWorkers, DirectoryCrawler.GetTotalActiveCrawls())
 	}
 }
--- a/src/config/config.go
+++ b/src/config/config.go
@ -14,11 +14,9 @@ type Config struct {
 	HTTPPort                          string
 	CrawlModeCrawlInterval            int
 	DirectoryCrawlers                 int
-	CrawlWorkers                      int
 	CacheSize                         int
 	CacheTime                         int
 	CachePrintNew                     bool
-	CachePrintChanges                 bool
 	InitialCrawl                      bool
 	CacheRecacheCrawlerLimit          int
 	CrawlerParseMIME                  bool
@ -31,7 +29,6 @@ type Config struct {
 	RestrictedDownloadPaths           []string
 	ApiSearchMaxResults               int
 	ApiSearchShowChildren             bool
-	WorkersJobQueueSize               int
 	ElasticsearchEnable               bool
 	ElasticsearchEndpoint             string
 	ElasticsearchSyncEnable           bool
@ -59,8 +56,7 @@ func SetConfig(configFile string) (*Config, error) {
 	viper.SetDefault("watch_interval", 1)
 	viper.SetDefault("watch_mode", "crawl")
 	viper.SetDefault("crawl_mode_crawl_interval", 3600)
-	viper.SetDefault("directory_crawlers", 4)
-	viper.SetDefault("crawl_workers", 10)
+	viper.SetDefault("directory_crawlers", 10)
 	viper.SetDefault("cache_size", 100000000)
 	viper.SetDefault("cache_time", 30)
 	viper.SetDefault("cache_print_new", false)
@ -110,24 +106,22 @@ func SetConfig(configFile string) (*Config, error) {
 		rootDir = "/"
 	}

-	workersJobQueueSizeValue := viper.GetInt("crawler_worker_job_queue_size")
-	var workersJobQueueSize int
-	if workersJobQueueSizeValue == 0 {
-		workersJobQueueSize = viper.GetInt("crawl_workers") * 100
-	} else {
-		workersJobQueueSize = workersJobQueueSizeValue
-	}
+	//workersJobQueueSizeValue := viper.GetInt("crawler_worker_job_queue_size")
+	//var workersJobQueueSize int
+	//if workersJobQueueSizeValue == 0 {
+	//	workersJobQueueSize = viper.GetInt("crawl_workers") * 100
+	//} else {
+	//	workersJobQueueSize = workersJobQueueSizeValue
+	//}

 	config := &Config{
 		RootDir:                           rootDir,
 		HTTPPort:                          viper.GetString("http_port"),
 		CrawlModeCrawlInterval:            viper.GetInt("crawl_mode_crawl_interval"),
-		DirectoryCrawlers:                 viper.GetInt("crawl_mode_crawl_interval"),
-		CrawlWorkers:                      viper.GetInt("crawl_workers"),
+		DirectoryCrawlers:                 viper.GetInt("directory_crawlers"),
 		CacheSize:                         viper.GetInt("cache_size"),
 		CacheTime:                         viper.GetInt("cache_time"),
 		CachePrintNew:                     viper.GetBool("cache_print_new"),
-		CachePrintChanges:                 viper.GetBool("cache_print_changes"),
 		InitialCrawl:                      viper.GetBool("initial_crawl"),
 		CacheRecacheCrawlerLimit:          viper.GetInt("cache_recache_crawler_limit"),
 		CrawlerParseMIME:                  viper.GetBool("crawler_parse_mime"),
@ -140,7 +134,6 @@ func SetConfig(configFile string) (*Config, error) {
 		RestrictedDownloadPaths:           restrictedPaths,
 		ApiSearchMaxResults:               viper.GetInt("api_search_max_results"),
 		ApiSearchShowChildren:             viper.GetBool("api_search_show_children"),
-		WorkersJobQueueSize:               workersJobQueueSize,
 		ElasticsearchEnable:               viper.GetBool("elasticsearch_enable"),
 		ElasticsearchEndpoint:             viper.GetString("elasticsearch_endpoint"),
 		ElasticsearchSyncEnable:           viper.GetBool("elasticsearch_sync_enable"),
@ -165,10 +158,6 @@ func SetConfig(configFile string) (*Config, error) {
 		return nil, errors.New("crawl_mode_crawl_interval must be more than 1")
 	}

-	if config.CrawlWorkers < 1 {
-		return nil, errors.New("crawl_workers must be more than 1")
-	}
-
 	if config.CacheSize < 1 {
 		return nil, errors.New("crawl_workers must be more than 1")
 	}
--- a/src/crazyfs.go
+++ b/src/crazyfs.go
@ -39,14 +39,6 @@ type cliConfig struct {
 // TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch

 func main() {
-	//fullPath := "/srv/chub-archive"
-	//RootDir := "/srv/chub-archive"
-	//
-	//fmt.Println(strings.HasPrefix(fullPath, RootDir))
-	////fmt.Println(fullPath != RootDir)
-	//
-	//return
-
 	cliArgs := parseArgs()
 	if cliArgs.help {
 		flag.Usage()
@ -101,9 +93,7 @@ func main() {

 	log.Infof("Elasticsearch enabled: %t", cfg.ElasticsearchEnable)

-	// Init global variables
-	//DirectoryCrawler.CrawlWorkerPool = DirectoryCrawler.NewWorkerPool(config.MaxWorkers)
-	DirectoryCrawler.WorkerPool = make(chan struct{}, cfg.CrawlWorkers)
+	DirectoryCrawler.InitializeWorkers()

 	cache.InitRecacheSemaphore(cfg.CacheRecacheCrawlerLimit)

@ -166,7 +156,7 @@ func main() {

 func parseArgs() cliConfig {
 	var cliArgs cliConfig
-	flag.StringVar(&cliArgs.configFile, "config", "", "Path to the config file")
+	flag.StringVar(&cliArgs.configFile, "config", "", "StartPath to the config file")
 	flag.BoolVar(&cliArgs.initialCrawl, "initial-crawl", false, "Do an initial crawl to fill the cache")
 	flag.BoolVar(&cliArgs.initialCrawl, "i", false, "Do an initial crawl to fill the cache")
 	flag.BoolVar(&cliArgs.debug, "d", false, "Enable debug mode")