make workers global, fix worker setup, clean up
This commit is contained in:
parent
7078712bc3
commit
157f80a463
|
@ -33,7 +33,7 @@ func NewItem(fullPath string, info os.FileInfo) *Item {
|
|||
// Ignore symlinks
|
||||
return nil
|
||||
} else {
|
||||
log.Warnf("NewItem - Path does not exist: %s", fullPath)
|
||||
log.Warnf("NewItem - StartPath does not exist: %s", fullPath)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -65,7 +65,7 @@ func NewItem(fullPath string, info os.FileInfo) *Item {
|
|||
}
|
||||
|
||||
if os.IsNotExist(err) {
|
||||
log.Warnf("Path does not exist: %s", fullPath)
|
||||
log.Warnf("StartPath does not exist: %s", fullPath)
|
||||
return nil
|
||||
} else if err != nil {
|
||||
log.Warnf("Error detecting MIME type of file %s - %v", fullPath, err)
|
||||
|
|
|
@ -3,7 +3,6 @@ package api
|
|||
import (
|
||||
"crazyfs/CacheItem"
|
||||
"crazyfs/api/helpers"
|
||||
"crazyfs/cache/DirectoryCrawler"
|
||||
"crazyfs/config"
|
||||
"crazyfs/elastic"
|
||||
"crypto/sha256"
|
||||
|
@ -30,12 +29,11 @@ func AdminCacheInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cac
|
|||
cacheLen := sharedCache.Len()
|
||||
|
||||
response := map[string]interface{}{
|
||||
"cache_size": cacheLen,
|
||||
"cache_max": config.GetConfig().CacheSize,
|
||||
"crawls_running": DirectoryCrawler.GetTotalActiveCrawls(),
|
||||
"busy_workers": DirectoryCrawler.BusyWorkers,
|
||||
"new_sync_running": elastic.ElasticRefreshSyncRunning,
|
||||
"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
|
||||
"cachedItems": cacheLen,
|
||||
"cacheMax": config.GetConfig().CacheSize,
|
||||
"recacheCrawlLimit": config.GetConfig().CacheRecacheCrawlerLimit,
|
||||
"newSyncRunning": elastic.ElasticRefreshSyncRunning,
|
||||
"refreshSyncRunning": elastic.ElasticRefreshSyncRunning,
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
|
|
@ -27,8 +27,14 @@ func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Ca
|
|||
return
|
||||
} else {
|
||||
response := map[string]interface{}{
|
||||
"active": DirectoryCrawler.GetActiveCrawls(),
|
||||
"finished": DirectoryCrawler.GetFinishedCrawls(),
|
||||
"crawls": map[string]interface{}{
|
||||
"active": DirectoryCrawler.GetActiveCrawls(),
|
||||
"finished": DirectoryCrawler.GetFinishedCrawls(),
|
||||
},
|
||||
"workers": map[string]interface{}{
|
||||
"busy": DirectoryCrawler.BusyWorkers,
|
||||
"max": config.GetConfig().DirectoryCrawlers,
|
||||
},
|
||||
}
|
||||
w.Header().Set("Cache-Control", "no-store")
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
|
|
|
@ -13,6 +13,9 @@ import (
|
|||
// WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally
|
||||
var WorkerPool chan struct{}
|
||||
|
||||
// Jobs is a global channel that all Walker instances submit jobs to
|
||||
var Jobs chan WalkJob
|
||||
|
||||
// BusyWorkers is an atomic counter for the number of active workers
|
||||
var BusyWorkers int32
|
||||
|
||||
|
@ -20,15 +23,53 @@ var BusyWorkers int32
|
|||
// to a walker function, does not point to a directory
|
||||
var ErrNotDir = errors.New("not a directory")
|
||||
|
||||
// WalkJob is a job that's passed to the workers.
|
||||
type WalkJob struct {
|
||||
StartPath string
|
||||
Walker *Walker
|
||||
}
|
||||
|
||||
// Walker is constructed for each Walk() function invocation
|
||||
type Walker struct {
|
||||
wg sync.WaitGroup
|
||||
jobs chan string
|
||||
root string
|
||||
followSymlinks bool
|
||||
walkFunc filepath.WalkFunc
|
||||
}
|
||||
|
||||
// InitializeWorkers starts the number of workers defined by config.GetConfig().DirectoryCrawlers
|
||||
func InitializeWorkers() {
|
||||
WorkerPool = make(chan struct{}, config.GetConfig().DirectoryCrawlers)
|
||||
Jobs = make(chan WalkJob, config.GetConfig().CacheSize)
|
||||
for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
|
||||
go worker()
|
||||
}
|
||||
log.Debugf("Started %d directory crawler workers.", config.GetConfig().DirectoryCrawlers)
|
||||
}
|
||||
|
||||
// worker processes all the jobs until the jobs channel is explicitly closed
|
||||
func worker() {
|
||||
for job := range Jobs {
|
||||
WorkerPool <- struct{}{} // acquire a worker
|
||||
atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers
|
||||
err := job.Walker.processPath(job.StartPath)
|
||||
if err != nil {
|
||||
log.Warnf("worker - %s - %s", job.StartPath, err)
|
||||
}
|
||||
|
||||
job.Walker.wg.Done() // decrement the WaitGroup counter
|
||||
<-WorkerPool // release the worker when done
|
||||
atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
|
||||
}
|
||||
}
|
||||
|
||||
// addJob increments the job counter
|
||||
// and pushes the path to the jobs channel
|
||||
func (w *Walker) addJob(job WalkJob) {
|
||||
w.wg.Add(1)
|
||||
Jobs <- job
|
||||
}
|
||||
|
||||
// the readDirNames function below was taken from the original
|
||||
// implementation (see https://golang.org/src/path/filepath/path.go)
|
||||
// but has sorting removed (sorting doesn't make sense
|
||||
|
@ -57,8 +98,8 @@ func readDirNames(dirname string) ([]string, error) {
|
|||
|
||||
// lstat is a wrapper for os.Lstat which accepts a path
|
||||
// relative to Walker.root and also follows symlinks
|
||||
func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) {
|
||||
path := filepath.Join(w.root, relpath)
|
||||
func (w *Walker) lstat(relPath string) (info os.FileInfo, err error) {
|
||||
path := filepath.Join(w.root, relPath)
|
||||
info, err = os.Lstat(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -81,82 +122,51 @@ func (w *Walker) lstat(relpath string) (info os.FileInfo, err error) {
|
|||
|
||||
// processPath processes one directory and adds
|
||||
// its subdirectories to the queue for further processing
|
||||
func (w *Walker) processPath(relpath string) error {
|
||||
defer w.wg.Done()
|
||||
|
||||
path := filepath.Join(w.root, relpath)
|
||||
names, err := readDirNames(path)
|
||||
func (w *Walker) processPath(relPath string) error {
|
||||
fullPath := filepath.Join(w.root, relPath)
|
||||
names, err := readDirNames(fullPath)
|
||||
if err != nil {
|
||||
log.Errorf("Walker - processPath - readDirNames - %s", err)
|
||||
return err
|
||||
}
|
||||
|
||||
for _, name := range names {
|
||||
subpath := filepath.Join(relpath, name)
|
||||
info, err := w.lstat(subpath)
|
||||
subPath := filepath.Join(relPath, name)
|
||||
info, err := w.lstat(subPath)
|
||||
|
||||
if err != nil {
|
||||
log.Warnf("processPath - %s - %s", relpath, err)
|
||||
log.Warnf("processPath - %s - %s", relPath, err)
|
||||
continue
|
||||
}
|
||||
|
||||
if info == nil {
|
||||
log.Warnf("processPath - %s - %s", relpath, err)
|
||||
log.Warnf("processPath - %s - %s", relPath, err)
|
||||
continue
|
||||
}
|
||||
|
||||
err = w.walkFunc(filepath.Join(w.root, subpath), info, err)
|
||||
subPathFull := filepath.Join(w.root, subPath)
|
||||
err = w.walkFunc(subPathFull, info, err)
|
||||
if errors.Is(err, filepath.SkipDir) {
|
||||
return nil
|
||||
}
|
||||
|
||||
if info.Mode().IsDir() {
|
||||
w.addJob(subpath)
|
||||
w.addJob(WalkJob{
|
||||
StartPath: subPath,
|
||||
Walker: w,
|
||||
})
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// addJob increments the job counter
|
||||
// and pushes the path to the jobs channel
|
||||
func (w *Walker) addJob(path string) {
|
||||
w.wg.Add(1)
|
||||
select {
|
||||
// try to push the job to the channel
|
||||
case w.jobs <- path: // ok
|
||||
default: // buffer overflow
|
||||
// process job synchronously
|
||||
err := w.processPath(path)
|
||||
if err != nil {
|
||||
log.Warnf("addJob - %s - %s", path, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// worker processes all the jobs until the jobs channel is explicitly closed
|
||||
func (w *Walker) worker() {
|
||||
for path := range w.jobs {
|
||||
WorkerPool <- struct{}{} // acquire a worker
|
||||
atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers
|
||||
|
||||
err := w.processPath(path)
|
||||
if err != nil {
|
||||
log.Warnf("worker - %s", err)
|
||||
}
|
||||
|
||||
<-WorkerPool // release the worker when done
|
||||
atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
|
||||
}
|
||||
}
|
||||
|
||||
// Walk recursively descends into subdirectories, calling walkFn for each file or directory
|
||||
// in the tree, including the root directory.
|
||||
func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
|
||||
w.jobs = make(chan string, config.GetConfig().DirectoryCrawlers)
|
||||
func (w *Walker) Walk(relPath string, walkFn filepath.WalkFunc) error {
|
||||
w.walkFunc = walkFn
|
||||
|
||||
info, err := w.lstat(relpath)
|
||||
err = w.walkFunc(filepath.Join(w.root, relpath), info, err)
|
||||
fullPath := filepath.Join(w.root, relPath)
|
||||
info, err := w.lstat(relPath)
|
||||
err = w.walkFunc(fullPath, info, err)
|
||||
if errors.Is(err, filepath.SkipDir) {
|
||||
return nil
|
||||
}
|
||||
|
@ -165,21 +175,18 @@ func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
|
|||
}
|
||||
|
||||
if info == nil {
|
||||
return fmt.Errorf("broken symlink: %s", relpath)
|
||||
return fmt.Errorf("broken symlink: %s", relPath)
|
||||
}
|
||||
|
||||
if !info.Mode().IsDir() {
|
||||
return ErrNotDir
|
||||
}
|
||||
|
||||
// Spawn workers
|
||||
for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
|
||||
go w.worker()
|
||||
}
|
||||
|
||||
w.addJob(relpath) // add this path as a first job
|
||||
w.wg.Wait() // wait till all paths are processed
|
||||
close(w.jobs) // signal workers to close
|
||||
w.addJob(WalkJob{
|
||||
StartPath: relPath,
|
||||
Walker: w,
|
||||
}) // add this path as a first job
|
||||
w.wg.Wait() // wait till all paths are processed
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
|
@ -49,7 +49,7 @@ func (dc *DirectoryCrawler) processPath(fullPath string, info os.FileInfo) error
|
|||
}
|
||||
}
|
||||
} else {
|
||||
// Path is a file
|
||||
// StartPath is a file
|
||||
dc.AddCacheItem(fullPath, info)
|
||||
}
|
||||
return nil
|
||||
|
|
|
@ -59,8 +59,7 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
|
|||
func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) {
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
activeWorkers := int(DirectoryCrawler.BusyWorkers)
|
||||
runningCrawls := DirectoryCrawler.GetTotalActiveCrawls()
|
||||
logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, runningCrawls)
|
||||
logFn("%s - %d/%d items in the cache. Busy workers: %d, running crawls: %d",
|
||||
msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, DirectoryCrawler.BusyWorkers, DirectoryCrawler.GetTotalActiveCrawls())
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,11 +14,9 @@ type Config struct {
|
|||
HTTPPort string
|
||||
CrawlModeCrawlInterval int
|
||||
DirectoryCrawlers int
|
||||
CrawlWorkers int
|
||||
CacheSize int
|
||||
CacheTime int
|
||||
CachePrintNew bool
|
||||
CachePrintChanges bool
|
||||
InitialCrawl bool
|
||||
CacheRecacheCrawlerLimit int
|
||||
CrawlerParseMIME bool
|
||||
|
@ -31,7 +29,6 @@ type Config struct {
|
|||
RestrictedDownloadPaths []string
|
||||
ApiSearchMaxResults int
|
||||
ApiSearchShowChildren bool
|
||||
WorkersJobQueueSize int
|
||||
ElasticsearchEnable bool
|
||||
ElasticsearchEndpoint string
|
||||
ElasticsearchSyncEnable bool
|
||||
|
@ -59,8 +56,7 @@ func SetConfig(configFile string) (*Config, error) {
|
|||
viper.SetDefault("watch_interval", 1)
|
||||
viper.SetDefault("watch_mode", "crawl")
|
||||
viper.SetDefault("crawl_mode_crawl_interval", 3600)
|
||||
viper.SetDefault("directory_crawlers", 4)
|
||||
viper.SetDefault("crawl_workers", 10)
|
||||
viper.SetDefault("directory_crawlers", 10)
|
||||
viper.SetDefault("cache_size", 100000000)
|
||||
viper.SetDefault("cache_time", 30)
|
||||
viper.SetDefault("cache_print_new", false)
|
||||
|
@ -110,24 +106,22 @@ func SetConfig(configFile string) (*Config, error) {
|
|||
rootDir = "/"
|
||||
}
|
||||
|
||||
workersJobQueueSizeValue := viper.GetInt("crawler_worker_job_queue_size")
|
||||
var workersJobQueueSize int
|
||||
if workersJobQueueSizeValue == 0 {
|
||||
workersJobQueueSize = viper.GetInt("crawl_workers") * 100
|
||||
} else {
|
||||
workersJobQueueSize = workersJobQueueSizeValue
|
||||
}
|
||||
//workersJobQueueSizeValue := viper.GetInt("crawler_worker_job_queue_size")
|
||||
//var workersJobQueueSize int
|
||||
//if workersJobQueueSizeValue == 0 {
|
||||
// workersJobQueueSize = viper.GetInt("crawl_workers") * 100
|
||||
//} else {
|
||||
// workersJobQueueSize = workersJobQueueSizeValue
|
||||
//}
|
||||
|
||||
config := &Config{
|
||||
RootDir: rootDir,
|
||||
HTTPPort: viper.GetString("http_port"),
|
||||
CrawlModeCrawlInterval: viper.GetInt("crawl_mode_crawl_interval"),
|
||||
DirectoryCrawlers: viper.GetInt("crawl_mode_crawl_interval"),
|
||||
CrawlWorkers: viper.GetInt("crawl_workers"),
|
||||
DirectoryCrawlers: viper.GetInt("directory_crawlers"),
|
||||
CacheSize: viper.GetInt("cache_size"),
|
||||
CacheTime: viper.GetInt("cache_time"),
|
||||
CachePrintNew: viper.GetBool("cache_print_new"),
|
||||
CachePrintChanges: viper.GetBool("cache_print_changes"),
|
||||
InitialCrawl: viper.GetBool("initial_crawl"),
|
||||
CacheRecacheCrawlerLimit: viper.GetInt("cache_recache_crawler_limit"),
|
||||
CrawlerParseMIME: viper.GetBool("crawler_parse_mime"),
|
||||
|
@ -140,7 +134,6 @@ func SetConfig(configFile string) (*Config, error) {
|
|||
RestrictedDownloadPaths: restrictedPaths,
|
||||
ApiSearchMaxResults: viper.GetInt("api_search_max_results"),
|
||||
ApiSearchShowChildren: viper.GetBool("api_search_show_children"),
|
||||
WorkersJobQueueSize: workersJobQueueSize,
|
||||
ElasticsearchEnable: viper.GetBool("elasticsearch_enable"),
|
||||
ElasticsearchEndpoint: viper.GetString("elasticsearch_endpoint"),
|
||||
ElasticsearchSyncEnable: viper.GetBool("elasticsearch_sync_enable"),
|
||||
|
@ -165,10 +158,6 @@ func SetConfig(configFile string) (*Config, error) {
|
|||
return nil, errors.New("crawl_mode_crawl_interval must be more than 1")
|
||||
}
|
||||
|
||||
if config.CrawlWorkers < 1 {
|
||||
return nil, errors.New("crawl_workers must be more than 1")
|
||||
}
|
||||
|
||||
if config.CacheSize < 1 {
|
||||
return nil, errors.New("crawl_workers must be more than 1")
|
||||
}
|
||||
|
|
|
@ -39,14 +39,6 @@ type cliConfig struct {
|
|||
// TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch
|
||||
|
||||
func main() {
|
||||
//fullPath := "/srv/chub-archive"
|
||||
//RootDir := "/srv/chub-archive"
|
||||
//
|
||||
//fmt.Println(strings.HasPrefix(fullPath, RootDir))
|
||||
////fmt.Println(fullPath != RootDir)
|
||||
//
|
||||
//return
|
||||
|
||||
cliArgs := parseArgs()
|
||||
if cliArgs.help {
|
||||
flag.Usage()
|
||||
|
@ -101,9 +93,7 @@ func main() {
|
|||
|
||||
log.Infof("Elasticsearch enabled: %t", cfg.ElasticsearchEnable)
|
||||
|
||||
// Init global variables
|
||||
//DirectoryCrawler.CrawlWorkerPool = DirectoryCrawler.NewWorkerPool(config.MaxWorkers)
|
||||
DirectoryCrawler.WorkerPool = make(chan struct{}, cfg.CrawlWorkers)
|
||||
DirectoryCrawler.InitializeWorkers()
|
||||
|
||||
cache.InitRecacheSemaphore(cfg.CacheRecacheCrawlerLimit)
|
||||
|
||||
|
@ -166,7 +156,7 @@ func main() {
|
|||
|
||||
func parseArgs() cliConfig {
|
||||
var cliArgs cliConfig
|
||||
flag.StringVar(&cliArgs.configFile, "config", "", "Path to the config file")
|
||||
flag.StringVar(&cliArgs.configFile, "config", "", "StartPath to the config file")
|
||||
flag.BoolVar(&cliArgs.initialCrawl, "initial-crawl", false, "Do an initial crawl to fill the cache")
|
||||
flag.BoolVar(&cliArgs.initialCrawl, "i", false, "Do an initial crawl to fill the cache")
|
||||
flag.BoolVar(&cliArgs.debug, "d", false, "Enable debug mode")
|
||||
|
|
Loading…
Reference in New Issue