track running crawls and add an admin page, use basic auth for admin, reject crawl if already running for a path, limit max directory crawlers, fix some issues
parent a96708f6cf
commit 7078712bc3
@@ -39,4 +39,4 @@ If you're using initial cache and have tons of files to scan you'll need at leas
so minutes for it to traverse the directory structure. CrazyFS is heavily threaded, so you'll want at least an 8-core
machine.

CrazyFS works great with an HTTP cache in front of it.
You'll need something like Nginx if you want SSL or HTTP. Also, CrazyFS works great with an HTTP cache in front of it.
@@ -1,8 +0,0 @@
The code you've posted is already quite efficient, but there are a few things you could consider to improve its performance:

1. **Use a more efficient file watcher:** The `github.com/radovskyb/watcher` package uses polling to detect file changes, which can be inefficient for large directories. If you're on Linux, consider using a package like `github.com/fsnotify/fsnotify`, which uses inotify, a Linux kernel subsystem that provides more efficient file change notifications. (A sketch follows after this list.)
2. **Reduce the number of goroutines:** Each time a file change event is received, a new goroutine is created to handle it. This could potentially create a large number of goroutines if many file changes are happening at once. Consider using a worker pool pattern to limit the number of concurrent goroutines. (See the sketch after this list.)
3. **Optimize your cache:** The LRU cache you're using is thread-safe, but it uses a mutex to achieve this. If you have a lot of contention (i.e., many goroutines trying to access the cache at once), this could slow things down. Consider using a sharded cache, which reduces contention by dividing the cache into several smaller caches, each with its own lock.
4. **Avoid unnecessary work:** If a file is created and then immediately modified, your code will crawl it twice. Consider adding a delay before crawling a file, and if another event for the same file is received during this delay, only crawl it once.
5. **Optimize your logging:** Writing to the log can be slow, especially if it's writing to a file or over the network. Consider using a buffered logger, which can improve performance by batching log messages together.
6. **Use a profiler:** The best way to find out where your code is spending its time is to use a profiler. The `net/http/pprof` package provides a simple way to add profiling endpoints to your application, which you can then view with the `go tool pprof` command. (A sketch follows after this list.)
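For item 1, a minimal fsnotify sketch. This is not code from this repository; the watched path is a placeholder, and note that fsnotify does not watch recursively, so each subdirectory would need its own `Add()` call.

```go
package main

import (
	"log"

	"github.com/fsnotify/fsnotify"
)

func main() {
	// Create an inotify-backed watcher instead of a polling one.
	watcher, err := fsnotify.NewWatcher()
	if err != nil {
		log.Fatal(err)
	}
	defer watcher.Close()

	// Hypothetical directory to watch.
	if err := watcher.Add("/srv/data"); err != nil {
		log.Fatal(err)
	}

	for {
		select {
		case event, ok := <-watcher.Events:
			if !ok {
				return
			}
			if event.Op&fsnotify.Write == fsnotify.Write {
				log.Printf("modified: %s", event.Name)
			}
		case err, ok := <-watcher.Errors:
			if !ok {
				return
			}
			log.Printf("watch error: %s", err)
		}
	}
}
```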
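For item 2, a minimal worker pool sketch using only the standard library; the `event` type and `handle` function are placeholders, not types from this codebase.

```go
package main

import (
	"log"
	"sync"
)

// event is a placeholder for whatever the file watcher emits.
type event struct{ path string }

func handle(e event) {
	log.Printf("crawling %s", e.path)
}

func main() {
	const numWorkers = 8
	jobs := make(chan event, 128)
	var wg sync.WaitGroup

	// Fixed pool of workers instead of one goroutine per event.
	for i := 0; i < numWorkers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for e := range jobs {
				handle(e)
			}
		}()
	}

	// The watcher loop would send events here instead of spawning goroutines.
	jobs <- event{path: "/srv/data/example.txt"}
	close(jobs)
	wg.Wait()
}
```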
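For item 6, enabling the pprof endpoints is a one-line import plus an HTTP listener; the port below is arbitrary.

```go
package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers /debug/pprof/* handlers on the default mux
)

func main() {
	// Serve the profiling endpoints on a local port.
	go func() {
		log.Println(http.ListenAndServe("localhost:6060", nil))
	}()

	// ... the rest of the application runs as usual ...
	select {}
}
```

A CPU profile can then be inspected with, for example, `go tool pprof http://localhost:6060/debug/pprof/profile`.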
@@ -10,12 +10,19 @@ import (
"time"
)

func NewItem(fullPath string, info os.FileInfo) *Item {
if !strings.HasPrefix(fullPath, config.GetConfig().RootDir) {
// Retard check
panic(fmt.Sprintf("NewItem was not passed an absolute path. The path must start with the RootDir: %s", fullPath))
}
func PathOutsideRoot(fullPath string) bool {
return !strings.HasPrefix(fullPath, config.GetConfig().RootDir)
}

func RetardCheck(fullPath string) {
// Make sure we never do anything outside of the root dir.
if PathOutsideRoot(fullPath) {
panic(fmt.Sprintf("NewItem was not passed an absolute path. The path must start with the RootDir (%s). Failing path: %s", config.GetConfig().RootDir, fullPath))
}
}

func NewItem(fullPath string, info os.FileInfo) *Item {
RetardCheck(fullPath)
if config.GetConfig().CachePrintNew {
log.Debugf("CACHE - new: %s", fullPath)
}
@@ -60,7 +60,6 @@ func NewResponseItem(cacheItem *CacheItem.Item, sharedCache *lru.Cache[string, *

dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)
item, err := dc.CrawlNoRecursion(filepath.Join(config.GetConfig().RootDir, child))

if err != nil {
log.Errorf("NewResponseItem - CrawlNoRecursion - %s", err)
continue // skip this child
@@ -6,34 +6,47 @@ import (
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config"
"crazyfs/elastic"
"crypto/sha256"
"crypto/subtle"
"encoding/json"
lru "github.com/hashicorp/golang-lru/v2"
"net/http"
)

func AdminCacheInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
auth := r.URL.Query().Get("auth")
if auth == "" || auth != config.GetConfig().HttpAdminKey {
helpers.Return403Msg("access denied", w)
return
}
username, password, ok := r.BasicAuth()
if ok {
usernameHash := sha256.Sum256([]byte(username))
passwordHash := sha256.Sum256([]byte(password))
expectedUsernameHash := sha256.Sum256([]byte("admin"))
expectedPasswordHash := sha256.Sum256([]byte(config.GetConfig().HttpAdminKey))
usernameMatch := subtle.ConstantTimeCompare(usernameHash[:], expectedUsernameHash[:]) == 1
passwordMatch := subtle.ConstantTimeCompare(passwordHash[:], expectedPasswordHash[:]) == 1

cacheLen := sharedCache.Len()
if !usernameMatch || !passwordMatch {
helpers.Return401Msg("unauthorized", w)
return
} else {
cacheLen := sharedCache.Len()

response := map[string]interface{}{
"cache_size": cacheLen,
"cache_max": config.GetConfig().CacheSize,
"crawls_running": DirectoryCrawler.GetGlobalActiveCrawls(),
"active_workers": DirectoryCrawler.ActiveWorkers,
"busy_workers": DirectoryCrawler.ActiveWalks,
"new_sync_running": elastic.ElasticRefreshSyncRunning,
"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
}

w.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w).Encode(response)
if err != nil {
log.Errorf("AdminCacheInfo - Failed to serialize JSON: %s", err)
return
response := map[string]interface{}{
"cache_size": cacheLen,
"cache_max": config.GetConfig().CacheSize,
"crawls_running": DirectoryCrawler.GetTotalActiveCrawls(),
"busy_workers": DirectoryCrawler.BusyWorkers,
"new_sync_running": elastic.ElasticRefreshSyncRunning,
"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
}
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w).Encode(response)
if err != nil {
log.Errorf("AdminCacheInfo - Failed to serialize JSON: %s", err)
return
}
return
}
}
w.Header().Set("WWW-Authenticate", `Basic realm="restricted", charset="UTF-8"`)
helpers.Return401Msg("unauthorized", w)
}
@@ -0,0 +1,46 @@
package api

import (
"crazyfs/CacheItem"
"crazyfs/api/helpers"
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config"
"crypto/sha256"
"crypto/subtle"
"encoding/json"
lru "github.com/hashicorp/golang-lru/v2"
"net/http"
)

func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
username, password, ok := r.BasicAuth()
if ok {
usernameHash := sha256.Sum256([]byte(username))
passwordHash := sha256.Sum256([]byte(password))
expectedUsernameHash := sha256.Sum256([]byte("admin"))
expectedPasswordHash := sha256.Sum256([]byte(config.GetConfig().HttpAdminKey))
usernameMatch := subtle.ConstantTimeCompare(usernameHash[:], expectedUsernameHash[:]) == 1
passwordMatch := subtle.ConstantTimeCompare(passwordHash[:], expectedPasswordHash[:]) == 1

if !usernameMatch || !passwordMatch {
helpers.Return401Msg("unauthorized", w)
return
} else {
response := map[string]interface{}{
"active": DirectoryCrawler.GetActiveCrawls(),
"finished": DirectoryCrawler.GetFinishedCrawls(),
}
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w).Encode(response)
if err != nil {
log.Errorf("AdminCrawlsInfo - Failed to serialize JSON: %s", err)
helpers.Return500Msg(w)
return
}
return
}
}
w.Header().Set("WWW-Authenticate", `Basic realm="restricted", charset="UTF-8"`)
helpers.Return401Msg("unauthorized", w)
}
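Based on the handler above and the route added later in this commit (`/api/admin/crawls/info`), a client authenticates with HTTP basic auth using the username `admin` and the configured HttpAdminKey as the password. A minimal sketch; the server address and key below are placeholders, not values from the repository:

```go
package main

import (
	"fmt"
	"io"
	"log"
	"net/http"
)

func main() {
	// Placeholder server address; adjust to wherever CrazyFS is listening.
	req, err := http.NewRequest("GET", "http://localhost:8080/api/admin/crawls/info", nil)
	if err != nil {
		log.Fatal(err)
	}
	// Username is "admin"; the password is the configured HttpAdminKey.
	req.SetBasicAuth("admin", "your-http-admin-key")

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(body)) // JSON with "active" and "finished" crawl lists
}
```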
@@ -7,6 +7,7 @@ import (
"crazyfs/config"
"crazyfs/file"
"encoding/json"
"fmt"
lru "github.com/hashicorp/golang-lru/v2"
"net/http"
)
@@ -27,7 +28,7 @@ func AdminReCache(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache

auth := requestBody["auth"]
if auth == "" || auth != config.GetConfig().HttpAdminKey {
helpers.Return403Msg("access denied", w)
helpers.Return401Msg("unauthorized", w)
return
}
@@ -42,16 +43,21 @@ func AdminReCache(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache
return
}

//relPath := cache.StripRootDir(fullPath, cfg.RootDir)

// Check and re-cache the directory
cache.Recache(fullPath, sharedCache)

response := map[string]interface{}{
"message": "Re-cache triggered for directory: " + fullPath,
"error": nil,
}
log.Infof("Admin triggered recache for %s", fullPath)

// Check and re-cache the directory
err = cache.Recache(fullPath, sharedCache)
if err != nil {
response["message"] = fmt.Sprintf("recache failed")
response["error"] = err.Error()
w.WriteHeader(http.StatusConflict)
log.Errorf("Admin triggered recache for %s - %s", fullPath, err)
} else {
log.Infof("Admin triggered recache for %s", fullPath)
}
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(response)
if err != nil {
@@ -16,7 +16,7 @@ func HealthCheck(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[

response := map[string]interface{}{}

response["scan_running"] = DirectoryCrawler.GetGlobalActiveCrawls() > 0
response["scan_running"] = DirectoryCrawler.GetTotalActiveCrawls() > 0
response["initial_scan_running"] = cache.InitialCrawlInProgress

w.Header().Set("Content-Type", "application/json")
@@ -71,7 +71,7 @@ func SearchFile(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[s

if config.GetConfig().ElasticsearchEnable {
// Perform the Elasticsearch query
resp, err := elastic.Search(queryString, excludeElements)
resp, err := elastic.SimpleQuery(queryString, excludeElements)
if err != nil {
log.Errorf(`SEARCH - Failed to perform Elasticsearch query "%s" - %s`, queryString, err)
helpers.Return500Msg(w)
@@ -14,7 +14,7 @@ import (
func ClientHealthCheck(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
response := map[string]interface{}{}

response["scan_running"] = DirectoryCrawler.GetGlobalActiveCrawls() > 0
response["scan_running"] = DirectoryCrawler.GetTotalActiveCrawls() > 0
response["initial_scan_running"] = cache.InitialCrawlInProgress

w.Header().Set("Content-Type", "application/json")
@@ -1,15 +1,11 @@
package helpers

import (
"crazyfs/logging"
"encoding/json"
"net/http"
)

func WriteErrorResponse(jsonCode, httpCode int, msg string, w http.ResponseWriter) {
log := logging.GetLogger()
log.Warnln(msg)

w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(httpCode)
@@ -25,7 +21,6 @@ func WriteErrorResponse(jsonCode, httpCode int, msg string, w http.ResponseWrite
}

func ReturnFake404Msg(msg string, w http.ResponseWriter) {
log.Fatalf(msg)
WriteErrorResponse(404, http.StatusBadRequest, msg, w)
}
@@ -44,3 +39,7 @@ func Return500Msg(w http.ResponseWriter) {
func Return403Msg(msg string, w http.ResponseWriter) {
WriteErrorResponse(http.StatusForbidden, http.StatusForbidden, msg, w)
}

func Return401Msg(msg string, w http.ResponseWriter) {
WriteErrorResponse(http.StatusUnauthorized, http.StatusUnauthorized, msg, w)
}
@@ -47,7 +47,7 @@ func HandleFileNotFound(relPath string, fullPath string, sharedCache *lru.Cache[

// Start a blocking non-recursive crawl.
item, err := dc.CrawlNoRecursion(fullPath)
if os.IsNotExist(err) || item == nil {
if err == nil && (os.IsNotExist(err) || item == nil) {
ReturnFake404Msg("path not found", w)
return nil
} else if err != nil {
@@ -65,6 +65,12 @@ var routes = Routes{
"/api/admin/cache/recache",
wrongMethod("POST", AdminReCache),
},
Route{
"Crawls Info",
"GET",
"/api/admin/crawls/info",
AdminCrawlsInfo,
},
Route{
"Server Health",
"GET",
@@ -8,10 +8,27 @@ import (
"path/filepath"
"strings"
"sync"
"sync/atomic"
"time"
)

var globalActiveCrawls int32
const maxFinishedCrawls = 100

var activeCrawls = make(map[string]*ActiveCrawl)
var finishedCrawls = make([]FinishedCrawl, 0, maxFinishedCrawls)
var activeCrawlsMutex = &sync.Mutex{}
var finishedCrawlsMutex = &sync.Mutex{}

type ActiveCrawl struct {
Path string `json:"path"`
Start int64 `json:"start"`
Elapsed int64 `json:"elapsed"`
}

type FinishedCrawl struct {
Path string `json:"path"`
Start int64 `json:"start"`
Elapsed int64 `json:"elapsed"`
}

type DirectoryCrawler struct {
cache *lru.Cache[string, *CacheItem.Item]
@@ -76,14 +93,49 @@ func isSubpath(path, subpath string) bool {
return true
}

func (dc *DirectoryCrawler) incrementGlobalActiveCrawls() {
atomic.AddInt32(&globalActiveCrawls, 1)
func (dc *DirectoryCrawler) startCrawl(path string) bool {
if dc.IsCrawlActive(path) {
return false
}
activeCrawls[path] = &ActiveCrawl{Path: path, Start: time.Now().Unix(), Elapsed: int64(0)}
return true
}

func (dc *DirectoryCrawler) decrementGlobalActiveCrawls() {
atomic.AddInt32(&globalActiveCrawls, -1)
func (dc *DirectoryCrawler) endCrawl(path string) {
activeCrawlsMutex.Lock()
finishedCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
defer finishedCrawlsMutex.Unlock()
if len(finishedCrawls) >= maxFinishedCrawls {
finishedCrawls = finishedCrawls[1:]
}
finishedCrawls = append(finishedCrawls, FinishedCrawl{Path: path, Start: activeCrawls[path].Start, Elapsed: int64(time.Since(time.Unix(activeCrawls[path].Start, 0)).Seconds())})
delete(activeCrawls, path)
}

func GetGlobalActiveCrawls() int32 {
return atomic.LoadInt32(&globalActiveCrawls)
func (dc *DirectoryCrawler) IsCrawlActive(path string) bool {
activeCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
_, active := activeCrawls[path]
return active
}

func GetActiveCrawls() map[string]*ActiveCrawl {
activeCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
for path := range activeCrawls {
a := activeCrawls[path]
a.Elapsed = int64(time.Since(time.Unix(a.Start, 0)).Seconds())
}
return activeCrawls
}

func GetFinishedCrawls() []FinishedCrawl {
finishedCrawlsMutex.Lock()
defer finishedCrawlsMutex.Unlock()
return finishedCrawls
}

func GetTotalActiveCrawls() int {
return len(activeCrawls)
}
@@ -1,6 +1,7 @@
package DirectoryCrawler

import (
"crazyfs/config"
"errors"
"fmt"
"os"
@@ -9,16 +10,11 @@ import (
"sync/atomic"
)

var JobQueueSize int

// WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally
var WorkerPool chan struct{}

// ActiveWorkers is an atomic counter for the number of active workers
var ActiveWorkers int32

// ActiveWalks is an atomic counter for the number of active Walk crawls
var ActiveWalks int32
// BusyWorkers is an atomic counter for the number of active workers
var BusyWorkers int32

// ErrNotDir indicates that the path, which is being passed
// to a walker function, does not point to a directory
@@ -109,11 +105,10 @@ func (w *Walker) processPath(relpath string) error {
continue
}

w.walkFunc(filepath.Join(w.root, subpath), info, err)

//if err == filepath.SkipDir {
// return nil
//}
err = w.walkFunc(filepath.Join(w.root, subpath), info, err)
if errors.Is(err, filepath.SkipDir) {
return nil
}

if info.Mode().IsDir() {
w.addJob(subpath)
@@ -138,36 +133,31 @@ func (w *Walker) addJob(path string) {
}
}

// worker processes all the jobs
// until the jobs channel is explicitly closed
// worker processes all the jobs until the jobs channel is explicitly closed
func (w *Walker) worker() {
for path := range w.jobs {
WorkerPool <- struct{}{} // acquire a worker
atomic.AddInt32(&ActiveWorkers, 1) // increment the number of active workers
WorkerPool <- struct{}{} // acquire a worker
atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers

err := w.processPath(path)
if err != nil {
log.Warnf("worker - %s", err)
}

<-WorkerPool // release the worker when done
atomic.AddInt32(&ActiveWorkers, -1) // decrement the number of active workers
<-WorkerPool // release the worker when done
atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
}
}

// Walk recursively descends into subdirectories,
// calling walkFn for each file or directory
// Walk recursively descends into subdirectories, calling walkFn for each file or directory
// in the tree, including the root directory.
func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
atomic.AddInt32(&ActiveWalks, 1) // increment the number of active Walk crawls
defer atomic.AddInt32(&ActiveWalks, -1) // decrement the number of active Walk crawls when done

w.jobs = make(chan string, JobQueueSize)
w.jobs = make(chan string, config.GetConfig().DirectoryCrawlers)
w.walkFunc = walkFn

info, err := w.lstat(relpath)
err = w.walkFunc(filepath.Join(w.root, relpath), info, err)
if err == filepath.SkipDir {
if errors.Is(err, filepath.SkipDir) {
return nil
}
if err != nil {
@@ -182,8 +172,8 @@ func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
return ErrNotDir
}

// spawn workers
for n := 1; n <= JobQueueSize; n++ {
// Spawn workers
for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
go w.worker()
}
@@ -4,34 +4,45 @@ import (
"crazyfs/CacheItem"
"crazyfs/config"
"crazyfs/file"
"errors"
"fmt"
"os"
"path/filepath"
)

func (dc *DirectoryCrawler) walkRecursiveFunc(path string, info os.FileInfo, err error) error {
processErr := dc.processPath(path, info)
func (dc *DirectoryCrawler) walkRecursiveFunc(fullPath string, info os.FileInfo, err error) error {
CacheItem.RetardCheck(fullPath)
processErr := dc.processPath(fullPath, info)
if processErr != nil {
log.Errorf("CRAWLER - walkRecursiveFunc() failed - %s - %s", processErr, path)
log.Errorf("CRAWLER - walkRecursiveFunc() failed - %s - %s", processErr, fullPath)
return processErr
}
return nil
}

func (dc *DirectoryCrawler) walkNonRecursiveFunc(path string, dir os.DirEntry, err error) error {
func (dc *DirectoryCrawler) walkNonRecursiveFunc(fullPath string, dir os.DirEntry, err error) error {
CacheItem.RetardCheck(fullPath)
info, infoErr := dir.Info()
if infoErr != nil {
log.Errorf("CRAWLER - walkNonRecursiveFunc() - get info failed - %s - %s", infoErr, path)
log.Errorf("CRAWLER - walkNonRecursiveFunc() - get info failed - %s - %s", infoErr, fullPath)
return infoErr
}
processErr := dc.processPath(path, info)
processErr := dc.processPath(fullPath, info)
if processErr != nil {
log.Errorf("CRAWLER - walkNonRecursiveFunc() failed - %s - %s", processErr, path)
log.Errorf("CRAWLER - walkNonRecursiveFunc() failed - %s - %s", processErr, fullPath)
return processErr
}
return nil
}

func (dc *DirectoryCrawler) Crawl(fullPath string) error {
CacheItem.RetardCheck(fullPath)
readyToStart := dc.startCrawl(fullPath)
if !readyToStart {
return errors.New(fmt.Sprintf(`rejecting crawl, already in progress for "%s"`, fullPath))
}
defer dc.endCrawl(fullPath)

info, err := os.Lstat(fullPath)
if os.IsNotExist(err) {
// If the path doesn't exist, just silently exit
@@ -84,6 +95,13 @@ func (dc *DirectoryCrawler) Crawl(fullPath string) error {

// CrawlNoRecursion this function crawls a file or directory and does not recurse into any subdirectories. Also returns the result of the crawl.
func (dc *DirectoryCrawler) CrawlNoRecursion(fullPath string) (*CacheItem.Item, error) {
CacheItem.RetardCheck(fullPath)
readyToStart := dc.startCrawl(fullPath)
if !readyToStart {
return nil, errors.New(fmt.Sprintf(`rejecting crawl, already in progress for "%s"`, fullPath))
}
defer dc.endCrawl(fullPath)

info, err := os.Lstat(fullPath)
if os.IsNotExist(err) {
// If the path doesn't exist, just silently exit
@@ -44,10 +44,10 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
log.Infoln("CRAWLER - Starting a crawl...")
start := time.Now()
err := dc.Crawl(config.GetConfig().RootDir)
duration := time.Since(start).Round(time.Second)
if err != nil {
log.Warnf("CRAWLER - Crawl failed: %s", err)
} else {
duration := time.Since(start).Round(time.Second)
log.Infof("CRAWLER - Crawl completed in %s", duration)
log.Debugf("%d/%d items in the cache.", config.GetConfig().CacheSize, len(sharedCache.Keys()))
}
@@ -59,8 +59,8 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) {
defer ticker.Stop()
for range ticker.C {
activeWorkers := int(DirectoryCrawler.ActiveWorkers)
busyWorkers := int(DirectoryCrawler.ActiveWalks)
logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, busyWorkers)
activeWorkers := int(DirectoryCrawler.BusyWorkers)
runningCrawls := DirectoryCrawler.GetTotalActiveCrawls()
logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, runningCrawls)
}
}
@@ -5,7 +5,7 @@ import (
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config"
"crazyfs/file"
"crazyfs/logging"
"errors"
lru "github.com/hashicorp/golang-lru/v2"
"os"
"path/filepath"
@@ -21,7 +21,6 @@ func InitRecacheSemaphore(limit int) {
func CheckAndRecache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
item, found := sharedCache.Get(path)
if found && time.Now().UnixNano()/int64(time.Millisecond)-item.Cached > int64(config.GetConfig().CacheTime)*60*1000 {
log := logging.GetLogger()
log.Debugf("Re-caching: %s", path)
sem <- struct{}{} // acquire a token
go func() {
@@ -30,13 +29,18 @@ func CheckAndRecache(path string, sharedCache *lru.Cache[string, *CacheItem.Item
err := dc.Crawl(path)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
}
}()
}
}

func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
log := logging.GetLogger()
func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) error {
dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)
if dc.IsCrawlActive(path) {
return errors.New("rejecting crawl, already in progress for this path")
}

log.Debugf("Re-caching: %s", path)
start := time.Now()
sem <- struct{}{} // acquire a token
@@ -46,6 +50,7 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
err := dc.Crawl(path)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
}

// Get the parent directory from the cache
@@ -53,7 +58,7 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
parentDirRel := file.StripRootDir(parentDir)
parentItem, found := sharedCache.Get(parentDirRel)
if found {
// Remove the old sub-directory from the parent directory's Children field
// Remove the old subdirectory from the parent directory's Children field
for i, child := range parentItem.Children {
if child == path {
parentItem.Children = append(parentItem.Children[:i], parentItem.Children[i+1:]...)
@@ -61,10 +66,11 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
}
}

// Update the parent directory's Children field to include the new sub-directory
// Update the parent directory's Children field to include the new subdirectory
info, err := os.Stat(path)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
} else {
newItem := CacheItem.NewItem(path, info)
// Create a new slice that contains all items from the Children field except the old directory
@@ -81,15 +87,17 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
// Update the parent directory in the cache
sharedCache.Add(parentDir, parentItem)
}
} else {
} else if !CacheItem.PathOutsideRoot(parentDir) {
// If the parent directory isn't in the cache, crawl it
log.Infof("RECACHE - crawling parent directory since it isn't in the cache yet: %s", parentDir)
err := dc.Crawl(parentDir)
_, err := dc.CrawlNoRecursion(parentDir)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
}
}
duration := time.Since(start).Round(time.Second)
log.Infof("RECACHE - completed in %s - %s", duration, path)
}()
return nil
}
@@ -39,6 +39,14 @@ type cliConfig struct {
// TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch

func main() {
//fullPath := "/srv/chub-archive"
//RootDir := "/srv/chub-archive"
//
//fmt.Println(strings.HasPrefix(fullPath, RootDir))
////fmt.Println(fullPath != RootDir)
//
//return

cliArgs := parseArgs()
if cliArgs.help {
flag.Usage()
@@ -10,7 +10,7 @@ import (
"strings"
)

func Search(query string, exclude []string) (*esapi.Response, error) {
func SimpleQuery(query string, exclude []string) (*esapi.Response, error) {
var excludeQuery string
if len(exclude) > 0 {
var excludeConditions []string
todo.txt
@@ -1,13 +1,10 @@
- Track active crawls and list them on the admin page
- Limit to one on-demand crawl per path. Don't start another if one is already running. See HandleFileNotFound()
- Add config value to limit the number of on-demand crawls
- Add config value to limit the number of concurrent crawls, other crawls get queued.
- add an admin endpoint to fetch the last n modified files.
- fix /api/file/download when an item is in the cache but does not exist on the disk
- Is using scroll for the Elastic query really the best way to do a real-time query?


Later:
- Add a wildcard option to restricted_download_paths to block all sub-directories
- Add a dict to each restricted_download_paths item to specify how many levels recursive the block should be applied
- add a "last modified" to "sort" https://chub-archive.evulid.cc/api/file/list?path=/chub.ai/characters&page=1&limit=50&sort=folders
- add an admin endpoint to fetch the last n modified files.