track running crawls and add an admin page, use basic auth for admin, reject crawl if already running for a path, limit max directory crawlers, fix some issues

This commit is contained in:
Cyberes 2023-12-11 18:05:59 -07:00
parent a96708f6cf
commit 7078712bc3
21 changed files with 253 additions and 112 deletions

View File

@ -39,4 +39,4 @@ If you're using initial cache and have tons of files to scan you'll need at leas
so minutes for it to traverse the directory structure. CrazyFS is heavily threaded, so you'll want at least an 8-core
machine.
CrazyFS works great with an HTTP cache in front of it.
You'll need something like Nginx if you want SSL or HTTP. Also, CrazyFS works great with an HTTP cache in front of it.
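
For illustration only (not part of this commit): the reverse proxy the README recommends is what terminates TLS in front of CrazyFS. A minimal Go sketch of that role, where the listen address, the upstream address, and the certificate paths are all assumptions:

package main

import (
	"log"
	"net/http"
	"net/http/httputil"
	"net/url"
)

func main() {
	// Assumed upstream: a CrazyFS instance serving plain HTTP on this address.
	upstream, err := url.Parse("http://127.0.0.1:8080")
	if err != nil {
		log.Fatal(err)
	}
	proxy := httputil.NewSingleHostReverseProxy(upstream)

	// Terminate TLS here; the cert and key paths are placeholders.
	log.Fatal(http.ListenAndServeTLS(":443", "cert.pem", "key.pem", proxy))
}

In practice Nginx (or similar) also provides the HTTP caching mentioned above, which this sketch does not.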

View File

@ -1,8 +0,0 @@
The code you've posted is already quite efficient, but there are a few things you could consider to improve its performance:
1. **Use a more efficient file watcher:** The `github.com/radovskyb/watcher` package uses polling to detect file changes, which can be inefficient for large directories. If you're on Linux, consider using a package like `github.com/fsnotify/fsnotify` which uses inotify, a Linux kernel subsystem that provides more efficient file change notifications.
2. **Reduce the number of goroutines:** Each time a file change event is received, a new goroutine is created to handle it. This could potentially create a large number of goroutines if many file changes are happening at once. Consider using a worker pool pattern to limit the number of concurrent goroutines.
3. **Optimize your cache:** The LRU cache you're using is thread-safe, but it uses a mutex to achieve this. If you have a lot of contention (i.e., many goroutines trying to access the cache at once), this could slow things down. Consider using a sharded cache, which reduces contention by dividing the cache into several smaller caches, each with its own lock.
4. **Avoid unnecessary work:** If a file is created and then immediately modified, your code will crawl it twice. Consider adding a delay before crawling a file, and if another event for the same file is received during this delay, only crawl it once.
5. **Optimize your logging:** Writing to the log can be slow, especially if it's writing to a file or over the network. Consider using a buffered logger, which can improve performance by batching log messages together.
6. **Use a profiler:** The best way to find out where your code is spending its time is to use a profiler. The `net/http/pprof` package provides a simple way to add profiling endpoints to your application, which you can then view with the `go tool pprof` command.
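
For reference, point 6 takes only a few lines to wire up; a minimal sketch (the listen address is arbitrary):

package main

import (
	"log"
	"net/http"
	_ "net/http/pprof" // registers the /debug/pprof/ handlers on the default mux
)

func main() {
	// Then inspect with e.g.: go tool pprof http://localhost:6060/debug/pprof/profile
	log.Println(http.ListenAndServe("localhost:6060", nil))
}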

View File

@ -10,12 +10,19 @@ import (
"time"
)
func NewItem(fullPath string, info os.FileInfo) *Item {
if !strings.HasPrefix(fullPath, config.GetConfig().RootDir) {
// Retard check
panic(fmt.Sprintf("NewItem was not passed an absolute path. The path must start with the RootDir: %s", fullPath))
}
func PathOutsideRoot(fullPath string) bool {
return !strings.HasPrefix(fullPath, config.GetConfig().RootDir)
}
func RetardCheck(fullPath string) {
// Make sure we never do anything outside of the root dir.
if PathOutsideRoot(fullPath) {
panic(fmt.Sprintf("NewItem was not passed an absolute path. The path must start with the RootDir (%s). Failing path: %s", config.GetConfig().RootDir, fullPath))
}
}
func NewItem(fullPath string, info os.FileInfo) *Item {
RetardCheck(fullPath)
if config.GetConfig().CachePrintNew {
log.Debugf("CACHE - new: %s", fullPath)
}

View File

@ -60,7 +60,6 @@ func NewResponseItem(cacheItem *CacheItem.Item, sharedCache *lru.Cache[string, *
dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)
item, err := dc.CrawlNoRecursion(filepath.Join(config.GetConfig().RootDir, child))
if err != nil {
log.Errorf("NewResponseItem - CrawlNoRecursion - %s", err)
continue // skip this child

View File

@ -6,34 +6,47 @@ import (
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config"
"crazyfs/elastic"
"crypto/sha256"
"crypto/subtle"
"encoding/json"
lru "github.com/hashicorp/golang-lru/v2"
"net/http"
)
func AdminCacheInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
auth := r.URL.Query().Get("auth")
if auth == "" || auth != config.GetConfig().HttpAdminKey {
helpers.Return403Msg("access denied", w)
return
}
username, password, ok := r.BasicAuth()
if ok {
usernameHash := sha256.Sum256([]byte(username))
passwordHash := sha256.Sum256([]byte(password))
expectedUsernameHash := sha256.Sum256([]byte("admin"))
expectedPasswordHash := sha256.Sum256([]byte(config.GetConfig().HttpAdminKey))
usernameMatch := subtle.ConstantTimeCompare(usernameHash[:], expectedUsernameHash[:]) == 1
passwordMatch := subtle.ConstantTimeCompare(passwordHash[:], expectedPasswordHash[:]) == 1
cacheLen := sharedCache.Len()
if !usernameMatch || !passwordMatch {
helpers.Return401Msg("unauthorized", w)
return
} else {
cacheLen := sharedCache.Len()
response := map[string]interface{}{
"cache_size": cacheLen,
"cache_max": config.GetConfig().CacheSize,
"crawls_running": DirectoryCrawler.GetGlobalActiveCrawls(),
"active_workers": DirectoryCrawler.ActiveWorkers,
"busy_workers": DirectoryCrawler.ActiveWalks,
"new_sync_running": elastic.ElasticRefreshSyncRunning,
"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
}
w.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w).Encode(response)
if err != nil {
log.Errorf("AdminCacheInfo - Failed to serialize JSON: %s", err)
return
response := map[string]interface{}{
"cache_size": cacheLen,
"cache_max": config.GetConfig().CacheSize,
"crawls_running": DirectoryCrawler.GetTotalActiveCrawls(),
"busy_workers": DirectoryCrawler.BusyWorkers,
"new_sync_running": elastic.ElasticRefreshSyncRunning,
"refresh_sync_running": elastic.ElasticRefreshSyncRunning,
}
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w).Encode(response)
if err != nil {
log.Errorf("AdminCacheInfo - Failed to serialize JSON: %s", err)
return
}
return
}
}
w.Header().Set("WWW-Authenticate", `Basic realm="restricted", charset="UTF-8"`)
helpers.Return401Msg("unauthorized", w)
}

View File

@ -0,0 +1,46 @@
package api
import (
"crazyfs/CacheItem"
"crazyfs/api/helpers"
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config"
"crypto/sha256"
"crypto/subtle"
"encoding/json"
lru "github.com/hashicorp/golang-lru/v2"
"net/http"
)
func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
username, password, ok := r.BasicAuth()
if ok {
usernameHash := sha256.Sum256([]byte(username))
passwordHash := sha256.Sum256([]byte(password))
expectedUsernameHash := sha256.Sum256([]byte("admin"))
expectedPasswordHash := sha256.Sum256([]byte(config.GetConfig().HttpAdminKey))
usernameMatch := subtle.ConstantTimeCompare(usernameHash[:], expectedUsernameHash[:]) == 1
passwordMatch := subtle.ConstantTimeCompare(passwordHash[:], expectedPasswordHash[:]) == 1
if !usernameMatch || !passwordMatch {
helpers.Return401Msg("unauthorized", w)
return
} else {
response := map[string]interface{}{
"active": DirectoryCrawler.GetActiveCrawls(),
"finished": DirectoryCrawler.GetFinishedCrawls(),
}
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
err := json.NewEncoder(w).Encode(response)
if err != nil {
log.Errorf("AdminCrawlsInfo - Failed to serialize JSON: %s", err)
helpers.Return500Msg(w)
return
}
return
}
}
w.Header().Set("WWW-Authenticate", `Basic realm="restricted", charset="UTF-8"`)
helpers.Return401Msg("unauthorized", w)
}
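
A client hits the new endpoint with standard HTTP Basic Auth, sending "admin" as the username and the configured HttpAdminKey as the password. A minimal sketch, assuming the server listens on localhost:8080 and the key is available in an HTTP_ADMIN_KEY environment variable (neither detail comes from this diff):

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
)

func main() {
	req, err := http.NewRequest("GET", "http://localhost:8080/api/admin/crawls/info", nil)
	if err != nil {
		panic(err)
	}
	// The handler compares against the username "admin" and the HttpAdminKey password.
	req.SetBasicAuth("admin", os.Getenv("HTTP_ADMIN_KEY"))

	resp, err := http.DefaultClient.Do(req)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.StatusCode, string(body))
}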

View File

@ -7,6 +7,7 @@ import (
"crazyfs/config"
"crazyfs/file"
"encoding/json"
"fmt"
lru "github.com/hashicorp/golang-lru/v2"
"net/http"
)
@ -27,7 +28,7 @@ func AdminReCache(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache
auth := requestBody["auth"]
if auth == "" || auth != config.GetConfig().HttpAdminKey {
helpers.Return403Msg("access denied", w)
helpers.Return401Msg("unauthorized", w)
return
}
@ -42,16 +43,21 @@ func AdminReCache(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache
return
}
//relPath := cache.StripRootDir(fullPath, cfg.RootDir)
// Check and re-cache the directory
cache.Recache(fullPath, sharedCache)
response := map[string]interface{}{
"message": "Re-cache triggered for directory: " + fullPath,
"error": nil,
}
log.Infof("Admin triggered recache for %s", fullPath)
// Check and re-cache the directory
err = cache.Recache(fullPath, sharedCache)
if err != nil {
response["message"] = fmt.Sprintf("recache failed")
response["error"] = err.Error()
w.WriteHeader(http.StatusConflict)
log.Errorf("Admin triggered recache for %s - %s", fullPath, err)
} else {
log.Infof("Admin triggered recache for %s", fullPath)
}
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
err = json.NewEncoder(w).Encode(response)
if err != nil {

View File

@ -16,7 +16,7 @@ func HealthCheck(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[
response := map[string]interface{}{}
response["scan_running"] = DirectoryCrawler.GetGlobalActiveCrawls() > 0
response["scan_running"] = DirectoryCrawler.GetTotalActiveCrawls() > 0
response["initial_scan_running"] = cache.InitialCrawlInProgress
w.Header().Set("Content-Type", "application/json")

View File

@ -71,7 +71,7 @@ func SearchFile(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[s
if config.GetConfig().ElasticsearchEnable {
// Perform the Elasticsearch query
resp, err := elastic.Search(queryString, excludeElements)
resp, err := elastic.SimpleQuery(queryString, excludeElements)
if err != nil {
log.Errorf(`SEARCH - Failed to perform Elasticsearch query "%s" - %s`, queryString, err)
helpers.Return500Msg(w)

View File

@ -14,7 +14,7 @@ import (
func ClientHealthCheck(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
response := map[string]interface{}{}
response["scan_running"] = DirectoryCrawler.GetGlobalActiveCrawls() > 0
response["scan_running"] = DirectoryCrawler.GetTotalActiveCrawls() > 0
response["initial_scan_running"] = cache.InitialCrawlInProgress
w.Header().Set("Content-Type", "application/json")

View File

@ -1,15 +1,11 @@
package helpers
import (
"crazyfs/logging"
"encoding/json"
"net/http"
)
func WriteErrorResponse(jsonCode, httpCode int, msg string, w http.ResponseWriter) {
log := logging.GetLogger()
log.Warnln(msg)
w.Header().Set("Cache-Control", "no-store")
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(httpCode)
@ -25,7 +21,6 @@ func WriteErrorResponse(jsonCode, httpCode int, msg string, w http.ResponseWrite
}
func ReturnFake404Msg(msg string, w http.ResponseWriter) {
log.Fatalf(msg)
WriteErrorResponse(404, http.StatusBadRequest, msg, w)
}
@ -44,3 +39,7 @@ func Return500Msg(w http.ResponseWriter) {
func Return403Msg(msg string, w http.ResponseWriter) {
WriteErrorResponse(http.StatusForbidden, http.StatusForbidden, msg, w)
}
func Return401Msg(msg string, w http.ResponseWriter) {
WriteErrorResponse(http.StatusUnauthorized, http.StatusUnauthorized, msg, w)
}

View File

@ -47,7 +47,7 @@ func HandleFileNotFound(relPath string, fullPath string, sharedCache *lru.Cache[
// Start a blocking non-recursive crawl.
item, err := dc.CrawlNoRecursion(fullPath)
if os.IsNotExist(err) || item == nil {
if err == nil && (os.IsNotExist(err) || item == nil) {
ReturnFake404Msg("path not found", w)
return nil
} else if err != nil {

View File

@ -65,6 +65,12 @@ var routes = Routes{
"/api/admin/cache/recache",
wrongMethod("POST", AdminReCache),
},
Route{
"Crawls Info",
"GET",
"/api/admin/crawls/info",
AdminCrawlsInfo,
},
Route{
"Server Health",
"GET",

View File

@ -8,10 +8,27 @@ import (
"path/filepath"
"strings"
"sync"
"sync/atomic"
"time"
)
var globalActiveCrawls int32
const maxFinishedCrawls = 100
var activeCrawls = make(map[string]*ActiveCrawl)
var finishedCrawls = make([]FinishedCrawl, 0, maxFinishedCrawls)
var activeCrawlsMutex = &sync.Mutex{}
var finishedCrawlsMutex = &sync.Mutex{}
type ActiveCrawl struct {
Path string `json:"path"`
Start int64 `json:"start"`
Elapsed int64 `json:"elapsed"`
}
type FinishedCrawl struct {
Path string `json:"path"`
Start int64 `json:"start"`
Elapsed int64 `json:"elapsed"`
}
type DirectoryCrawler struct {
cache *lru.Cache[string, *CacheItem.Item]
@ -76,14 +93,49 @@ func isSubpath(path, subpath string) bool {
return true
}
func (dc *DirectoryCrawler) incrementGlobalActiveCrawls() {
atomic.AddInt32(&globalActiveCrawls, 1)
func (dc *DirectoryCrawler) startCrawl(path string) bool {
// Hold the lock across the check and the insert so two concurrent crawls
// of the same path cannot both pass the check.
activeCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
if _, active := activeCrawls[path]; active {
return false
}
activeCrawls[path] = &ActiveCrawl{Path: path, Start: time.Now().Unix(), Elapsed: int64(0)}
return true
}
func (dc *DirectoryCrawler) decrementGlobalActiveCrawls() {
atomic.AddInt32(&globalActiveCrawls, -1)
func (dc *DirectoryCrawler) endCrawl(path string) {
activeCrawlsMutex.Lock()
finishedCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
defer finishedCrawlsMutex.Unlock()
if len(finishedCrawls) >= maxFinishedCrawls {
finishedCrawls = finishedCrawls[1:]
}
finishedCrawls = append(finishedCrawls, FinishedCrawl{Path: path, Start: activeCrawls[path].Start, Elapsed: int64(time.Since(time.Unix(activeCrawls[path].Start, 0)).Seconds())})
delete(activeCrawls, path)
}
func GetGlobalActiveCrawls() int32 {
return atomic.LoadInt32(&globalActiveCrawls)
func (dc *DirectoryCrawler) IsCrawlActive(path string) bool {
activeCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
_, active := activeCrawls[path]
return active
}
func GetActiveCrawls() map[string]*ActiveCrawl {
activeCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
// Return a copy so callers (e.g. the admin JSON encoder) never read the map
// while another goroutine starts or ends a crawl.
crawls := make(map[string]*ActiveCrawl, len(activeCrawls))
for path := range activeCrawls {
a := activeCrawls[path]
a.Elapsed = int64(time.Since(time.Unix(a.Start, 0)).Seconds())
crawls[path] = &ActiveCrawl{Path: a.Path, Start: a.Start, Elapsed: a.Elapsed}
}
return crawls
}
func GetFinishedCrawls() []FinishedCrawl {
finishedCrawlsMutex.Lock()
defer finishedCrawlsMutex.Unlock()
return finishedCrawls
}
func GetTotalActiveCrawls() int {
activeCrawlsMutex.Lock()
defer activeCrawlsMutex.Unlock()
return len(activeCrawls)
}
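
For reference, the JSON tags above define what the admin crawls endpoint returns; marshalling one entry (the values here are made up) gives the keys path, start, and elapsed:

package main

import (
	"encoding/json"
	"fmt"
)

// Mirror of the ActiveCrawl struct above, repeated only to keep this sketch self-contained.
type ActiveCrawl struct {
	Path    string `json:"path"`
	Start   int64  `json:"start"`
	Elapsed int64  `json:"elapsed"`
}

func main() {
	a := ActiveCrawl{Path: "/srv/example", Start: 1702346759, Elapsed: 12}
	b, _ := json.Marshal(a)
	fmt.Println(string(b)) // {"path":"/srv/example","start":1702346759,"elapsed":12}
}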

View File

@ -1,6 +1,7 @@
package DirectoryCrawler
import (
"crazyfs/config"
"errors"
"fmt"
"os"
@ -9,16 +10,11 @@ import (
"sync/atomic"
)
var JobQueueSize int
// WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally
var WorkerPool chan struct{}
// ActiveWorkers is an atomic counter for the number of active workers
var ActiveWorkers int32
// ActiveWalks is an atomic counter for the number of active Walk crawls
var ActiveWalks int32
// BusyWorkers is an atomic counter for the number of active workers
var BusyWorkers int32
// ErrNotDir indicates that the path, which is being passed
// to a walker function, does not point to a directory
@ -109,11 +105,10 @@ func (w *Walker) processPath(relpath string) error {
continue
}
w.walkFunc(filepath.Join(w.root, subpath), info, err)
//if err == filepath.SkipDir {
// return nil
//}
err = w.walkFunc(filepath.Join(w.root, subpath), info, err)
if errors.Is(err, filepath.SkipDir) {
return nil
}
if info.Mode().IsDir() {
w.addJob(subpath)
@ -138,36 +133,31 @@ func (w *Walker) addJob(path string) {
}
}
// worker processes all the jobs
// until the jobs channel is explicitly closed
// worker processes all the jobs until the jobs channel is explicitly closed
func (w *Walker) worker() {
for path := range w.jobs {
WorkerPool <- struct{}{} // acquire a worker
atomic.AddInt32(&ActiveWorkers, 1) // increment the number of active workers
WorkerPool <- struct{}{} // acquire a worker
atomic.AddInt32(&BusyWorkers, 1) // increment the number of active workers
err := w.processPath(path)
if err != nil {
log.Warnf("worker - %s", err)
}
<-WorkerPool // release the worker when done
atomic.AddInt32(&ActiveWorkers, -1) // decrement the number of active workers
<-WorkerPool // release the worker when done
atomic.AddInt32(&BusyWorkers, -1) // decrement the number of active workers
}
}
// Walk recursively descends into subdirectories,
// calling walkFn for each file or directory
// Walk recursively descends into subdirectories, calling walkFn for each file or directory
// in the tree, including the root directory.
func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
atomic.AddInt32(&ActiveWalks, 1) // increment the number of active Walk crawls
defer atomic.AddInt32(&ActiveWalks, -1) // decrement the number of active Walk crawls when done
w.jobs = make(chan string, JobQueueSize)
w.jobs = make(chan string, config.GetConfig().DirectoryCrawlers)
w.walkFunc = walkFn
info, err := w.lstat(relpath)
err = w.walkFunc(filepath.Join(w.root, relpath), info, err)
if err == filepath.SkipDir {
if errors.Is(err, filepath.SkipDir) {
return nil
}
if err != nil {
@ -182,8 +172,8 @@ func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
return ErrNotDir
}
// spawn workers
for n := 1; n <= JobQueueSize; n++ {
// Spawn workers
for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
go w.worker()
}
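
The WorkerPool channel used above is a counting semaphore: a send acquires a worker slot, a receive releases it, so at most cap(WorkerPool) workers do filesystem work at once. A stripped-down, self-contained sketch of the same pattern (the limit of 4 and the job loop are illustrative, not taken from walk.go):

package main

import (
	"fmt"
	"sync"
)

func main() {
	const limit = 4                    // maximum concurrent workers
	pool := make(chan struct{}, limit) // buffered channel acting as a semaphore
	var wg sync.WaitGroup

	for i := 0; i < 20; i++ {
		wg.Add(1)
		pool <- struct{}{} // blocks once `limit` workers are busy
		go func(n int) {
			defer wg.Done()
			defer func() { <-pool }() // release the slot when done
			fmt.Println("processing job", n)
		}(i)
	}
	wg.Wait()
}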

View File

@ -4,34 +4,45 @@ import (
"crazyfs/CacheItem"
"crazyfs/config"
"crazyfs/file"
"errors"
"fmt"
"os"
"path/filepath"
)
func (dc *DirectoryCrawler) walkRecursiveFunc(path string, info os.FileInfo, err error) error {
processErr := dc.processPath(path, info)
func (dc *DirectoryCrawler) walkRecursiveFunc(fullPath string, info os.FileInfo, err error) error {
CacheItem.RetardCheck(fullPath)
processErr := dc.processPath(fullPath, info)
if processErr != nil {
log.Errorf("CRAWLER - walkRecursiveFunc() failed - %s - %s", processErr, path)
log.Errorf("CRAWLER - walkRecursiveFunc() failed - %s - %s", processErr, fullPath)
return processErr
}
return nil
}
func (dc *DirectoryCrawler) walkNonRecursiveFunc(path string, dir os.DirEntry, err error) error {
func (dc *DirectoryCrawler) walkNonRecursiveFunc(fullPath string, dir os.DirEntry, err error) error {
CacheItem.RetardCheck(fullPath)
info, infoErr := dir.Info()
if infoErr != nil {
log.Errorf("CRAWLER - walkNonRecursiveFunc() - get info failed - %s - %s", infoErr, path)
log.Errorf("CRAWLER - walkNonRecursiveFunc() - get info failed - %s - %s", infoErr, fullPath)
return infoErr
}
processErr := dc.processPath(path, info)
processErr := dc.processPath(fullPath, info)
if processErr != nil {
log.Errorf("CRAWLER - walkNonRecursiveFunc() failed - %s - %s", processErr, path)
log.Errorf("CRAWLER - walkNonRecursiveFunc() failed - %s - %s", processErr, fullPath)
return processErr
}
return nil
}
func (dc *DirectoryCrawler) Crawl(fullPath string) error {
CacheItem.RetardCheck(fullPath)
readyToStart := dc.startCrawl(fullPath)
if !readyToStart {
return errors.New(fmt.Sprintf(`rejecting crawl, already in progress for "%s"`, fullPath))
}
defer dc.endCrawl(fullPath)
info, err := os.Lstat(fullPath)
if os.IsNotExist(err) {
// If the path doesn't exist, just silently exit
@ -84,6 +95,13 @@ func (dc *DirectoryCrawler) Crawl(fullPath string) error {
// CrawlNoRecursion this function crawls a file or directory and does not recurse into any subdirectories. Also returns the result of the crawl.
func (dc *DirectoryCrawler) CrawlNoRecursion(fullPath string) (*CacheItem.Item, error) {
CacheItem.RetardCheck(fullPath)
readyToStart := dc.startCrawl(fullPath)
if !readyToStart {
return nil, errors.New(fmt.Sprintf(`rejecting crawl, already in progress for "%s"`, fullPath))
}
defer dc.endCrawl(fullPath)
info, err := os.Lstat(fullPath)
if os.IsNotExist(err) {
// If the path doesn't exist, just silently exit
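
With the startCrawl/endCrawl guard above, a second crawl of a path that is already being crawled fails fast instead of duplicating work. An illustrative call sequence, not taken from this commit; it assumes the crazyfs config has already been loaded (so RootDir is set), and the cache size and path are placeholders:

package main

import (
	"log"

	"crazyfs/CacheItem"
	"crazyfs/cache/DirectoryCrawler"

	lru "github.com/hashicorp/golang-lru/v2"
)

func main() {
	sharedCache, err := lru.New[string, *CacheItem.Item](1024) // size is illustrative
	if err != nil {
		log.Fatal(err)
	}
	dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)

	// First crawl of the path registers it as active and proceeds.
	go func() {
		if err := dc.Crawl("/srv/example/dir"); err != nil {
			log.Printf("crawl failed: %s", err)
		}
	}()

	// If the first crawl is still running, a second crawl of the same path is
	// rejected with: rejecting crawl, already in progress for "/srv/example/dir"
	if err := dc.Crawl("/srv/example/dir"); err != nil {
		log.Printf("%s", err)
	}
}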

View File

@ -44,10 +44,10 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
log.Infoln("CRAWLER - Starting a crawl...")
start := time.Now()
err := dc.Crawl(config.GetConfig().RootDir)
duration := time.Since(start).Round(time.Second)
if err != nil {
log.Warnf("CRAWLER - Crawl failed: %s", err)
} else {
duration := time.Since(start).Round(time.Second)
log.Infof("CRAWLER - Crawl completed in %s", duration)
log.Debugf("%d/%d items in the cache.", config.GetConfig().CacheSize, len(sharedCache.Keys()))
}
@ -59,8 +59,8 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) {
defer ticker.Stop()
for range ticker.C {
activeWorkers := int(DirectoryCrawler.ActiveWorkers)
busyWorkers := int(DirectoryCrawler.ActiveWalks)
logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, busyWorkers)
activeWorkers := int(DirectoryCrawler.BusyWorkers)
runningCrawls := DirectoryCrawler.GetTotalActiveCrawls()
logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, runningCrawls)
}
}

24
src/cache/recache.go vendored
View File

@ -5,7 +5,7 @@ import (
"crazyfs/cache/DirectoryCrawler"
"crazyfs/config"
"crazyfs/file"
"crazyfs/logging"
"errors"
lru "github.com/hashicorp/golang-lru/v2"
"os"
"path/filepath"
@ -21,7 +21,6 @@ func InitRecacheSemaphore(limit int) {
func CheckAndRecache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
item, found := sharedCache.Get(path)
if found && time.Now().UnixNano()/int64(time.Millisecond)-item.Cached > int64(config.GetConfig().CacheTime)*60*1000 {
log := logging.GetLogger()
log.Debugf("Re-caching: %s", path)
sem <- struct{}{} // acquire a token
go func() {
@ -30,13 +29,18 @@ func CheckAndRecache(path string, sharedCache *lru.Cache[string, *CacheItem.Item
err := dc.Crawl(path)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
}
}()
}
}
func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
log := logging.GetLogger()
func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) error {
dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)
if dc.IsCrawlActive(path) {
return errors.New("rejecting crawl, already in progress for this path")
}
log.Debugf("Re-caching: %s", path)
start := time.Now()
sem <- struct{}{} // acquire a token
@ -46,6 +50,7 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
err := dc.Crawl(path)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
}
// Get the parent directory from the cache
@ -53,7 +58,7 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
parentDirRel := file.StripRootDir(parentDir)
parentItem, found := sharedCache.Get(parentDirRel)
if found {
// Remove the old sub-directory from the parent directory's Children field
// Remove the old subdirectory from the parent directory's Children field
for i, child := range parentItem.Children {
if child == path {
parentItem.Children = append(parentItem.Children[:i], parentItem.Children[i+1:]...)
@ -61,10 +66,11 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
}
}
// Update the parent directory's Children field to include the new sub-directory
// Update the parent directory's Children field to include the new subdirectory
info, err := os.Stat(path)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
} else {
newItem := CacheItem.NewItem(path, info)
// Create a new slice that contains all items from the Children field except the old directory
@ -81,15 +87,17 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
// Update the parent directory in the cache
sharedCache.Add(parentDir, parentItem)
}
} else {
} else if !CacheItem.PathOutsideRoot(parentDir) {
// If the parent directory isn't in the cache, crawl it
log.Infof("RECACHE - crawling parent directory since it isn't in the cache yet: %s", parentDir)
err := dc.Crawl(parentDir)
_, err := dc.CrawlNoRecursion(parentDir)
if err != nil {
log.Errorf("RECACHE ERROR: %s", err.Error())
return
}
}
duration := time.Since(start).Round(time.Second)
log.Infof("RECACHE - completed in %s - %s", duration, path)
}()
return nil
}

View File

@ -39,6 +39,14 @@ type cliConfig struct {
// TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch
func main() {
//fullPath := "/srv/chub-archive"
//RootDir := "/srv/chub-archive"
//
//fmt.Println(strings.HasPrefix(fullPath, RootDir))
////fmt.Println(fullPath != RootDir)
//
//return
cliArgs := parseArgs()
if cliArgs.help {
flag.Usage()

View File

@ -10,7 +10,7 @@ import (
"strings"
)
func Search(query string, exclude []string) (*esapi.Response, error) {
func SimpleQuery(query string, exclude []string) (*esapi.Response, error) {
var excludeQuery string
if len(exclude) > 0 {
var excludeConditions []string

View File

@ -1,13 +1,10 @@
- Track active crawls and list them on the admin page
- Limit to one on-demand crawl per path. Don't start another if one is already running. See HandleFileNotFound()
- Add config value to limit the number of on-demand crawls
- Add a config value to limit the number of concurrent crawls; other crawls get queued.
- add an admin endpoint to fetch the last n modified files.
- fix /api/file/download when an item is in the cache but does not exist on the disk
- Is using scroll for the Elastic query really the best way to do a real-time query?
Later:
- Add a wildcard option to restricted_download_paths to block all sub-directories
- Add a dict to each restricted_download_paths item to specify how many levels recursive the block should be applied
- add a "last modified" to "sort" https://chub-archive.evulid.cc/api/file/list?path=/chub.ai/characters&page=1&limit=50&sort=folders
- add an admin endpoint to fetch the last n modified files.