track running crawls and add an admin page, use basic auth for admin, reject crawl if already running for a path, limit max directory crawlers, fix some issues
parent a96708f6cf
commit 7078712bc3
@@ -39,4 +39,4 @@ If you're using initial cache and have tons of files to scan you'll need at leas
 so minutes for it to traverse the directory structure. CrazyFS is heavily threaded, so you'll want at least an 8-core
 machine.
 
-CrazyFS works great with an HTTP cache in front of it.
+You'll need something like Nginx if you want SSL or HTTP. Also, CrazyFS works great with an HTTP cache in front of it.
@@ -1,8 +0,0 @@
-The code you've posted is already quite efficient, but there are a few things you could consider to improve its performance:
-
-1. **Use a more efficient file watcher:** The `github.com/radovskyb/watcher` package uses polling to detect file changes, which can be inefficient for large directories. If you're on Linux, consider using a package like `github.com/fsnotify/fsnotify` which uses inotify, a Linux kernel subsystem that provides more efficient file change notifications.
-2. **Reduce the number of goroutines:** Each time a file change event is received, a new goroutine is created to handle it. This could potentially create a large number of goroutines if many file changes are happening at once. Consider using a worker pool pattern to limit the number of concurrent goroutines.
-3. **Optimize your cache:** The LRU cache you're using is thread-safe, but it uses a mutex to achieve this. If you have a lot of contention (i.e., many goroutines trying to access the cache at once), this could slow things down. Consider using a sharded cache, which reduces contention by dividing the cache into several smaller caches, each with its own lock.
-4. **Avoid unnecessary work:** If a file is created and then immediately modified, your code will crawl it twice. Consider adding a delay before crawling a file, and if another event for the same file is received during this delay, only crawl it once.
-5. **Optimize your logging:** Writing to the log can be slow, especially if it's writing to a file or over the network. Consider using a buffered logger, which can improve performance by batching log messages together.
-6. **Use a profiler:** The best way to find out where your code is spending its time is to use a profiler. The `net/http/pprof` package provides a simple way to add profiling endpoints to your application, which you can then view with the `go tool pprof` command.
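The "worker pool" idea from item 2 of the deleted notes is the same pattern the crawler's WorkerPool/BusyWorkers code later in this commit uses: a buffered channel acts as a semaphore so only a fixed number of goroutines do work at once. A minimal, self-contained sketch of that pattern (the pool size and the names here are illustrative, not the project's actual configuration):

package main

import (
    "fmt"
    "sync"
    "sync/atomic"
)

func main() {
    const maxWorkers = 4                    // illustrative limit, akin to a max-directory-crawlers setting
    pool := make(chan struct{}, maxWorkers) // buffered channel used as a semaphore
    var busy int32                          // counterpart of a BusyWorkers-style counter
    var wg sync.WaitGroup

    for job := 0; job < 20; job++ {
        wg.Add(1)
        go func(job int) {
            defer wg.Done()
            pool <- struct{}{} // acquire a slot; blocks once maxWorkers are busy
            atomic.AddInt32(&busy, 1)
            fmt.Printf("job %d running, busy=%d\n", job, atomic.LoadInt32(&busy))
            atomic.AddInt32(&busy, -1)
            <-pool // release the slot
        }(job)
    }
    wg.Wait()
}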
@@ -10,12 +10,19 @@ import (
     "time"
 )
 
-func NewItem(fullPath string, info os.FileInfo) *Item {
-    if !strings.HasPrefix(fullPath, config.GetConfig().RootDir) {
-        // Retard check
-        panic(fmt.Sprintf("NewItem was not passed an absolute path. The path must start with the RootDir: %s", fullPath))
+func PathOutsideRoot(fullPath string) bool {
+    return !strings.HasPrefix(fullPath, config.GetConfig().RootDir)
 }
 
+func RetardCheck(fullPath string) {
+    // Make sure we never do anything outside of the root dir.
+    if PathOutsideRoot(fullPath) {
+        panic(fmt.Sprintf("NewItem was not passed an absolute path. The path must start with the RootDir (%s). Failing path: %s", config.GetConfig().RootDir, fullPath))
+    }
+}
+
+func NewItem(fullPath string, info os.FileInfo) *Item {
+    RetardCheck(fullPath)
     if config.GetConfig().CachePrintNew {
         log.Debugf("CACHE - new: %s", fullPath)
     }
@@ -60,7 +60,6 @@ func NewResponseItem(cacheItem *CacheItem.Item, sharedCache *lru.Cache[string, *
 
             dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)
             item, err := dc.CrawlNoRecursion(filepath.Join(config.GetConfig().RootDir, child))
-
             if err != nil {
                 log.Errorf("NewResponseItem - CrawlNoRecursion - %s", err)
                 continue // skip this child
@@ -6,34 +6,47 @@ import (
     "crazyfs/cache/DirectoryCrawler"
     "crazyfs/config"
     "crazyfs/elastic"
+    "crypto/sha256"
+    "crypto/subtle"
     "encoding/json"
     lru "github.com/hashicorp/golang-lru/v2"
     "net/http"
 )
 
 func AdminCacheInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
-    auth := r.URL.Query().Get("auth")
-    if auth == "" || auth != config.GetConfig().HttpAdminKey {
-        helpers.Return403Msg("access denied", w)
-        return
-    }
+    username, password, ok := r.BasicAuth()
+    if ok {
+        usernameHash := sha256.Sum256([]byte(username))
+        passwordHash := sha256.Sum256([]byte(password))
+        expectedUsernameHash := sha256.Sum256([]byte("admin"))
+        expectedPasswordHash := sha256.Sum256([]byte(config.GetConfig().HttpAdminKey))
+        usernameMatch := subtle.ConstantTimeCompare(usernameHash[:], expectedUsernameHash[:]) == 1
+        passwordMatch := subtle.ConstantTimeCompare(passwordHash[:], expectedPasswordHash[:]) == 1
 
+        if !usernameMatch || !passwordMatch {
+            helpers.Return401Msg("unauthorized", w)
+            return
+        } else {
             cacheLen := sharedCache.Len()
 
             response := map[string]interface{}{
                 "cache_size": cacheLen,
                 "cache_max":  config.GetConfig().CacheSize,
-                "crawls_running": DirectoryCrawler.GetGlobalActiveCrawls(),
-                "active_workers": DirectoryCrawler.ActiveWorkers,
-                "busy_workers":   DirectoryCrawler.ActiveWalks,
+                "crawls_running": DirectoryCrawler.GetTotalActiveCrawls(),
+                "busy_workers":   DirectoryCrawler.BusyWorkers,
                 "new_sync_running":     elastic.ElasticRefreshSyncRunning,
                 "refresh_sync_running": elastic.ElasticRefreshSyncRunning,
             }
+            w.Header().Set("Cache-Control", "no-store")
             w.Header().Set("Content-Type", "application/json")
             err := json.NewEncoder(w).Encode(response)
             if err != nil {
                 log.Errorf("AdminCacheInfo - Failed to serialize JSON: %s", err)
                 return
             }
+            return
+        }
+    }
+    w.Header().Set("WWW-Authenticate", `Basic realm="restricted", charset="UTF-8"`)
+    helpers.Return401Msg("unauthorized", w)
 }
@@ -0,0 +1,46 @@
+package api
+
+import (
+    "crazyfs/CacheItem"
+    "crazyfs/api/helpers"
+    "crazyfs/cache/DirectoryCrawler"
+    "crazyfs/config"
+    "crypto/sha256"
+    "crypto/subtle"
+    "encoding/json"
+    lru "github.com/hashicorp/golang-lru/v2"
+    "net/http"
+)
+
+func AdminCrawlsInfo(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
+    username, password, ok := r.BasicAuth()
+    if ok {
+        usernameHash := sha256.Sum256([]byte(username))
+        passwordHash := sha256.Sum256([]byte(password))
+        expectedUsernameHash := sha256.Sum256([]byte("admin"))
+        expectedPasswordHash := sha256.Sum256([]byte(config.GetConfig().HttpAdminKey))
+        usernameMatch := subtle.ConstantTimeCompare(usernameHash[:], expectedUsernameHash[:]) == 1
+        passwordMatch := subtle.ConstantTimeCompare(passwordHash[:], expectedPasswordHash[:]) == 1
+
+        if !usernameMatch || !passwordMatch {
+            helpers.Return401Msg("unauthorized", w)
+            return
+        } else {
+            response := map[string]interface{}{
+                "active":   DirectoryCrawler.GetActiveCrawls(),
+                "finished": DirectoryCrawler.GetFinishedCrawls(),
+            }
+            w.Header().Set("Cache-Control", "no-store")
+            w.Header().Set("Content-Type", "application/json")
+            err := json.NewEncoder(w).Encode(response)
+            if err != nil {
+                log.Errorf("AdminCrawlsInfo - Failed to serialize JSON: %s", err)
+                helpers.Return500Msg(w)
+                return
+            }
+            return
+        }
+    }
+    w.Header().Set("WWW-Authenticate", `Basic realm="restricted", charset="UTF-8"`)
+    helpers.Return401Msg("unauthorized", w)
+}
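Both AdminCacheInfo and the new AdminCrawlsInfo handler repeat the same credential check: hash the supplied and expected values to a fixed length with SHA-256, then compare the digests with subtle.ConstantTimeCompare so the comparison takes the same time whether or not they match. A condensed sketch of that check as a standalone helper (the helper is illustrative, not part of the codebase):

import (
    "crypto/sha256"
    "crypto/subtle"
    "net/http"
)

// adminAuthOK is a hypothetical helper mirroring the inline check above.
// Hashing both sides first gives ConstantTimeCompare equal-length inputs,
// so neither the length nor the contents of the secret leak through timing.
func adminAuthOK(r *http.Request, adminKey string) bool {
    username, password, ok := r.BasicAuth()
    if !ok {
        return false
    }
    userHash := sha256.Sum256([]byte(username))
    passHash := sha256.Sum256([]byte(password))
    wantUserHash := sha256.Sum256([]byte("admin"))
    wantPassHash := sha256.Sum256([]byte(adminKey))
    userMatch := subtle.ConstantTimeCompare(userHash[:], wantUserHash[:]) == 1
    passMatch := subtle.ConstantTimeCompare(passHash[:], wantPassHash[:]) == 1
    return userMatch && passMatch
}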
@@ -7,6 +7,7 @@ import (
     "crazyfs/config"
     "crazyfs/file"
     "encoding/json"
+    "fmt"
     lru "github.com/hashicorp/golang-lru/v2"
     "net/http"
 )
@@ -27,7 +28,7 @@ func AdminReCache(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache
 
     auth := requestBody["auth"]
     if auth == "" || auth != config.GetConfig().HttpAdminKey {
-        helpers.Return403Msg("access denied", w)
+        helpers.Return401Msg("unauthorized", w)
         return
     }
 
@@ -42,16 +43,21 @@ func AdminReCache(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache
         return
     }
 
-    //relPath := cache.StripRootDir(fullPath, cfg.RootDir)
-
-    // Check and re-cache the directory
-    cache.Recache(fullPath, sharedCache)
-
     response := map[string]interface{}{
         "message": "Re-cache triggered for directory: " + fullPath,
+        "error":   nil,
     }
+    // Check and re-cache the directory
+    err = cache.Recache(fullPath, sharedCache)
+    if err != nil {
+        response["message"] = fmt.Sprintf("recache failed")
+        response["error"] = err.Error()
+        w.WriteHeader(http.StatusConflict)
+        log.Errorf("Admin triggered recache for %s - %s", fullPath, err)
+    } else {
         log.Infof("Admin triggered recache for %s", fullPath)
+    }
+    w.Header().Set("Cache-Control", "no-store")
     w.Header().Set("Content-Type", "application/json")
     err = json.NewEncoder(w).Encode(response)
     if err != nil {
@@ -16,7 +16,7 @@ func HealthCheck(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[
 
     response := map[string]interface{}{}
 
-    response["scan_running"] = DirectoryCrawler.GetGlobalActiveCrawls() > 0
+    response["scan_running"] = DirectoryCrawler.GetTotalActiveCrawls() > 0
     response["initial_scan_running"] = cache.InitialCrawlInProgress
 
     w.Header().Set("Content-Type", "application/json")
@@ -71,7 +71,7 @@ func SearchFile(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[s
 
     if config.GetConfig().ElasticsearchEnable {
         // Perform the Elasticsearch query
-        resp, err := elastic.Search(queryString, excludeElements)
+        resp, err := elastic.SimpleQuery(queryString, excludeElements)
         if err != nil {
             log.Errorf(`SEARCH - Failed to perform Elasticsearch query "%s" - %s`, queryString, err)
             helpers.Return500Msg(w)
@@ -14,7 +14,7 @@ import (
 func ClientHealthCheck(w http.ResponseWriter, r *http.Request, sharedCache *lru.Cache[string, *CacheItem.Item]) {
     response := map[string]interface{}{}
 
-    response["scan_running"] = DirectoryCrawler.GetGlobalActiveCrawls() > 0
+    response["scan_running"] = DirectoryCrawler.GetTotalActiveCrawls() > 0
     response["initial_scan_running"] = cache.InitialCrawlInProgress
 
     w.Header().Set("Content-Type", "application/json")
@@ -1,15 +1,11 @@
 package helpers
 
 import (
-    "crazyfs/logging"
     "encoding/json"
     "net/http"
 )
 
 func WriteErrorResponse(jsonCode, httpCode int, msg string, w http.ResponseWriter) {
-    log := logging.GetLogger()
-    log.Warnln(msg)
-
     w.Header().Set("Cache-Control", "no-store")
     w.Header().Set("Content-Type", "application/json")
     w.WriteHeader(httpCode)
@@ -25,7 +21,6 @@ func WriteErrorResponse(jsonCode, httpCode int, msg string, w http.ResponseWrite
 }
 
 func ReturnFake404Msg(msg string, w http.ResponseWriter) {
-    log.Fatalf(msg)
     WriteErrorResponse(404, http.StatusBadRequest, msg, w)
 }
 
@@ -44,3 +39,7 @@ func Return500Msg(w http.ResponseWriter) {
 func Return403Msg(msg string, w http.ResponseWriter) {
     WriteErrorResponse(http.StatusForbidden, http.StatusForbidden, msg, w)
 }
+
+func Return401Msg(msg string, w http.ResponseWriter) {
+    WriteErrorResponse(http.StatusUnauthorized, http.StatusUnauthorized, msg, w)
+}
|
@ -47,7 +47,7 @@ func HandleFileNotFound(relPath string, fullPath string, sharedCache *lru.Cache[
|
||||||
|
|
||||||
// Start a blocking non-recursive crawl.
|
// Start a blocking non-recursive crawl.
|
||||||
item, err := dc.CrawlNoRecursion(fullPath)
|
item, err := dc.CrawlNoRecursion(fullPath)
|
||||||
if os.IsNotExist(err) || item == nil {
|
if err == nil && (os.IsNotExist(err) || item == nil) {
|
||||||
ReturnFake404Msg("path not found", w)
|
ReturnFake404Msg("path not found", w)
|
||||||
return nil
|
return nil
|
||||||
} else if err != nil {
|
} else if err != nil {
|
||||||
|
|
|
@@ -65,6 +65,12 @@ var routes = Routes{
         "/api/admin/cache/recache",
         wrongMethod("POST", AdminReCache),
     },
+    Route{
+        "Crawls Info",
+        "GET",
+        "/api/admin/crawls/info",
+        AdminCrawlsInfo,
+    },
     Route{
         "Server Health",
         "GET",
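With the route registered, the new admin page can be exercised by any HTTP client that supports basic auth. A small Go client sketch; the listen address is an assumption, the username is "admin", and the password is whatever HttpAdminKey is set to in the config:

package main

import (
    "fmt"
    "io"
    "net/http"
)

func main() {
    // Assumed local address; point this at wherever CrazyFS is listening.
    req, err := http.NewRequest("GET", "http://localhost:8080/api/admin/crawls/info", nil)
    if err != nil {
        panic(err)
    }
    req.SetBasicAuth("admin", "your-HttpAdminKey-value") // credentials checked by AdminCrawlsInfo
    resp, err := http.DefaultClient.Do(req)
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()
    body, _ := io.ReadAll(resp.Body)
    fmt.Println(resp.Status)  // 401 on bad credentials, 200 otherwise
    fmt.Println(string(body)) // JSON object with "active" and "finished" crawls
}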
@@ -8,10 +8,27 @@ import (
     "path/filepath"
     "strings"
     "sync"
-    "sync/atomic"
+    "time"
 )
 
-var globalActiveCrawls int32
+const maxFinishedCrawls = 100
 
+var activeCrawls = make(map[string]*ActiveCrawl)
+var finishedCrawls = make([]FinishedCrawl, 0, maxFinishedCrawls)
+var activeCrawlsMutex = &sync.Mutex{}
+var finishedCrawlsMutex = &sync.Mutex{}
+
+type ActiveCrawl struct {
+    Path    string `json:"path"`
+    Start   int64  `json:"start"`
+    Elapsed int64  `json:"elapsed"`
+}
+
+type FinishedCrawl struct {
+    Path    string `json:"path"`
+    Start   int64  `json:"start"`
+    Elapsed int64  `json:"elapsed"`
+}
+
 type DirectoryCrawler struct {
     cache *lru.Cache[string, *CacheItem.Item]
@@ -76,14 +93,49 @@ func isSubpath(path, subpath string) bool {
     return true
 }
 
-func (dc *DirectoryCrawler) incrementGlobalActiveCrawls() {
-    atomic.AddInt32(&globalActiveCrawls, 1)
+func (dc *DirectoryCrawler) startCrawl(path string) bool {
+    if dc.IsCrawlActive(path) {
+        return false
+    }
+    activeCrawls[path] = &ActiveCrawl{Path: path, Start: time.Now().Unix(), Elapsed: int64(0)}
+    return true
 }
 
-func (dc *DirectoryCrawler) decrementGlobalActiveCrawls() {
-    atomic.AddInt32(&globalActiveCrawls, -1)
+func (dc *DirectoryCrawler) endCrawl(path string) {
+    activeCrawlsMutex.Lock()
+    finishedCrawlsMutex.Lock()
+    defer activeCrawlsMutex.Unlock()
+    defer finishedCrawlsMutex.Unlock()
+    if len(finishedCrawls) >= maxFinishedCrawls {
+        finishedCrawls = finishedCrawls[1:]
+    }
+    finishedCrawls = append(finishedCrawls, FinishedCrawl{Path: path, Start: activeCrawls[path].Start, Elapsed: int64(time.Since(time.Unix(activeCrawls[path].Start, 0)).Seconds())})
+    delete(activeCrawls, path)
 }
 
-func GetGlobalActiveCrawls() int32 {
-    return atomic.LoadInt32(&globalActiveCrawls)
+func (dc *DirectoryCrawler) IsCrawlActive(path string) bool {
+    activeCrawlsMutex.Lock()
+    defer activeCrawlsMutex.Unlock()
+    _, active := activeCrawls[path]
+    return active
+}
+
+func GetActiveCrawls() map[string]*ActiveCrawl {
+    activeCrawlsMutex.Lock()
+    defer activeCrawlsMutex.Unlock()
+    for path := range activeCrawls {
+        a := activeCrawls[path]
+        a.Elapsed = int64(time.Since(time.Unix(a.Start, 0)).Seconds())
+    }
+    return activeCrawls
+}
+
+func GetFinishedCrawls() []FinishedCrawl {
+    finishedCrawlsMutex.Lock()
+    defer finishedCrawlsMutex.Unlock()
+    return finishedCrawls
+}
+
+func GetTotalActiveCrawls() int {
+    return len(activeCrawls)
 }
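The registry above is what lets a crawl be rejected when one is already running for the same path. A minimal sketch of the check-and-register step, written so the lookup and the insert happen under a single lock acquisition; the names (crawlMu, runningCrawls, tryStartCrawl) are illustrative, not the package's exports, and the snippet assumes sync and time are imported:

var (
    crawlMu       sync.Mutex
    runningCrawls = map[string]int64{} // path -> start time (unix seconds)
)

// tryStartCrawl registers a crawl for path and reports whether it was the
// first one; a concurrent caller for the same path gets false because the
// lookup and the insert are done under the same lock.
func tryStartCrawl(path string) bool {
    crawlMu.Lock()
    defer crawlMu.Unlock()
    if _, running := runningCrawls[path]; running {
        return false
    }
    runningCrawls[path] = time.Now().Unix()
    return true
}

// finishCrawl removes the path from the registry when the crawl ends.
func finishCrawl(path string) {
    crawlMu.Lock()
    defer crawlMu.Unlock()
    delete(runningCrawls, path)
}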
@@ -1,6 +1,7 @@
 package DirectoryCrawler
 
 import (
+    "crazyfs/config"
     "errors"
     "fmt"
     "os"
@@ -9,16 +10,11 @@ import (
     "sync/atomic"
 )
 
-var JobQueueSize int
-
 // WorkerPool is a buffered channel acting as a semaphore to limit the number of active workers globally
 var WorkerPool chan struct{}
 
-// ActiveWorkers is an atomic counter for the number of active workers
-var ActiveWorkers int32
-
-// ActiveWalks is an atomic counter for the number of active Walk crawls
-var ActiveWalks int32
+// BusyWorkers is an atomic counter for the number of active workers
+var BusyWorkers int32
 
 // ErrNotDir indicates that the path, which is being passed
 // to a walker function, does not point to a directory
@@ -109,11 +105,10 @@ func (w *Walker) processPath(relpath string) error {
             continue
         }
 
-        w.walkFunc(filepath.Join(w.root, subpath), info, err)
-        //if err == filepath.SkipDir {
-        //	return nil
-        //}
+        err = w.walkFunc(filepath.Join(w.root, subpath), info, err)
+        if errors.Is(err, filepath.SkipDir) {
+            return nil
+        }
 
         if info.Mode().IsDir() {
             w.addJob(subpath)
@@ -138,12 +133,11 @@ func (w *Walker) addJob(path string) {
     }
 }
 
-// worker processes all the jobs
-// until the jobs channel is explicitly closed
+// worker processes all the jobs until the jobs channel is explicitly closed
 func (w *Walker) worker() {
     for path := range w.jobs {
         WorkerPool <- struct{}{}           // acquire a worker
-        atomic.AddInt32(&ActiveWorkers, 1) // increment the number of active workers
+        atomic.AddInt32(&BusyWorkers, 1)   // increment the number of active workers
 
         err := w.processPath(path)
         if err != nil {
@@ -151,23 +145,19 @@ func (w *Walker) worker() {
         }
 
         <-WorkerPool                        // release the worker when done
-        atomic.AddInt32(&ActiveWorkers, -1) // decrement the number of active workers
+        atomic.AddInt32(&BusyWorkers, -1)   // decrement the number of active workers
     }
 }
 
-// Walk recursively descends into subdirectories,
-// calling walkFn for each file or directory
+// Walk recursively descends into subdirectories, calling walkFn for each file or directory
 // in the tree, including the root directory.
 func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
-    atomic.AddInt32(&ActiveWalks, 1)        // increment the number of active Walk crawls
-    defer atomic.AddInt32(&ActiveWalks, -1) // decrement the number of active Walk crawls when done
-
-    w.jobs = make(chan string, JobQueueSize)
+    w.jobs = make(chan string, config.GetConfig().DirectoryCrawlers)
     w.walkFunc = walkFn
 
     info, err := w.lstat(relpath)
     err = w.walkFunc(filepath.Join(w.root, relpath), info, err)
-    if err == filepath.SkipDir {
+    if errors.Is(err, filepath.SkipDir) {
         return nil
     }
     if err != nil {
@@ -182,8 +172,8 @@ func (w *Walker) Walk(relpath string, walkFn filepath.WalkFunc) error {
         return ErrNotDir
     }
 
-    // spawn workers
-    for n := 1; n <= JobQueueSize; n++ {
+    // Spawn workers
+    for n := 1; n <= config.GetConfig().DirectoryCrawlers; n++ {
         go w.worker()
     }
 
@@ -4,34 +4,45 @@ import (
     "crazyfs/CacheItem"
     "crazyfs/config"
     "crazyfs/file"
+    "errors"
+    "fmt"
     "os"
     "path/filepath"
 )
 
-func (dc *DirectoryCrawler) walkRecursiveFunc(path string, info os.FileInfo, err error) error {
-    processErr := dc.processPath(path, info)
+func (dc *DirectoryCrawler) walkRecursiveFunc(fullPath string, info os.FileInfo, err error) error {
+    CacheItem.RetardCheck(fullPath)
+    processErr := dc.processPath(fullPath, info)
     if processErr != nil {
-        log.Errorf("CRAWLER - walkRecursiveFunc() failed - %s - %s", processErr, path)
+        log.Errorf("CRAWLER - walkRecursiveFunc() failed - %s - %s", processErr, fullPath)
         return processErr
     }
     return nil
 }
 
-func (dc *DirectoryCrawler) walkNonRecursiveFunc(path string, dir os.DirEntry, err error) error {
+func (dc *DirectoryCrawler) walkNonRecursiveFunc(fullPath string, dir os.DirEntry, err error) error {
+    CacheItem.RetardCheck(fullPath)
     info, infoErr := dir.Info()
     if infoErr != nil {
-        log.Errorf("CRAWLER - walkNonRecursiveFunc() - get info failed - %s - %s", infoErr, path)
+        log.Errorf("CRAWLER - walkNonRecursiveFunc() - get info failed - %s - %s", infoErr, fullPath)
         return infoErr
     }
-    processErr := dc.processPath(path, info)
+    processErr := dc.processPath(fullPath, info)
     if processErr != nil {
-        log.Errorf("CRAWLER - walkNonRecursiveFunc() failed - %s - %s", processErr, path)
+        log.Errorf("CRAWLER - walkNonRecursiveFunc() failed - %s - %s", processErr, fullPath)
         return processErr
     }
     return nil
 }
 
 func (dc *DirectoryCrawler) Crawl(fullPath string) error {
+    CacheItem.RetardCheck(fullPath)
+    readyToStart := dc.startCrawl(fullPath)
+    if !readyToStart {
+        return errors.New(fmt.Sprintf(`rejecting crawl, already in progress for "%s"`, fullPath))
+    }
+    defer dc.endCrawl(fullPath)
+
     info, err := os.Lstat(fullPath)
     if os.IsNotExist(err) {
         // If the path doesn't exist, just silently exit
@@ -84,6 +95,13 @@ func (dc *DirectoryCrawler) Crawl(fullPath string) error {
 
 // CrawlNoRecursion this function crawls a file or directory and does not recurse into any subdirectories. Also returns the result of the crawl.
 func (dc *DirectoryCrawler) CrawlNoRecursion(fullPath string) (*CacheItem.Item, error) {
+    CacheItem.RetardCheck(fullPath)
+    readyToStart := dc.startCrawl(fullPath)
+    if !readyToStart {
+        return nil, errors.New(fmt.Sprintf(`rejecting crawl, already in progress for "%s"`, fullPath))
+    }
+    defer dc.endCrawl(fullPath)
+
     info, err := os.Lstat(fullPath)
     if os.IsNotExist(err) {
         // If the path doesn't exist, just silently exit
@@ -44,10 +44,10 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
     log.Infoln("CRAWLER - Starting a crawl...")
     start := time.Now()
     err := dc.Crawl(config.GetConfig().RootDir)
-    duration := time.Since(start).Round(time.Second)
     if err != nil {
         log.Warnf("CRAWLER - Crawl failed: %s", err)
     } else {
+        duration := time.Since(start).Round(time.Second)
         log.Infof("CRAWLER - Crawl completed in %s", duration)
         log.Debugf("%d/%d items in the cache.", config.GetConfig().CacheSize, len(sharedCache.Keys()))
     }
@@ -59,8 +59,8 @@ func startCrawl(sharedCache *lru.Cache[string, *CacheItem.Item], wg *sync.WaitGr
 func logCacheStatus(msg string, ticker *time.Ticker, sharedCache *lru.Cache[string, *CacheItem.Item], logFn func(format string, args ...interface{})) {
     defer ticker.Stop()
     for range ticker.C {
-        activeWorkers := int(DirectoryCrawler.ActiveWorkers)
-        busyWorkers := int(DirectoryCrawler.ActiveWalks)
-        logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, busyWorkers)
+        activeWorkers := int(DirectoryCrawler.BusyWorkers)
+        runningCrawls := DirectoryCrawler.GetTotalActiveCrawls()
+        logFn("%s - %d/%d items in the cache. Active workers: %d Active crawls: %d", msg, len(sharedCache.Keys()), config.GetConfig().CacheSize, activeWorkers, runningCrawls)
     }
 }
@@ -5,7 +5,7 @@ import (
     "crazyfs/cache/DirectoryCrawler"
     "crazyfs/config"
     "crazyfs/file"
-    "crazyfs/logging"
+    "errors"
     lru "github.com/hashicorp/golang-lru/v2"
     "os"
     "path/filepath"
@@ -21,7 +21,6 @@ func InitRecacheSemaphore(limit int) {
 func CheckAndRecache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
     item, found := sharedCache.Get(path)
     if found && time.Now().UnixNano()/int64(time.Millisecond)-item.Cached > int64(config.GetConfig().CacheTime)*60*1000 {
-        log := logging.GetLogger()
         log.Debugf("Re-caching: %s", path)
         sem <- struct{}{} // acquire a token
         go func() {
@@ -30,13 +29,18 @@ func CheckAndRecache(path string, sharedCache *lru.Cache[string, *CacheItem.Item
             err := dc.Crawl(path)
             if err != nil {
                 log.Errorf("RECACHE ERROR: %s", err.Error())
+                return
             }
         }()
     }
 }
 
-func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
-    log := logging.GetLogger()
+func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) error {
+    dc := DirectoryCrawler.NewDirectoryCrawler(sharedCache)
+    if dc.IsCrawlActive(path) {
+        return errors.New("rejecting crawl, already in progress for this path")
+    }
+
     log.Debugf("Re-caching: %s", path)
     start := time.Now()
     sem <- struct{}{} // acquire a token
@@ -46,6 +50,7 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
         err := dc.Crawl(path)
         if err != nil {
             log.Errorf("RECACHE ERROR: %s", err.Error())
+            return
         }
 
         // Get the parent directory from the cache
@@ -53,7 +58,7 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
         parentDirRel := file.StripRootDir(parentDir)
         parentItem, found := sharedCache.Get(parentDirRel)
         if found {
-            // Remove the old sub-directory from the parent directory's Children field
+            // Remove the old subdirectory from the parent directory's Children field
             for i, child := range parentItem.Children {
                 if child == path {
                     parentItem.Children = append(parentItem.Children[:i], parentItem.Children[i+1:]...)
@@ -61,10 +66,11 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
                 }
             }
 
-            // Update the parent directory's Children field to include the new sub-directory
+            // Update the parent directory's Children field to include the new subdirectory
             info, err := os.Stat(path)
             if err != nil {
                 log.Errorf("RECACHE ERROR: %s", err.Error())
+                return
             } else {
                 newItem := CacheItem.NewItem(path, info)
                 // Create a new slice that contains all items from the Children field except the old directory
@@ -81,15 +87,17 @@ func Recache(path string, sharedCache *lru.Cache[string, *CacheItem.Item]) {
                 // Update the parent directory in the cache
                 sharedCache.Add(parentDir, parentItem)
             }
-        } else {
+        } else if !CacheItem.PathOutsideRoot(parentDir) {
             // If the parent directory isn't in the cache, crawl it
             log.Infof("RECACHE - crawling parent directory since it isn't in the cache yet: %s", parentDir)
-            err := dc.Crawl(parentDir)
+            _, err := dc.CrawlNoRecursion(parentDir)
            if err != nil {
                 log.Errorf("RECACHE ERROR: %s", err.Error())
+                return
             }
         }
         duration := time.Since(start).Round(time.Second)
         log.Infof("RECACHE - completed in %s - %s", duration, path)
     }()
+
+    return nil
 }
@@ -39,6 +39,14 @@ type cliConfig struct {
 // TODO: admin api endpoint to get status and progress of the full refresh of elasticsearch
 
 func main() {
+    //fullPath := "/srv/chub-archive"
+    //RootDir := "/srv/chub-archive"
+    //
+    //fmt.Println(strings.HasPrefix(fullPath, RootDir))
+    ////fmt.Println(fullPath != RootDir)
+    //
+    //return
+
     cliArgs := parseArgs()
     if cliArgs.help {
         flag.Usage()
@@ -10,7 +10,7 @@ import (
     "strings"
 )
 
-func Search(query string, exclude []string) (*esapi.Response, error) {
+func SimpleQuery(query string, exclude []string) (*esapi.Response, error) {
     var excludeQuery string
     if len(exclude) > 0 {
         var excludeConditions []string
todo.txt
@@ -1,13 +1,10 @@
-- Track active crawls and list them on the admin page
 - Limit to one on-demand crawl per path. Don't start another if one is already running. See HandleFileNotFound()
 - Add config value to limit the number of on-demand crawls
 - Add config value to limit the number of concurrent crawls, other crawls get queued.
-- add an admin endpoint to fetch the last n modified files.
-- fix /api/file/download when an item is in the cache but does not exist on the disk
-- Is using scroll for the Elastic query really the best way to do a real-time query?
 
 
 Later:
 - Add a wildcard option to restricted_download_paths to block all sub-directories
 - Add a dict to each restricted_download_paths item to specify how many levels recursive the block should be applied
 - add a "last modified" to "sort" https://chub-archive.evulid.cc/api/file/list?path=/chub.ai/characters&page=1&limit=50&sort=folders
+- add an admin endpoint to fetch the last n modified files.