2017-01-24 19:43:02 -07:00
|
|
|
// Copyright (c) 2014 Couchbase, Inc.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package index
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"encoding/json"
|
|
|
|
"fmt"
|
2019-02-17 17:50:26 -07:00
|
|
|
"reflect"
|
2017-01-24 19:43:02 -07:00
|
|
|
|
|
|
|
"github.com/blevesearch/bleve/document"
|
|
|
|
"github.com/blevesearch/bleve/index/store"
|
2019-02-17 17:50:26 -07:00
|
|
|
"github.com/blevesearch/bleve/size"
|
2017-01-24 19:43:02 -07:00
|
|
|
)
|
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
var reflectStaticSizeTermFieldDoc int
|
|
|
|
var reflectStaticSizeTermFieldVector int
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
var tfd TermFieldDoc
|
|
|
|
reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size())
|
|
|
|
var tfv TermFieldVector
|
|
|
|
reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size())
|
|
|
|
}
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
// ErrorUnknownStorageType is a package-level error value whose message is
// "unknown storage type".
var ErrorUnknownStorageType = fmt.Errorf("unknown storage type")
|
|
|
|
|
|
|
|
type Index interface {
|
|
|
|
Open() error
|
|
|
|
Close() error
|
|
|
|
|
|
|
|
Update(doc *document.Document) error
|
|
|
|
Delete(id string) error
|
|
|
|
Batch(batch *Batch) error
|
|
|
|
|
|
|
|
SetInternal(key, val []byte) error
|
|
|
|
DeleteInternal(key []byte) error
|
|
|
|
|
|
|
|
// Reader returns a low-level accessor on the index data. Close it to
|
|
|
|
// release associated resources.
|
|
|
|
Reader() (IndexReader, error)
|
|
|
|
|
|
|
|
Stats() json.Marshaler
|
|
|
|
StatsMap() map[string]interface{}
|
|
|
|
|
|
|
|
Analyze(d *document.Document) *AnalysisResult
|
|
|
|
|
|
|
|
Advanced() (store.KVStore, error)
|
|
|
|
}
|
|
|
|
|
2017-09-16 14:16:21 -06:00
|
|
|
// DocumentFieldTermVisitor is a callback that receives a field name and a
// term as raw bytes. It is the visitor type accepted by
// IndexReader.DocumentVisitFieldTerms and DocValueReader.VisitDocValues.
type DocumentFieldTermVisitor func(field string, term []byte)
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
type IndexReader interface {
|
|
|
|
TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (TermFieldReader, error)
|
|
|
|
|
|
|
|
// DocIDReader returns an iterator over all doc ids
|
|
|
|
// The caller must close returned instance to release associated resources.
|
|
|
|
DocIDReaderAll() (DocIDReader, error)
|
|
|
|
|
|
|
|
DocIDReaderOnly(ids []string) (DocIDReader, error)
|
|
|
|
|
|
|
|
FieldDict(field string) (FieldDict, error)
|
|
|
|
|
|
|
|
// FieldDictRange is currently defined to include the start and end terms
|
|
|
|
FieldDictRange(field string, startTerm []byte, endTerm []byte) (FieldDict, error)
|
|
|
|
FieldDictPrefix(field string, termPrefix []byte) (FieldDict, error)
|
|
|
|
|
|
|
|
Document(id string) (*document.Document, error)
|
2017-09-16 14:16:21 -06:00
|
|
|
DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error
|
2017-01-24 19:43:02 -07:00
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
DocValueReader(fields []string) (DocValueReader, error)
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
Fields() ([]string, error)
|
|
|
|
|
|
|
|
GetInternal(key []byte) ([]byte, error)
|
|
|
|
|
|
|
|
DocCount() (uint64, error)
|
|
|
|
|
|
|
|
ExternalID(id IndexInternalID) (string, error)
|
|
|
|
InternalID(id string) (IndexInternalID, error)
|
|
|
|
|
|
|
|
DumpAll() chan interface{}
|
|
|
|
DumpDoc(id string) chan interface{}
|
|
|
|
DumpFields() chan interface{}
|
|
|
|
|
|
|
|
Close() error
|
|
|
|
}
|
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
// Regexp declares the subset of the regexp.Regexp method set that bleve
// indexes rely on, so that callers may supply an alternate implementation.
type Regexp interface {
	FindStringIndex(s string) (loc []int)
	LiteralPrefix() (prefix string, complete bool)
	String() string
}
|
|
|
|
|
|
|
|
type IndexReaderRegexp interface {
|
|
|
|
FieldDictRegexp(field string, regex string) (FieldDict, error)
|
|
|
|
}
|
|
|
|
|
|
|
|
type IndexReaderFuzzy interface {
|
|
|
|
FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error)
|
|
|
|
}
|
|
|
|
|
|
|
|
type IndexReaderOnly interface {
|
|
|
|
FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error)
|
|
|
|
}
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
// FieldTerms contains the terms used by a document, keyed by field
type FieldTerms map[string][]string

// FieldsNotYetCached returns, in their original order, those entries of
// fields that have no key in the receiver. A key that is present with a nil
// term list counts as cached. The result is never nil.
func (f FieldTerms) FieldsNotYetCached(fields []string) []string {
	missing := make([]string, 0, len(fields))
	for i := range fields {
		if _, cached := f[fields[i]]; cached {
			continue
		}
		missing = append(missing, fields[i])
	}
	return missing
}

// Merge copies every entry of other into the receiver.
// Term lists are assumed to be complete, so they are not combined: when a
// field exists on both sides, the list from other replaces the receiver's.
func (f FieldTerms) Merge(other FieldTerms) {
	for name := range other {
		f[name] = other[name]
	}
}
|
|
|
|
|
|
|
|
// TermFieldVector is a record with a field name, a list of array positions,
// and three unsigned values named Pos, Start and End.
// NOTE(review): presumably one term-vector entry (token position and
// start/end offsets) for a term occurrence; confirm against the index
// implementations that populate it.
type TermFieldVector struct {
	Field          string
	ArrayPositions []uint64
	Pos            uint64
	Start          uint64
	End            uint64
}
|
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
func (tfv *TermFieldVector) Size() int {
|
|
|
|
return reflectStaticSizeTermFieldVector + size.SizeOfPtr +
|
|
|
|
len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64
|
|
|
|
}
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
// IndexInternalID is an opaque document identifier internal to the index impl
type IndexInternalID []byte

// Equals reports whether id and other hold the same bytes. A nil id and an
// empty id are considered equal.
func (id IndexInternalID) Equals(other IndexInternalID) bool {
	return bytes.Equal(id, other)
}

// Compare orders id against other byte-wise (lexicographically), returning
// a negative value, zero, or a positive value as defined by bytes.Compare.
func (id IndexInternalID) Compare(other IndexInternalID) int {
	return bytes.Compare([]byte(id), []byte(other))
}
|
|
|
|
|
|
|
|
type TermFieldDoc struct {
|
|
|
|
Term string
|
|
|
|
ID IndexInternalID
|
|
|
|
Freq uint64
|
|
|
|
Norm float64
|
|
|
|
Vectors []*TermFieldVector
|
|
|
|
}
|
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
func (tfd *TermFieldDoc) Size() int {
|
|
|
|
sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr +
|
|
|
|
len(tfd.Term) + len(tfd.ID)
|
|
|
|
|
|
|
|
for _, entry := range tfd.Vectors {
|
|
|
|
sizeInBytes += entry.Size()
|
|
|
|
}
|
|
|
|
|
|
|
|
return sizeInBytes
|
|
|
|
}
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
// Reset allows an already allocated TermFieldDoc to be reused
|
|
|
|
func (tfd *TermFieldDoc) Reset() *TermFieldDoc {
|
|
|
|
// remember the []byte used for the ID
|
|
|
|
id := tfd.ID
|
2019-02-17 17:50:26 -07:00
|
|
|
vectors := tfd.Vectors
|
2017-01-24 19:43:02 -07:00
|
|
|
// idiom to copy over from empty TermFieldDoc (0 allocations)
|
|
|
|
*tfd = TermFieldDoc{}
|
|
|
|
// reuse the []byte already allocated (and reset len to 0)
|
|
|
|
tfd.ID = id[:0]
|
2019-02-17 17:50:26 -07:00
|
|
|
tfd.Vectors = vectors[:0]
|
2017-01-24 19:43:02 -07:00
|
|
|
return tfd
|
|
|
|
}
|
|
|
|
|
|
|
|
// TermFieldReader is the interface exposing the enumeration of documents
|
|
|
|
// containing a given term in a given field. Documents are returned in byte
|
|
|
|
// lexicographic order over their identifiers.
|
|
|
|
type TermFieldReader interface {
|
|
|
|
// Next returns the next document containing the term in this field, or nil
|
|
|
|
// when it reaches the end of the enumeration. The preAlloced TermFieldDoc
|
|
|
|
// is optional, and when non-nil, will be used instead of allocating memory.
|
|
|
|
Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error)
|
|
|
|
|
|
|
|
// Advance resets the enumeration at specified document or its immediate
|
|
|
|
// follower.
|
|
|
|
Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error)
|
|
|
|
|
|
|
|
// Count returns the number of documents contains the term in this field.
|
|
|
|
Count() uint64
|
|
|
|
Close() error
|
2019-02-17 17:50:26 -07:00
|
|
|
|
|
|
|
Size() int
|
2017-01-24 19:43:02 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
// DictEntry is the element type returned by FieldDict.Next: a term together
// with a count.
type DictEntry struct {
	Term  string
	Count uint64
}
|
|
|
|
|
|
|
|
type FieldDict interface {
|
|
|
|
Next() (*DictEntry, error)
|
|
|
|
Close() error
|
|
|
|
}
|
|
|
|
|
|
|
|
// DocIDReader is the interface exposing enumeration of documents identifiers.
|
|
|
|
// Close the reader to release associated resources.
|
|
|
|
type DocIDReader interface {
|
|
|
|
// Next returns the next document internal identifier in the natural
|
|
|
|
// index order, nil when the end of the sequence is reached.
|
|
|
|
Next() (IndexInternalID, error)
|
|
|
|
|
|
|
|
// Advance resets the iteration to the first internal identifier greater than
|
|
|
|
// or equal to ID. If ID is smaller than the start of the range, the iteration
|
|
|
|
// will start there instead. If ID is greater than or equal to the end of
|
|
|
|
// the range, Next() call will return io.EOF.
|
|
|
|
Advance(ID IndexInternalID) (IndexInternalID, error)
|
2019-02-17 17:50:26 -07:00
|
|
|
|
|
|
|
Size() int
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
Close() error
|
|
|
|
}
|
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
// BatchCallback is a function taking a single error argument. A Batch
// stores one via SetPersistedCallback and exposes it via PersistedCallback.
type BatchCallback func(error)
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
type Batch struct {
|
2019-02-17 17:50:26 -07:00
|
|
|
IndexOps map[string]*document.Document
|
|
|
|
InternalOps map[string][]byte
|
|
|
|
persistedCallback BatchCallback
|
2017-01-24 19:43:02 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
func NewBatch() *Batch {
|
|
|
|
return &Batch{
|
|
|
|
IndexOps: make(map[string]*document.Document),
|
|
|
|
InternalOps: make(map[string][]byte),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) Update(doc *document.Document) {
|
|
|
|
b.IndexOps[doc.ID] = doc
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) Delete(id string) {
|
|
|
|
b.IndexOps[id] = nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) SetInternal(key, val []byte) {
|
|
|
|
b.InternalOps[string(key)] = val
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) DeleteInternal(key []byte) {
|
|
|
|
b.InternalOps[string(key)] = nil
|
|
|
|
}
|
|
|
|
|
2019-02-17 17:50:26 -07:00
|
|
|
func (b *Batch) SetPersistedCallback(f BatchCallback) {
|
|
|
|
b.persistedCallback = f
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) PersistedCallback() BatchCallback {
|
|
|
|
return b.persistedCallback
|
|
|
|
}
|
|
|
|
|
2017-01-24 19:43:02 -07:00
|
|
|
func (b *Batch) String() string {
|
|
|
|
rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps))
|
|
|
|
for k, v := range b.IndexOps {
|
|
|
|
if v != nil {
|
|
|
|
rv += fmt.Sprintf("\tINDEX - '%s'\n", k)
|
|
|
|
} else {
|
|
|
|
rv += fmt.Sprintf("\tDELETE - '%s'\n", k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for k, v := range b.InternalOps {
|
|
|
|
if v != nil {
|
|
|
|
rv += fmt.Sprintf("\tSET INTERNAL - '%s'\n", k)
|
|
|
|
} else {
|
|
|
|
rv += fmt.Sprintf("\tDELETE INTERNAL - '%s'\n", k)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) Reset() {
|
|
|
|
b.IndexOps = make(map[string]*document.Document)
|
|
|
|
b.InternalOps = make(map[string][]byte)
|
2019-02-17 17:50:26 -07:00
|
|
|
b.persistedCallback = nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) Merge(o *Batch) {
|
|
|
|
for k, v := range o.IndexOps {
|
|
|
|
b.IndexOps[k] = v
|
|
|
|
}
|
|
|
|
for k, v := range o.InternalOps {
|
|
|
|
b.InternalOps[k] = v
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (b *Batch) TotalDocSize() int {
|
|
|
|
var s int
|
|
|
|
for k, v := range b.IndexOps {
|
|
|
|
if v != nil {
|
|
|
|
s += v.Size() + size.SizeOfString
|
|
|
|
}
|
|
|
|
s += len(k)
|
|
|
|
}
|
|
|
|
return s
|
|
|
|
}
|
|
|
|
|
|
|
|
// Optimizable represents an optional interface that implementable by
|
|
|
|
// optimizable resources (e.g., TermFieldReaders, Searchers). These
|
|
|
|
// optimizable resources are provided the same OptimizableContext
|
|
|
|
// instance, so that they can coordinate via dynamic interface
|
|
|
|
// casting.
|
|
|
|
type Optimizable interface {
|
|
|
|
Optimize(kind string, octx OptimizableContext) (OptimizableContext, error)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Optimized represents a result of optimization -- see the
// OptimizableContext.Finish() method. It is an empty interface, so any
// value satisfies it.
type Optimized interface{}
|
|
|
|
|
|
|
|
type OptimizableContext interface {
|
|
|
|
// Once all the optimzable resources have been provided the same
|
|
|
|
// OptimizableContext instance, the optimization preparations are
|
|
|
|
// finished or completed via the Finish() method.
|
|
|
|
//
|
|
|
|
// Depending on the optimization being performed, the Finish()
|
|
|
|
// method might return a non-nil Optimized instance. For example,
|
|
|
|
// the Optimized instance might represent an optimized
|
|
|
|
// TermFieldReader instance.
|
|
|
|
Finish() (Optimized, error)
|
|
|
|
}
|
|
|
|
|
|
|
|
type DocValueReader interface {
|
|
|
|
VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error
|
2017-01-24 19:43:02 -07:00
|
|
|
}
|