first-commit
This commit is contained in:
58
modules/indexer/internal/bleve/batch.go
Normal file
58
modules/indexer/internal/bleve/batch.go
Normal file
@@ -0,0 +1,58 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"github.com/blevesearch/bleve/v2"
|
||||
)
|
||||
|
||||
// FlushingBatch is a batch of operations that automatically flushes to the
|
||||
// underlying index once it reaches a certain size.
|
||||
type FlushingBatch struct {
|
||||
maxBatchSize int
|
||||
batch *bleve.Batch
|
||||
index bleve.Index
|
||||
}
|
||||
|
||||
// NewFlushingBatch creates a new flushing batch for the specified index. Once
|
||||
// the number of operations in the batch reaches the specified limit, the batch
|
||||
// automatically flushes its operations to the index.
|
||||
func NewFlushingBatch(index bleve.Index, maxBatchSize int) *FlushingBatch {
|
||||
return &FlushingBatch{
|
||||
maxBatchSize: maxBatchSize,
|
||||
batch: index.NewBatch(),
|
||||
index: index,
|
||||
}
|
||||
}
|
||||
|
||||
// Index add a new index to batch
|
||||
func (b *FlushingBatch) Index(id string, data any) error {
|
||||
if err := b.batch.Index(id, data); err != nil {
|
||||
return err
|
||||
}
|
||||
return b.flushIfFull()
|
||||
}
|
||||
|
||||
// Delete add a delete index to batch
|
||||
func (b *FlushingBatch) Delete(id string) error {
|
||||
b.batch.Delete(id)
|
||||
return b.flushIfFull()
|
||||
}
|
||||
|
||||
func (b *FlushingBatch) flushIfFull() error {
|
||||
if b.batch.Size() < b.maxBatchSize {
|
||||
return nil
|
||||
}
|
||||
return b.Flush()
|
||||
}
|
||||
|
||||
// Flush submit the batch and create a new one
|
||||
func (b *FlushingBatch) Flush() error {
|
||||
err := b.index.Batch(b.batch)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
b.batch = b.index.NewBatch()
|
||||
return nil
|
||||
}
|
103
modules/indexer/internal/bleve/indexer.go
Normal file
103
modules/indexer/internal/bleve/indexer.go
Normal file
@@ -0,0 +1,103 @@
|
||||
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
|
||||
"code.gitea.io/gitea/modules/indexer/internal"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
|
||||
"github.com/blevesearch/bleve/v2"
|
||||
"github.com/blevesearch/bleve/v2/mapping"
|
||||
"github.com/ethantkoenig/rupture"
|
||||
)
|
||||
|
||||
// Compile-time check that *Indexer satisfies internal.Indexer.
var _ internal.Indexer = (*Indexer)(nil)
|
||||
|
||||
// Indexer represents a basic bleve indexer implementation
|
||||
type Indexer struct {
|
||||
Indexer bleve.Index
|
||||
|
||||
indexDir string
|
||||
version int
|
||||
mappingGetter MappingGetter
|
||||
}
|
||||
|
||||
// MappingGetter is a function that returns the index mapping to use when a
// new bleve index has to be created, or an error if the mapping cannot be built.
type MappingGetter func() (mapping.IndexMapping, error)
|
||||
|
||||
func NewIndexer(indexDir string, version int, mappingGetter func() (mapping.IndexMapping, error)) *Indexer {
|
||||
return &Indexer{
|
||||
indexDir: indexDir,
|
||||
version: version,
|
||||
mappingGetter: mappingGetter,
|
||||
}
|
||||
}
|
||||
|
||||
// Init initializes the indexer
|
||||
func (i *Indexer) Init(_ context.Context) (bool, error) {
|
||||
if i == nil {
|
||||
return false, errors.New("cannot init nil indexer")
|
||||
}
|
||||
|
||||
if i.Indexer != nil {
|
||||
return false, errors.New("indexer is already initialized")
|
||||
}
|
||||
|
||||
indexer, version, err := openIndexer(i.indexDir, i.version)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
if indexer != nil {
|
||||
i.Indexer = indexer
|
||||
return true, nil
|
||||
}
|
||||
|
||||
if version != 0 {
|
||||
log.Warn("Found older bleve index with version %d, Gitea will remove it and rebuild", version)
|
||||
}
|
||||
|
||||
indexMapping, err := i.mappingGetter()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
indexer, err = bleve.New(i.indexDir, indexMapping)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
if err = rupture.WriteIndexMetadata(i.indexDir, &rupture.IndexMetadata{
|
||||
Version: i.version,
|
||||
}); err != nil {
|
||||
return false, err
|
||||
}
|
||||
|
||||
i.Indexer = indexer
|
||||
|
||||
return false, nil
|
||||
}
|
||||
|
||||
// Ping checks if the indexer is available
|
||||
func (i *Indexer) Ping(_ context.Context) error {
|
||||
if i == nil {
|
||||
return errors.New("cannot ping nil indexer")
|
||||
}
|
||||
if i.Indexer == nil {
|
||||
return errors.New("indexer is not initialized")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (i *Indexer) Close() {
|
||||
if i == nil || i.Indexer == nil {
|
||||
return
|
||||
}
|
||||
|
||||
if err := i.Indexer.Close(); err != nil {
|
||||
log.Error("Failed to close bleve indexer in %q: %v", i.indexDir, err)
|
||||
}
|
||||
i.Indexer = nil
|
||||
}
|
66
modules/indexer/internal/bleve/query.go
Normal file
66
modules/indexer/internal/bleve/query.go
Normal file
@@ -0,0 +1,66 @@
|
||||
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"code.gitea.io/gitea/modules/optional"
|
||||
|
||||
"github.com/blevesearch/bleve/v2"
|
||||
"github.com/blevesearch/bleve/v2/search/query"
|
||||
)
|
||||
|
||||
// NumericEqualityQuery generates a numeric equality query for the given value and field
|
||||
func NumericEqualityQuery(value int64, field string) *query.NumericRangeQuery {
|
||||
f := float64(value)
|
||||
tru := true
|
||||
q := bleve.NewNumericRangeInclusiveQuery(&f, &f, &tru, &tru)
|
||||
q.SetField(field)
|
||||
return q
|
||||
}
|
||||
|
||||
// MatchPhraseQuery generates a match phrase query for the given phrase, field and analyzer
|
||||
func MatchPhraseQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchPhraseQuery {
|
||||
q := bleve.NewMatchPhraseQuery(matchPhrase)
|
||||
q.FieldVal = field
|
||||
q.Analyzer = analyzer
|
||||
q.Fuzziness = fuzziness
|
||||
return q
|
||||
}
|
||||
|
||||
// MatchAndQuery generates a match query for the given phrase, field and analyzer
|
||||
func MatchAndQuery(matchPhrase, field, analyzer string, fuzziness int) *query.MatchQuery {
|
||||
q := bleve.NewMatchQuery(matchPhrase)
|
||||
q.FieldVal = field
|
||||
q.Analyzer = analyzer
|
||||
q.Fuzziness = fuzziness
|
||||
q.Operator = query.MatchQueryOperatorAnd
|
||||
return q
|
||||
}
|
||||
|
||||
// BoolFieldQuery generates a bool field query for the given value and field
|
||||
func BoolFieldQuery(value bool, field string) *query.BoolFieldQuery {
|
||||
q := bleve.NewBoolFieldQuery(value)
|
||||
q.SetField(field)
|
||||
return q
|
||||
}
|
||||
|
||||
func NumericRangeInclusiveQuery(minOption, maxOption optional.Option[int64], field string) *query.NumericRangeQuery {
|
||||
var minF, maxF *float64
|
||||
var minI, maxI *bool
|
||||
if minOption.Has() {
|
||||
minF = new(float64)
|
||||
*minF = float64(minOption.Value())
|
||||
minI = new(bool)
|
||||
*minI = true
|
||||
}
|
||||
if maxOption.Has() {
|
||||
maxF = new(float64)
|
||||
*maxF = float64(maxOption.Value())
|
||||
maxI = new(bool)
|
||||
*maxI = true
|
||||
}
|
||||
q := bleve.NewNumericRangeInclusiveQuery(minF, maxF, minI, maxI)
|
||||
q.SetField(field)
|
||||
return q
|
||||
}
|
90
modules/indexer/internal/bleve/util.go
Normal file
90
modules/indexer/internal/bleve/util.go
Normal file
@@ -0,0 +1,90 @@
|
||||
// Copyright 2023 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"os"
|
||||
"unicode"
|
||||
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/util"
|
||||
|
||||
"github.com/blevesearch/bleve/v2"
|
||||
unicode_tokenizer "github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
|
||||
"github.com/blevesearch/bleve/v2/index/upsidedown"
|
||||
"github.com/ethantkoenig/rupture"
|
||||
)
|
||||
|
||||
// maxFuzziness is the upper bound applied to every guessed fuzziness value.
const maxFuzziness = 2
|
||||
|
||||
// openIndexer open the index at the specified path, checking for metadata
|
||||
// updates and bleve version updates. If index needs to be created (or
|
||||
// re-created), returns (nil, nil)
|
||||
func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
|
||||
_, err := os.Stat(path)
|
||||
if err != nil && os.IsNotExist(err) {
|
||||
return nil, 0, nil
|
||||
} else if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
metadata, err := rupture.ReadIndexMetadata(path)
|
||||
if err != nil {
|
||||
return nil, 0, err
|
||||
}
|
||||
if metadata.Version < latestVersion {
|
||||
// the indexer is using a previous version, so we should delete it and
|
||||
// re-populate
|
||||
return nil, metadata.Version, util.RemoveAll(path)
|
||||
}
|
||||
|
||||
index, err := bleve.Open(path)
|
||||
if err != nil {
|
||||
if errors.Is(err, upsidedown.IncompatibleVersion) {
|
||||
log.Warn("Indexer was built with a previous version of bleve, deleting and rebuilding")
|
||||
return nil, 0, util.RemoveAll(path)
|
||||
}
|
||||
return nil, 0, err
|
||||
}
|
||||
|
||||
return index, 0, nil
|
||||
}
|
||||
|
||||
// GuessFuzzinessByKeyword guesses fuzziness based on the levenshtein distance and determines how many chars
|
||||
// may be different on two string, and they still be considered equivalent.
|
||||
// Given a phrase, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.
|
||||
func GuessFuzzinessByKeyword(s string) int {
|
||||
tokenizer := unicode_tokenizer.NewUnicodeTokenizer()
|
||||
tokens := tokenizer.Tokenize([]byte(s))
|
||||
|
||||
if len(tokens) > 0 {
|
||||
fuzziness := maxFuzziness
|
||||
|
||||
for _, token := range tokens {
|
||||
fuzziness = min(fuzziness, guessFuzzinessByKeyword(string(token.Term)))
|
||||
}
|
||||
|
||||
return fuzziness
|
||||
}
|
||||
|
||||
return 0
|
||||
}
|
||||
|
||||
func guessFuzzinessByKeyword(s string) int {
|
||||
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
|
||||
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
|
||||
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
|
||||
// Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
|
||||
|
||||
for _, r := range s {
|
||||
if r >= 128 || !unicode.IsLetter(r) {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
return min(min(setting.Indexer.TypeBleveMaxFuzzniess, maxFuzziness), len(s)/4)
|
||||
}
|
58
modules/indexer/internal/bleve/util_test.go
Normal file
58
modules/indexer/internal/bleve/util_test.go
Normal file
@@ -0,0 +1,58 @@
|
||||
// Copyright 2024 The Gitea Authors. All rights reserved.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
package bleve
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"code.gitea.io/gitea/modules/setting"
|
||||
"code.gitea.io/gitea/modules/test"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
|
||||
defer test.MockVariableValue(&setting.Indexer.TypeBleveMaxFuzzniess, 2)()
|
||||
|
||||
scenarios := []struct {
|
||||
Input string
|
||||
Fuzziness int // See util.go for the definition of fuzziness in this particular context
|
||||
}{
|
||||
{
|
||||
Input: "",
|
||||
Fuzziness: 0,
|
||||
},
|
||||
{
|
||||
Input: "Avocado",
|
||||
Fuzziness: 1,
|
||||
},
|
||||
{
|
||||
Input: "Geschwindigkeit",
|
||||
Fuzziness: 2,
|
||||
},
|
||||
{
|
||||
Input: "non-exist",
|
||||
Fuzziness: 0,
|
||||
},
|
||||
{
|
||||
Input: "갃갃갃",
|
||||
Fuzziness: 0,
|
||||
},
|
||||
{
|
||||
Input: "repo1",
|
||||
Fuzziness: 0,
|
||||
},
|
||||
{
|
||||
Input: "avocado.md",
|
||||
Fuzziness: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, scenario := range scenarios {
|
||||
t.Run(fmt.Sprintf("Fuziniess:%s=%d", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
|
||||
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
|
||||
})
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user