// indexDB/fulltext.go

package indexDB

import (
	"bytes"
	"compress/gzip"
	_ "embed"
	"fmt"
	"io"
	"strings"
	"sync"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/go-ego/gse"
	"golang.org/x/text/width"

	"apigo.cc/go/cast"
	"apigo.cc/go/log"
)

//go:embed dict/s_1.tgz
var indexDictMainCS []byte

//go:embed dict/t_1.tgz
var indexDictMainCT []byte

//go:embed dict/hmm_model.utf8.tgz
var indexDictHmm []byte

//go:embed dict/stop_words.utf8.tgz
var indexDictStop []byte

var (
	seg       gse.Segmenter
	segLock   sync.RWMutex
	stopWords = map[string]bool{}
	initOnce  sync.Once
)

const AnalyzerName = "gse"

// GseAnalyzer implements the bleve analysis.Analyzer interface using the
// gse segmenter for Chinese and mixed-language tokenization.
type GseAnalyzer struct{}

func (a *GseAnalyzer) Analyze(input []byte) analysis.TokenStream {
	if len(input) == 0 {
		return nil
	}
	segLock.RLock()
	segments := seg.Segment(input)
	segLock.RUnlock()
	var tokens []*analysis.Token
	position := 1
	for _, segment := range segments {
		tokenText := segment.Token().Text()
		if tokenText == "" {
			continue
		}
		// Normalize: lowercase, then fold full-width characters to their
		// narrow (half-width) forms so CJK and ASCII variants match.
		word := width.Narrow.String(strings.ToLower(tokenText))
		if stopWords[word] {
			continue
		}
		tokens = append(tokens, &analysis.Token{
			Term:     []byte(word),
			Start:    segment.Start(),
			End:      segment.End(),
			Position: position,
			Type:     analysis.Ideographic,
		})
		position++
	}
	return tokens
}

func analyzerConstructor(config map[string]any, cache *registry.Cache) (analysis.Analyzer, error) {
	return &GseAnalyzer{}, nil
}
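
// A sanity-check sketch for the analyzer (illustrative only; not used by the
// engine): it prints the terms produced for a given input. Mixed CJK/Latin
// text should come back lowercased and width-narrowed, with stop words
// dropped. The function name is hypothetical.
func dumpTokens(text string) {
	initGse() // ensure dictionaries are loaded before segmenting
	a := &GseAnalyzer{}
	for _, t := range a.Analyze([]byte(text)) {
		fmt.Printf("%s [%d:%d] pos=%d\n", t.Term, t.Start, t.End, t.Position)
	}
}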

// gunzip decompresses an embedded gzip blob; on any error it returns nil
// (or a partial result), which callers treat as an empty dictionary.
func gunzip(data []byte) []byte {
	r, err := gzip.NewReader(bytes.NewReader(data))
	if err != nil {
		return nil
	}
	defer r.Close()
	out, _ := io.ReadAll(r)
	return out
}

// initGse loads the embedded dictionaries exactly once: the HMM model used
// for unknown-word segmentation, the stop-word list, and the main
// simplified- and traditional-Chinese dictionaries. It also registers the
// gse analyzer with bleve's registry.
func initGse() {
	initOnce.Do(func() {
		// Parse the HMM model file: each non-comment line holds
		// comma-separated "rune:probability" pairs.
		model := make(map[rune]float64)
		lines := strings.Split(string(gunzip(indexDictHmm)), "\n")
		for _, line := range lines {
			if line = strings.TrimSpace(line); line != "" && line[0] != '#' {
				for _, item := range strings.Split(line, ",") {
					if parts := strings.SplitN(item, ":", 2); len(parts) == 2 {
						if key := []rune(parts[0]); len(key) > 0 {
							model[key[0]] = cast.To[float64](parts[1])
						}
					}
				}
			}
		}
		stopWordsText := string(gunzip(indexDictStop))
		for _, word := range strings.Split(stopWordsText, "\n") {
			w := strings.TrimSpace(word)
			if w != "" {
				stopWords[w] = true
			}
		}
		stopWords[" "] = true
		var err error
		if seg, err = gse.NewEmbed(string(gunzip(indexDictMainCS)), string(gunzip(indexDictMainCT))); err == nil {
			seg.SkipLog = true
			seg.LoadModel(model)
		} else {
			log.DefaultLogger.Error("gse dict load failed", "err", err)
		}
		registry.RegisterAnalyzer(AnalyzerName, analyzerConstructor)
	})
}

// Engine wraps a bleve index configured with the gse analyzer.
type Engine struct {
	index bleve.Index
}

// newFulltextEngine opens the index at path, creating it if it does not
// exist; an empty path selects an in-memory index.
func newFulltextEngine(path string) (*Engine, error) {
	initGse()
	var idx bleve.Index
	var err error
	if path == "" {
		idx, err = bleve.NewMemOnly(buildIndexMapping())
	} else {
		idx, err = bleve.Open(path)
		if err == bleve.ErrorIndexPathDoesNotExist {
			idx, err = bleve.New(path, buildIndexMapping())
		}
	}
	if err != nil {
		return nil, fmt.Errorf("failed to open/create fulltext index: %w", err)
	}
	return &Engine{index: idx}, nil
}
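
// A minimal end-to-end sketch of the engine lifecycle, shown only to
// document the intended call order. The document ID, content, metadata, and
// user name below are hypothetical; the function itself is illustrative and
// not called anywhere in the package.
func exampleEngineUsage() error {
	eng, err := newFulltextEngine("") // empty path selects an in-memory index
	if err != nil {
		return err
	}
	defer eng.Close()
	// Index a mixed CJK/Latin document visible only to user "u1".
	err = eng.AddDocument("doc-1", "全文搜索示例 full text example",
		map[string]any{"lang": "mixed"}, []string{"u1"})
	if err != nil {
		return err
	}
	// "u1" sees the document; other non-system users would not.
	results, err := eng.Search("doc-", "搜索", 10, "u1", nil)
	if err != nil {
		return err
	}
	for _, r := range results {
		fmt.Println(r.ID, r.Score, r.Preview)
	}
	return nil
}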

// buildIndexMapping stores both "content" (analyzed with gse) and "id" so
// they can be read back from search hits.
func buildIndexMapping() mapping.IndexMapping {
	idxMapping := bleve.NewIndexMapping()
	docMapping := bleve.NewDocumentMapping()
	contentMapping := bleve.NewTextFieldMapping()
	contentMapping.Analyzer = AnalyzerName
	contentMapping.Store = true
	docMapping.AddFieldMappingsAt("content", contentMapping)
	idMapping := bleve.NewTextFieldMapping()
	idMapping.Store = true
	idMapping.Index = true
	docMapping.AddFieldMappingsAt("id", idMapping)
	idxMapping.DefaultMapping = docMapping
	idxMapping.DefaultAnalyzer = AnalyzerName
	return idxMapping
}

func (e *Engine) Close() error {
	if e.index != nil {
		return e.index.Close()
	}
	return nil
}

// AddDocument indexes content under id. Each entry in allowUsers becomes a
// "U-<user>" field set to "1", which Search later matches with a term query
// to enforce per-user visibility.
func (e *Engine) AddDocument(id string, content string, metadata map[string]any, allowUsers []string) error {
	data := map[string]any{
		"id":       id,
		"content":  content,
		"metadata": metadata,
	}
	for _, u := range allowUsers {
		data["U-"+u] = "1"
	}
	return e.index.Index(id, data)
}

func (e *Engine) RemoveDocument(id string) error {
	return e.index.Delete(id)
}

// Search runs a match query over "content" (or match-all when text is
// empty), restricted to documents visible to userID and, optionally, to IDs
// starting with idPrefix. Dynamic filter conditions are evaluated in memory.
func (e *Engine) Search(idPrefix string, text string, topK int, userID string, filter []Condition) ([]SearchResult, error) {
	var mainQuery query.Query
	if text == "" {
		mainQuery = query.NewMatchAllQuery()
	} else {
		contentQuery := query.NewMatchQuery(text)
		contentQuery.SetField("content")
		mainQuery = contentQuery
	}
	if userID != "" && userID != SystemUserID {
		permQuery := query.NewTermQuery("1")
		permQuery.SetField("U-" + userID)
		mainQuery = query.NewConjunctionQuery([]query.Query{mainQuery, permQuery})
	}
	// Narrow by ID prefix inside bleve itself; idPrefix is also passed to
	// evalCondition below for the in-memory pass.
	if idPrefix != "" {
		pq := query.NewPrefixQuery(idPrefix)
		pq.SetField("id")
		mainQuery = query.NewConjunctionQuery([]query.Query{mainQuery, pq})
	}
	// Dynamic conditions are applied in memory: bleve cannot express every
	// operator over arbitrary map[string]any metadata without dedicated
	// schema indexing, so we overfetch (topK*10) and filter the hits here.
	req := bleve.NewSearchRequest(mainQuery)
	req.Size = topK * 10
	req.Fields = []string{"*"} // fetch all stored fields for in-memory filtering
	req.Highlight = bleve.NewHighlightWithStyle("html")
	res, err := e.index.Search(req)
	if err != nil {
		return nil, fmt.Errorf("search failed: %w", err)
	}
	var results []SearchResult
	for _, hit := range res.Hits {
		idVal, _ := hit.Fields["id"].(string)
		if idVal == "" {
			idVal = hit.ID
		}
		metaIfc, _ := hit.Fields["metadata"].(map[string]any)
		if metaIfc == nil {
			// Bleve flattens nested maps into dotted field names
			// ("metadata.key"), so rebuild the metadata map from those.
			metaIfc = make(map[string]any)
			for k, v := range hit.Fields {
				if strings.HasPrefix(k, "metadata.") {
					metaIfc[strings.TrimPrefix(k, "metadata.")] = v
				}
			}
		}
		if !evalCondition(metaIfc, idVal, idPrefix, filter) {
			continue
		}
		content, _ := hit.Fields["content"].(string)
		previews := []string{}
		if hit.Fragments != nil {
			if f, ok := hit.Fragments["content"]; ok {
				previews = append(previews, f...)
			}
		}
		results = append(results, SearchResult{
			ID:       idVal,
			Content:  content,
			Preview:  strings.Join(previews, " ... "),
			Score:    float32(hit.Score),
			Metadata: metaIfc,
		})
		if len(results) >= topK {
			break
		}
	}
	return results, nil
}
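
// Illustrative only: how the prefix and permission arguments compose in a
// typical call. The "doc-" prefix and query string are hypothetical, as is
// the function name; a []Condition filter (evaluated in memory by
// evalCondition) may be passed in place of nil.
func searchVisibleDocs(e *Engine, userID string) ([]SearchResult, error) {
	// Only documents indexed with userID in allowUsers match the permission
	// term query, unless userID is empty or equals SystemUserID.
	return e.Search("doc-", "全文检索", 5, userID, nil)
}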