286 lines
6.7 KiB
Go
286 lines
6.7 KiB
Go
package indexDB
|
|
|
|
import (
	"bytes"
	"compress/gzip"
	_ "embed"
	"errors"
	"fmt"
	"io"
	"strings"
	"sync"

	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/go-ego/gse"
	"golang.org/x/text/width"

	"apigo.cc/go/cast"
	"apigo.cc/go/log"
)
|
|
|
|
//go:embed dict/s_1.tgz
|
|
var indexDictMainCS []byte
|
|
|
|
//go:embed dict/t_1.tgz
|
|
var indexDictMainCT []byte
|
|
|
|
//go:embed dict/hmm_model.utf8.tgz
|
|
var indexDictHmm []byte
|
|
|
|
//go:embed dict/stop_words.utf8.tgz
|
|
var indexDictStop []byte
|
|
|
|
var (
|
|
seg gse.Segmenter
|
|
segLock sync.RWMutex
|
|
stopWords = map[string]bool{}
|
|
initOnce sync.Once
|
|
)
|
|
|
|
// AnalyzerName is the name under which initGse registers the gse-based
// analyzer with bleve's registry, and the analyzer name that
// buildIndexMapping assigns to the "content" field and to the index default.
const AnalyzerName = "gse"
|
|
|
|
// GseAnalyzer implements the bleve analysis.Analyzer interface.
|
|
type GseAnalyzer struct{}
|
|
|
|
func (a *GseAnalyzer) Analyze(input []byte) analysis.TokenStream {
|
|
if len(input) == 0 {
|
|
return nil
|
|
}
|
|
|
|
segLock.RLock()
|
|
segments := seg.Segment(input)
|
|
segLock.RUnlock()
|
|
|
|
var tokens []*analysis.Token
|
|
position := 1
|
|
for _, segment := range segments {
|
|
tokenText := segment.Token().Text()
|
|
if tokenText == "" {
|
|
continue
|
|
}
|
|
|
|
word := width.Narrow.String(strings.ToLower(tokenText))
|
|
if stopWords[word] {
|
|
continue
|
|
}
|
|
|
|
tokens = append(tokens, &analysis.Token{
|
|
Term: []byte(word),
|
|
Start: segment.Start(),
|
|
End: segment.End(),
|
|
Position: position,
|
|
Type: analysis.Ideographic,
|
|
})
|
|
position++
|
|
}
|
|
|
|
return tokens
|
|
}
|
|
|
|
func analyzerConstructor(config map[string]any, cache *registry.Cache) (analysis.Analyzer, error) {
|
|
return &GseAnalyzer{}, nil
|
|
}
|
|
|
|
// gunzip decompresses a gzip-encoded byte slice and returns the decoded bytes.
//
// It returns nil when data does not begin with a valid gzip header, and also
// when the stream cannot be read to completion (truncated or corrupt input),
// so callers never receive silently truncated output.
func gunzip(data []byte) []byte {
	r, err := gzip.NewReader(bytes.NewReader(data))
	if err != nil {
		return nil
	}
	defer r.Close()

	out, err := io.ReadAll(r)
	if err != nil {
		// A damaged stream is reported the same way as an unreadable header
		// instead of handing back whatever was decoded before the failure.
		return nil
	}
	return out
}
|
|
|
|
func initGse() {
|
|
initOnce.Do(func() {
|
|
model := make(map[rune]float64)
|
|
lines := strings.Split(string(gunzip(indexDictHmm)), "\n")
|
|
for _, line := range lines {
|
|
if line = strings.TrimSpace(line); line != "" && line[0] != '#' {
|
|
for _, item := range strings.Split(line, ",") {
|
|
if parts := strings.SplitN(item, ":", 2); len(parts) == 2 {
|
|
if key := []rune(parts[0]); len(key) > 0 {
|
|
model[key[0]] = cast.To[float64](parts[1])
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
stopWordsText := string(gunzip(indexDictStop))
|
|
for _, word := range strings.Split(stopWordsText, "\n") {
|
|
w := strings.TrimSpace(word)
|
|
if w != "" {
|
|
stopWords[w] = true
|
|
}
|
|
}
|
|
stopWords[" "] = true
|
|
|
|
var err error
|
|
if seg, err = gse.NewEmbed(string(gunzip(indexDictMainCS)), string(gunzip(indexDictMainCT))); err == nil {
|
|
seg.SkipLog = true
|
|
seg.LoadModel(model)
|
|
} else {
|
|
log.DefaultLogger.Error("gse dict load failed", "err", err)
|
|
}
|
|
|
|
registry.RegisterAnalyzer(AnalyzerName, analyzerConstructor)
|
|
})
|
|
}
|
|
|
|
type Engine struct {
|
|
index bleve.Index
|
|
}
|
|
|
|
func newFulltextEngine(path string) (*Engine, error) {
|
|
initGse()
|
|
|
|
var idx bleve.Index
|
|
var err error
|
|
|
|
if path == "" {
|
|
idx, err = bleve.NewMemOnly(buildIndexMapping())
|
|
} else {
|
|
idx, err = bleve.Open(path)
|
|
if err == bleve.ErrorIndexPathDoesNotExist {
|
|
idx, err = bleve.New(path, buildIndexMapping())
|
|
}
|
|
}
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to open/create fulltext index: %w", err)
|
|
}
|
|
|
|
return &Engine{index: idx}, nil
|
|
}
|
|
|
|
func buildIndexMapping() mapping.IndexMapping {
|
|
mapping := bleve.NewIndexMapping()
|
|
docMapping := bleve.NewDocumentMapping()
|
|
|
|
contentMapping := bleve.NewTextFieldMapping()
|
|
contentMapping.Analyzer = AnalyzerName
|
|
contentMapping.Store = true
|
|
docMapping.AddFieldMappingsAt("content", contentMapping)
|
|
|
|
idMapping := bleve.NewTextFieldMapping()
|
|
idMapping.Analyzer = "keyword"
|
|
idMapping.Store = true
|
|
idMapping.Index = true
|
|
docMapping.AddFieldMappingsAt("id", idMapping)
|
|
|
|
mapping.DefaultMapping = docMapping
|
|
mapping.DefaultAnalyzer = AnalyzerName
|
|
|
|
return mapping
|
|
}
|
|
|
|
func (e *Engine) Close() error {
|
|
if e.index != nil {
|
|
return e.index.Close()
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (e *Engine) AddDocument(id string, content string, metadata map[string]any, allowUsers []string) error {
|
|
data := map[string]any{
|
|
"id": id,
|
|
"content": content,
|
|
"metadata": metadata,
|
|
}
|
|
for _, u := range allowUsers {
|
|
data["U-"+u] = "1"
|
|
}
|
|
return e.index.Index(id, data)
|
|
}
|
|
|
|
func (e *Engine) RemoveDocument(id string) error {
|
|
return e.index.Delete(id)
|
|
}
|
|
|
|
func (e *Engine) Search(idPrefix string, text string, topK int, userID string, filter []Condition) ([]SearchResult, error) {
|
|
var mainQuery query.Query
|
|
if text == "" {
|
|
mainQuery = query.NewMatchAllQuery()
|
|
} else {
|
|
contentQuery := query.NewMatchQuery(text)
|
|
contentQuery.SetField("content")
|
|
mainQuery = contentQuery
|
|
}
|
|
|
|
if userID != "" && userID != SystemUserID {
|
|
permQuery := query.NewTermQuery("1")
|
|
permQuery.SetField("U-" + userID)
|
|
mainQuery = query.NewConjunctionQuery([]query.Query{mainQuery, permQuery})
|
|
}
|
|
|
|
// We apply idPrefix logic directly to bleve if it's long enough, but it's simpler to overfetch and filter.
|
|
// We'll fetch topK*2 initially from bleve just in case, or apply PrefixQuery on "id".
|
|
if idPrefix != "" {
|
|
pq := query.NewPrefixQuery(idPrefix)
|
|
pq.SetField("id")
|
|
mainQuery = query.NewConjunctionQuery([]query.Query{mainQuery, pq})
|
|
}
|
|
|
|
// For dynamic conditions, we apply them in memory to ensure accuracy for complex operators.
|
|
// Since complex operators aren't all supported natively by Bleve without complex schema indexing,
|
|
// memory filtering is safer for arbitrary map[string]any. We fetch more documents to compensate.
|
|
req := bleve.NewSearchRequest(mainQuery)
|
|
req.Size = topK * 10
|
|
req.Fields = []string{"*"} // Fetch all fields for memory filtering
|
|
req.Highlight = bleve.NewHighlightWithStyle("html")
|
|
|
|
res, err := e.index.Search(req)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("search failed: %w", err)
|
|
}
|
|
|
|
var results []SearchResult
|
|
for _, hit := range res.Hits {
|
|
idVal, _ := hit.Fields["id"].(string)
|
|
if idVal == "" {
|
|
idVal = hit.ID
|
|
}
|
|
|
|
metaIfc, _ := hit.Fields["metadata"].(map[string]any)
|
|
if metaIfc == nil {
|
|
metaIfc = make(map[string]any)
|
|
// Bleve unwraps nested structures sometimes, let's reconstruct or simply trust fields.
|
|
for k, v := range hit.Fields {
|
|
if strings.HasPrefix(k, "metadata.") {
|
|
metaIfc[strings.TrimPrefix(k, "metadata.")] = v
|
|
}
|
|
}
|
|
}
|
|
|
|
if !evalCondition(metaIfc, idVal, idPrefix, filter) {
|
|
continue
|
|
}
|
|
|
|
content, _ := hit.Fields["content"].(string)
|
|
previews := []string{}
|
|
if hit.Fragments != nil {
|
|
if f, ok := hit.Fragments["content"]; ok {
|
|
previews = append(previews, f...)
|
|
}
|
|
}
|
|
|
|
results = append(results, SearchResult{
|
|
ID: idVal,
|
|
Content: content,
|
|
Preview: strings.Join(previews, " ... "),
|
|
Score: float32(hit.Score),
|
|
Metadata: metaIfc,
|
|
})
|
|
|
|
if len(results) >= topK {
|
|
break
|
|
}
|
|
}
|
|
|
|
return results, nil
|
|
}
|