// Package indexDB provides a bleve-backed full-text index whose text fields
// are tokenized with the gse segmenter.
package indexDB

import (
	"bytes"
	"compress/gzip"
	_ "embed"
	"errors"
	"fmt"
	"io"
	"strings"
	"sync"

	"apigo.cc/go/cast"
	"apigo.cc/go/log"
	"github.com/blevesearch/bleve/v2"
	"github.com/blevesearch/bleve/v2/analysis"
	"github.com/blevesearch/bleve/v2/mapping"
	"github.com/blevesearch/bleve/v2/registry"
	"github.com/blevesearch/bleve/v2/search/query"
	"github.com/go-ego/gse"
	"golang.org/x/text/width"
)

// Gzip-compressed dictionary data embedded at build time. Each blob is
// decompressed with gunzip inside initGse.

//go:embed dict/s_1.tgz
var indexDictMainCS []byte

//go:embed dict/t_1.tgz
var indexDictMainCT []byte

//go:embed dict/hmm_model.utf8.tgz
var indexDictHmm []byte

//go:embed dict/stop_words.utf8.tgz
var indexDictStop []byte

var (
	// seg is the shared segmenter; it is populated once by initGse.
	seg gse.Segmenter
	// segLock is read-locked around every seg.Segment call in Analyze.
	segLock sync.RWMutex
	// stopWords holds the terms that Analyze drops; it is filled by initGse.
	stopWords = map[string]bool{}
	// initOnce makes initGse idempotent.
	initOnce sync.Once
)

// AnalyzerName is the name under which GseAnalyzer is registered with bleve.
const AnalyzerName = "gse"

// GseAnalyzer implements the bleve analysis.Analyzer interface.
type GseAnalyzer struct{}

// Analyze segments input with the shared gse segmenter and returns one token
// per surviving segment. Each term is lower-cased and width-narrowed; empty
// segments, whitespace-only segments and stop words are dropped. Positions
// start at 1 and count only the tokens that are kept. A nil stream is
// returned for empty input.
func (a *GseAnalyzer) Analyze(input []byte) analysis.TokenStream {
	if len(input) == 0 {
		return nil
	}

	segLock.RLock()
	segments := seg.Segment(input)
	segLock.RUnlock()

	var tokens []*analysis.Token
	position := 1
	for _, segment := range segments {
		tokenText := segment.Token().Text()
		if tokenText == "" {
			continue
		}
		word := width.Narrow.String(strings.ToLower(tokenText))
		// The stop list only contains a single " "; also drop tabs, newlines
		// and runs of spaces so that pure whitespace never becomes a term.
		if strings.TrimSpace(word) == "" || stopWords[word] {
			continue
		}
		tokens = append(tokens, &analysis.Token{
			Term:     []byte(word),
			Start:    segment.Start(),
			End:      segment.End(),
			Position: position,
			Type:     analysis.Ideographic,
		})
		position++
	}
	return tokens
}

// analyzerConstructor is the factory handed to the bleve registry. It ignores
// both arguments and always returns a fresh GseAnalyzer.
func analyzerConstructor(config map[string]any, cache *registry.Cache) (analysis.Analyzer, error) {
	return &GseAnalyzer{}, nil
}

// gunzip decompresses gzip data. It returns nil when the gzip header cannot
// be read, and whatever was decoded so far when reading fails midway. Both
// failures are logged instead of being dropped silently.
func gunzip(data []byte) []byte {
	r, err := gzip.NewReader(bytes.NewReader(data))
	if err != nil {
		log.DefaultLogger.Error("gse dict gunzip failed", "err", err)
		return nil
	}
	defer r.Close()

	out, err := io.ReadAll(r)
	if err != nil {
		log.DefaultLogger.Error("gse dict gunzip read failed", "err", err)
	}
	return out
}

// parseHmmModel parses the HMM model text: each non-empty line that does not
// start with '#' is a comma-separated list of "key:value" items, and the
// first rune of key is mapped to value converted to float64.
func parseHmmModel(text string) map[rune]float64 {
	model := make(map[rune]float64)
	for _, line := range strings.Split(text, "\n") {
		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		for _, item := range strings.Split(line, ",") {
			key, value, found := strings.Cut(item, ":")
			if !found {
				continue
			}
			if runes := []rune(key); len(runes) > 0 {
				model[runes[0]] = cast.To[float64](value)
			}
		}
	}
	return model
}

// addStopWords adds every non-blank, trimmed line of text to stopWords.
func addStopWords(text string) {
	for _, line := range strings.Split(text, "\n") {
		if w := strings.TrimSpace(line); w != "" {
			stopWords[w] = true
		}
	}
}

// initGse loads the embedded dictionaries, HMM model and stop words, then
// registers the analyzer with bleve under AnalyzerName. The work runs at most
// once per process. A dictionary load failure is logged and the analyzer is
// still registered.
func initGse() {
	initOnce.Do(func() {
		model := parseHmmModel(string(gunzip(indexDictHmm)))

		addStopWords(string(gunzip(indexDictStop)))
		// TrimSpace in addStopWords can never yield " ", so add it explicitly.
		stopWords[" "] = true

		var err error
		if seg, err = gse.NewEmbed(string(gunzip(indexDictMainCS)), string(gunzip(indexDictMainCT))); err == nil {
			seg.SkipLog = true
			seg.LoadModel(model)
		} else {
			log.DefaultLogger.Error("gse dict load failed", "err", err)
		}

		registry.RegisterAnalyzer(AnalyzerName, analyzerConstructor)
	})
}

// Engine wraps a bleve index.
type Engine struct {
	index bleve.Index
}

// newFulltextEngine returns an Engine backed by an in-memory index when path
// is empty. Otherwise it opens the index at path, creating it with
// buildIndexMapping when bleve reports that the path does not exist.
func newFulltextEngine(path string) (*Engine, error) {
	initGse()

	var (
		idx bleve.Index
		err error
	)
	if path == "" {
		idx, err = bleve.NewMemOnly(buildIndexMapping())
	} else {
		idx, err = bleve.Open(path)
		if errors.Is(err, bleve.ErrorIndexPathDoesNotExist) {
			idx, err = bleve.New(path, buildIndexMapping())
		}
	}
	if err != nil {
		return nil, fmt.Errorf("failed to open/create fulltext index: %w", err)
	}
	return &Engine{index: idx}, nil
}

// buildIndexMapping builds the mapping used for newly created indexes: a
// stored "content" text field analyzed with AnalyzerName, a stored and
// indexed "id" text field, and AnalyzerName as the default analyzer.
func buildIndexMapping() mapping.IndexMapping {
	// Named indexMapping so the imported mapping package is not shadowed.
	indexMapping := bleve.NewIndexMapping()
	docMapping := bleve.NewDocumentMapping()

	contentMapping := bleve.NewTextFieldMapping()
	contentMapping.Analyzer = AnalyzerName
	contentMapping.Store = true
	docMapping.AddFieldMappingsAt("content", contentMapping)

	// NOTE(review): no analyzer is set on "id", so it presumably falls back
	// to DefaultAnalyzer (gse), while Search runs an un-analyzed PrefixQuery
	// against this field. Confirm that ids always tokenize to a single,
	// unchanged term, or consider the keyword analyzer for this field.
	idMapping := bleve.NewTextFieldMapping()
	idMapping.Store = true
	idMapping.Index = true
	docMapping.AddFieldMappingsAt("id", idMapping)

	indexMapping.DefaultMapping = docMapping
	indexMapping.DefaultAnalyzer = AnalyzerName
	return indexMapping
}

// Close closes the underlying index. It is a no-op when the index is nil.
func (e *Engine) Close() error {
	if e.index != nil {
		return e.index.Close()
	}
	return nil
}

// AddDocument indexes a document under id with its content and metadata.
// For every entry u of allowUsers a field "U-"+u with value "1" is added;
// Search matches on that field to restrict results to a user.
func (e *Engine) AddDocument(id string, content string, metadata map[string]any, allowUsers []string) error {
	data := map[string]any{
		"id":       id,
		"content":  content,
		"metadata": metadata,
	}
	for _, u := range allowUsers {
		data["U-"+u] = "1"
	}
	return e.index.Index(id, data)
}

// RemoveDocument deletes the document stored under id.
func (e *Engine) RemoveDocument(id string) error {
	return e.index.Delete(id)
}

// Search returns at most topK results.
//
// text is matched against the "content" field; an empty text matches all
// documents. When userID is non-empty and differs from SystemUserID, only
// documents carrying the "U-"+userID field are considered. A non-empty
// idPrefix adds a prefix query on the "id" field. filter is evaluated in
// memory by evalCondition against each hit's metadata.
//
// A non-positive topK yields no results and no error.
func (e *Engine) Search(idPrefix string, text string, topK int, userID string, filter []Condition) ([]SearchResult, error) {
	// Guard against a zero or negative request size being sent to bleve.
	if topK <= 0 {
		return nil, nil
	}

	var mainQuery query.Query
	if text == "" {
		mainQuery = query.NewMatchAllQuery()
	} else {
		contentQuery := query.NewMatchQuery(text)
		contentQuery.SetField("content")
		mainQuery = contentQuery
	}

	if userID != "" && userID != SystemUserID {
		permQuery := query.NewTermQuery("1")
		permQuery.SetField("U-" + userID)
		mainQuery = query.NewConjunctionQuery([]query.Query{mainQuery, permQuery})
	}

	// The id prefix is pushed down to bleve as a PrefixQuery on "id"; it is
	// also passed to evalCondition below.
	if idPrefix != "" {
		pq := query.NewPrefixQuery(idPrefix)
		pq.SetField("id")
		mainQuery = query.NewConjunctionQuery([]query.Query{mainQuery, pq})
	}

	// Dynamic conditions are applied in memory, since not every operator maps
	// onto a bleve query without extra schema work. To compensate for hits
	// rejected by that filter, topK*10 hits are requested.
	req := bleve.NewSearchRequest(mainQuery)
	req.Size = topK * 10
	req.Fields = []string{"*"} // Fetch all fields for memory filtering.
	req.Highlight = bleve.NewHighlightWithStyle("html")

	res, err := e.index.Search(req)
	if err != nil {
		return nil, fmt.Errorf("search failed: %w", err)
	}

	var results []SearchResult
	for _, hit := range res.Hits {
		idVal, _ := hit.Fields["id"].(string)
		if idVal == "" {
			idVal = hit.ID
		}

		metaIfc, _ := hit.Fields["metadata"].(map[string]any)
		if metaIfc == nil {
			// No nested "metadata" value came back; rebuild it from the
			// flattened "metadata.<key>" fields.
			metaIfc = make(map[string]any)
			for k, v := range hit.Fields {
				if strings.HasPrefix(k, "metadata.") {
					metaIfc[strings.TrimPrefix(k, "metadata.")] = v
				}
			}
		}

		if !evalCondition(metaIfc, idVal, idPrefix, filter) {
			continue
		}

		content, _ := hit.Fields["content"].(string)

		var previews []string
		if f, ok := hit.Fragments["content"]; ok {
			previews = append(previews, f...)
		}

		results = append(results, SearchResult{
			ID:       idVal,
			Content:  content,
			Preview:  strings.Join(previews, " ... "),
			Score:    float32(hit.Score),
			Metadata: metaIfc,
		})
		if len(results) >= topK {
			break
		}
	}
	return results, nil
}