1
1
mirror of https://github.com/go-gitea/gitea synced 2025-01-11 10:14:27 +00:00
Bruno Sofiato f64fbd9b74
Updated tokenizer to better matching when search for code snippets ()
This PR improves the accuracy of Gitea's code search. 

Currently, Gitea does not consider statements such as
`onsole.log("hello")` as hits when the user searches for `log`. The
culprit is how both ES and Bleve are tokenizing the file contents (in
both cases, `console.log` is a whole token).

In ES' case, we changed the tokenizer to
[simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html#:~:text=The%20simple_pattern_split%20tokenizer%20uses%20a,the%20tokenization%20is%20generally%20faster.).
In such a case, tokens are words formed by digits and letters. In
Bleve's case, it employs a
[letter](https://blevesearch.com/docs/Tokenizers/) tokenizer.

Resolves 

---------

Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
2024-11-06 20:51:20 +00:00

384 lines
12 KiB
Go

// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package bleve
import (
"bufio"
"context"
"fmt"
"io"
"strconv"
"strings"
"time"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/timeutil"
"code.gitea.io/gitea/modules/typesniffer"
"github.com/blevesearch/bleve/v2"
analyzer_custom "github.com/blevesearch/bleve/v2/analysis/analyzer/custom"
analyzer_keyword "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword"
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/go-enry/go-enry/v2"
)
const (
unicodeNormalizeName = "unicodeNormalize"
maxBatchSize = 16
)
func addUnicodeNormalizeTokenFilter(m *mapping.IndexMappingImpl) error {
return m.AddCustomTokenFilter(unicodeNormalizeName, map[string]any{
"type": unicodenorm.Name,
"form": unicodenorm.NFC,
})
}
// RepoIndexerData data stored in the repo indexer
type RepoIndexerData struct {
RepoID int64
CommitID string
Content string
Filename string
Language string
UpdatedAt time.Time
}
// Type returns the document type, for bleve's mapping.Classifier interface.
func (d *RepoIndexerData) Type() string {
return repoIndexerDocType
}
const (
repoIndexerAnalyzer = "repoIndexerAnalyzer"
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
filenameIndexerTokenizer = "filenameIndexerTokenizer"
repoIndexerDocType = "repoIndexerDocType"
repoIndexerLatestVersion = 8
)
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
func generateBleveIndexMapping() (mapping.IndexMapping, error) {
docMapping := bleve.NewDocumentMapping()
numericFieldMapping := bleve.NewNumericFieldMapping()
numericFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("RepoID", numericFieldMapping)
textFieldMapping := bleve.NewTextFieldMapping()
textFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
fileNamedMapping := bleve.NewTextFieldMapping()
fileNamedMapping.IncludeInAll = false
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
termFieldMapping := bleve.NewTextFieldMapping()
termFieldMapping.IncludeInAll = false
termFieldMapping.Analyzer = analyzer_keyword.Name
docMapping.AddFieldMappingsAt("Language", termFieldMapping)
docMapping.AddFieldMappingsAt("CommitID", termFieldMapping)
timeFieldMapping := bleve.NewDateTimeFieldMapping()
timeFieldMapping.IncludeInAll = false
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
mapping := bleve.NewIndexMapping()
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
return nil, err
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": letter.Name,
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
}); err != nil {
return nil, err
}
if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": unicode.Name,
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
}); err != nil {
return nil, err
}
mapping.DefaultAnalyzer = repoIndexerAnalyzer
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
return mapping, nil
}
var _ internal.Indexer = &Indexer{}
// Indexer represents a bleve indexer implementation
type Indexer struct {
inner *inner_bleve.Indexer
indexer_internal.Indexer // do not composite inner_bleve.Indexer directly to avoid exposing too much
}
// NewIndexer creates a new bleve local indexer
func NewIndexer(indexDir string) *Indexer {
inner := inner_bleve.NewIndexer(indexDir, repoIndexerLatestVersion, generateBleveIndexMapping)
return &Indexer{
Indexer: inner,
inner: inner,
}
}
func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserError, batchReader *bufio.Reader, commitSha string,
update internal.FileUpdate, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch,
) error {
// Ignore vendored files in code search
if setting.Indexer.ExcludeVendored && analyze.IsVendor(update.Filename) {
return nil
}
size := update.Size
var err error
if !update.Sized {
var stdout string
stdout, _, err = git.NewCommand(ctx, "cat-file", "-s").AddDynamicArguments(update.BlobSha).RunStdString(&git.RunOpts{Dir: repo.RepoPath()})
if err != nil {
return err
}
if size, err = strconv.ParseInt(strings.TrimSpace(stdout), 10, 64); err != nil {
return fmt.Errorf("misformatted git cat-file output: %w", err)
}
}
if size > setting.Indexer.MaxIndexerFileSize {
return b.addDelete(update.Filename, repo, batch)
}
if _, err := batchWriter.Write([]byte(update.BlobSha + "\n")); err != nil {
return err
}
_, _, size, err = git.ReadBatchLine(batchReader)
if err != nil {
return err
}
fileContents, err := io.ReadAll(io.LimitReader(batchReader, size))
if err != nil {
return err
} else if !typesniffer.DetectContentType(fileContents).IsText() {
// FIXME: UTF-16 files will probably fail here
return nil
}
if _, err = batchReader.Discard(1); err != nil {
return err
}
id := internal.FilenameIndexerID(repo.ID, update.Filename)
return batch.Index(id, &RepoIndexerData{
RepoID: repo.ID,
CommitID: commitSha,
Filename: update.Filename,
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
UpdatedAt: time.Now().UTC(),
})
}
func (b *Indexer) addDelete(filename string, repo *repo_model.Repository, batch *inner_bleve.FlushingBatch) error {
id := internal.FilenameIndexerID(repo.ID, filename)
return batch.Delete(id)
}
// Index indexes the data
func (b *Indexer) Index(ctx context.Context, repo *repo_model.Repository, sha string, changes *internal.RepoChanges) error {
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
if len(changes.Updates) > 0 {
r, err := gitrepo.OpenRepository(ctx, repo)
if err != nil {
return err
}
defer r.Close()
gitBatch, err := r.NewBatch(ctx)
if err != nil {
return err
}
defer gitBatch.Close()
for _, update := range changes.Updates {
if err := b.addUpdate(ctx, gitBatch.Writer, gitBatch.Reader, sha, update, repo, batch); err != nil {
return err
}
}
gitBatch.Close()
}
for _, filename := range changes.RemovedFilenames {
if err := b.addDelete(filename, repo, batch); err != nil {
return err
}
}
return batch.Flush()
}
// Delete deletes indexes by ids
func (b *Indexer) Delete(_ context.Context, repoID int64) error {
query := inner_bleve.NumericEqualityQuery(repoID, "RepoID")
searchRequest := bleve.NewSearchRequestOptions(query, 2147483647, 0, false)
result, err := b.inner.Indexer.Search(searchRequest)
if err != nil {
return err
}
batch := inner_bleve.NewFlushingBatch(b.inner.Indexer, maxBatchSize)
for _, hit := range result.Hits {
if err = batch.Delete(hit.ID); err != nil {
return err
}
}
return batch.Flush()
}
// Search searches for files in the specified repo.
// Returns the matching file-paths
func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
var (
indexerQuery query.Query
keywordQuery query.Query
)
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
pathQuery.FieldVal = "Filename"
pathQuery.SetBoost(10)
contentQuery := bleve.NewMatchQuery(opts.Keyword)
contentQuery.FieldVal = "Content"
if opts.IsKeywordFuzzy {
contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
}
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
if len(opts.RepoIDs) > 0 {
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
for _, repoID := range opts.RepoIDs {
repoQueries = append(repoQueries, inner_bleve.NumericEqualityQuery(repoID, "RepoID"))
}
indexerQuery = bleve.NewConjunctionQuery(
bleve.NewDisjunctionQuery(repoQueries...),
keywordQuery,
)
} else {
indexerQuery = keywordQuery
}
// Save for reuse without language filter
facetQuery := indexerQuery
if len(opts.Language) > 0 {
languageQuery := bleve.NewMatchQuery(opts.Language)
languageQuery.FieldVal = "Language"
languageQuery.Analyzer = analyzer_keyword.Name
indexerQuery = bleve.NewConjunctionQuery(
indexerQuery,
languageQuery,
)
}
from, pageSize := opts.GetSkipTake()
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
searchRequest.IncludeLocations = true
if len(opts.Language) == 0 {
searchRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
}
searchRequest.SortBy([]string{"-_score", "UpdatedAt"})
result, err := b.inner.Indexer.SearchInContext(ctx, searchRequest)
if err != nil {
return 0, nil, nil, err
}
total := int64(result.Total)
searchResults := make([]*internal.SearchResult, len(result.Hits))
for i, hit := range result.Hits {
startIndex, endIndex := -1, -1
for _, locations := range hit.Locations["Content"] {
location := locations[0]
locationStart := int(location.Start)
locationEnd := int(location.End)
if startIndex < 0 || locationStart < startIndex {
startIndex = locationStart
}
if endIndex < 0 || locationEnd > endIndex {
endIndex = locationEnd
}
}
if len(hit.Locations["Filename"]) > 0 {
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
}
language := hit.Fields["Language"].(string)
var updatedUnix timeutil.TimeStamp
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
updatedUnix = timeutil.TimeStamp(t.Unix())
}
searchResults[i] = &internal.SearchResult{
RepoID: int64(hit.Fields["RepoID"].(float64)),
StartIndex: startIndex,
EndIndex: endIndex,
Filename: internal.FilenameOfIndexerID(hit.ID),
Content: hit.Fields["Content"].(string),
CommitID: hit.Fields["CommitID"].(string),
UpdatedUnix: updatedUnix,
Language: language,
Color: enry.GetColor(language),
}
}
searchResultLanguages := make([]*internal.SearchResultLanguages, 0, 10)
if len(opts.Language) > 0 {
// Use separate query to go get all language counts
facetRequest := bleve.NewSearchRequestOptions(facetQuery, 1, 0, false)
facetRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
facetRequest.IncludeLocations = true
facetRequest.AddFacet("languages", bleve.NewFacetRequest("Language", 10))
if result, err = b.inner.Indexer.Search(facetRequest); err != nil {
return 0, nil, nil, err
}
}
languagesFacet := result.Facets["languages"]
for _, term := range languagesFacet.Terms.Terms() {
if len(term.Term) == 0 {
continue
}
searchResultLanguages = append(searchResultLanguages, &internal.SearchResultLanguages{
Language: term.Term,
Color: enry.GetColor(term.Term),
Count: term.Count,
})
}
return total, searchResults, searchResultLanguages, nil
}