From 950abfe8ee7ea4b72dcc71c75b393b154f3dcaa2 Mon Sep 17 00:00:00 2001 From: Darren Hoo Date: Sun, 16 Feb 2025 18:28:06 +0800 Subject: [PATCH] enable literal string for code search (#33590) Close: #33588 --------- Co-authored-by: wxiaoguang Co-authored-by: Giteabot --- modules/indexer/code/bleve/bleve.go | 21 +++++-- .../code/elasticsearch/elasticsearch.go | 17 ++++-- modules/indexer/code/gitgrep/gitgrep.go | 59 +++++++++++++++++++ .../indexer/code/gitgrep/gitgrep_test.go | 2 +- modules/indexer/code/indexer.go | 6 +- modules/indexer/code/internal/util.go | 10 +++- modules/indexer/code/internal/util_test.go | 30 ++++++++++ routers/web/repo/search.go | 46 ++------------- 8 files changed, 135 insertions(+), 56 deletions(-) create mode 100644 modules/indexer/code/gitgrep/gitgrep.go rename routers/web/repo/search_test.go => modules/indexer/code/gitgrep/gitgrep_test.go (97%) create mode 100644 modules/indexer/code/internal/util_test.go diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 772317fa59..963c151a05 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -260,17 +260,28 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int var ( indexerQuery query.Query keywordQuery query.Query + contentQuery query.Query ) pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword)) pathQuery.FieldVal = "Filename" pathQuery.SetBoost(10) - contentQuery := bleve.NewMatchQuery(opts.Keyword) - contentQuery.FieldVal = "Content" - - if opts.IsKeywordFuzzy { - contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) + keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword) + if isPhrase { + q := bleve.NewMatchPhraseQuery(keywordAsPhrase) + q.FieldVal = "Content" + if opts.IsKeywordFuzzy { + q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase) + } + contentQuery = q + } else { + q := bleve.NewMatchQuery(opts.Keyword) + q.FieldVal = "Content" + if opts.IsKeywordFuzzy { + q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword) + } + contentQuery = q } keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery) diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index 1c4dd39eff..5e4b2c56f2 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -24,6 +24,7 @@ import ( "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/timeutil" "code.gitea.io/gitea/modules/typesniffer" + "code.gitea.io/gitea/modules/util" "github.com/go-enry/go-enry/v2" "github.com/olivere/elastic/v7" @@ -359,13 +360,19 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan // Search searches for codes and language stats by given conditions. func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) { - searchType := esMultiMatchTypePhrasePrefix - if opts.IsKeywordFuzzy { - searchType = esMultiMatchTypeBestFields + var contentQuery elastic.Query + keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword) + if isPhrase { + contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase) + } else { + // TODO: this is the old logic, but not really using "fuzziness" + // * IsKeywordFuzzy=true: "best_fields" + // * IsKeywordFuzzy=false: "phrase_prefix" + contentQuery = elastic.NewMultiMatchQuery("content", opts.Keyword). + Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix)) } - kwQuery := elastic.NewBoolQuery().Should( - elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType), + contentQuery, elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix), ) query := elastic.NewBoolQuery() diff --git a/modules/indexer/code/gitgrep/gitgrep.go b/modules/indexer/code/gitgrep/gitgrep.go new file mode 100644 index 0000000000..a85c9d02a5 --- /dev/null +++ b/modules/indexer/code/gitgrep/gitgrep.go @@ -0,0 +1,59 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package gitgrep + +import ( + "context" + "fmt" + "strings" + + "code.gitea.io/gitea/modules/git" + code_indexer "code.gitea.io/gitea/modules/indexer/code" + "code.gitea.io/gitea/modules/setting" +) + +func indexSettingToGitGrepPathspecList() (list []string) { + for _, expr := range setting.Indexer.IncludePatterns { + list = append(list, ":(glob)"+expr.PatternString()) + } + for _, expr := range setting.Indexer.ExcludePatterns { + list = append(list, ":(glob,exclude)"+expr.PatternString()) + } + return list +} + +func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) { + // TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior + res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{ + ContextLineNumber: 1, + IsFuzzy: isFuzzy, + RefName: ref.String(), + PathspecList: indexSettingToGitGrepPathspecList(), + }) + if err != nil { + // TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree. + return nil, 0, fmt.Errorf("git.GrepSearch: %w", err) + } + commitID, err := gitRepo.GetRefCommitID(ref.String()) + if err != nil { + return nil, 0, fmt.Errorf("gitRepo.GetRefCommitID: %w", err) + } + + total = len(res) + pageStart := min((page-1)*setting.UI.RepoSearchPagingNum, len(res)) + pageEnd := min(page*setting.UI.RepoSearchPagingNum, len(res)) + res = res[pageStart:pageEnd] + for _, r := range res { + searchResults = append(searchResults, &code_indexer.Result{ + RepoID: repoID, + Filename: r.Filename, + CommitID: commitID, + // UpdatedUnix: not supported yet + // Language: not supported yet + // Color: not supported yet + Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")), + }) + } + return searchResults, total, nil +} diff --git a/routers/web/repo/search_test.go b/modules/indexer/code/gitgrep/gitgrep_test.go similarity index 97% rename from routers/web/repo/search_test.go rename to modules/indexer/code/gitgrep/gitgrep_test.go index 33a1610384..97dda9d966 100644 --- a/routers/web/repo/search_test.go +++ b/modules/indexer/code/gitgrep/gitgrep_test.go @@ -1,7 +1,7 @@ // Copyright 2024 The Gitea Authors. All rights reserved. // SPDX-License-Identifier: MIT -package repo +package gitgrep import ( "testing" diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index 728b37fab6..38fd10dae7 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -29,13 +29,11 @@ var ( // When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready. // So it's always safe use it as *globalIndexer.Load() and call its methods. globalIndexer atomic.Pointer[internal.Indexer] - dummyIndexer *internal.Indexer ) func init() { - i := internal.NewDummyIndexer() - dummyIndexer = &i - globalIndexer.Store(dummyIndexer) + dummyIndexer := internal.NewDummyIndexer() + globalIndexer.Store(&dummyIndexer) } func index(ctx context.Context, indexer internal.Indexer, repoID int64) error { diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go index 5b95783d9f..46e631166d 100644 --- a/modules/indexer/code/internal/util.go +++ b/modules/indexer/code/internal/util.go @@ -35,7 +35,7 @@ func FilenameOfIndexerID(indexerID string) string { return indexerID[index+1:] } -// Given the contents of file, returns the boundaries of its first seven lines. +// FilenameMatchIndexPos returns the boundaries of its first seven lines. func FilenameMatchIndexPos(content string) (int, int) { count := 1 for i, c := range content { @@ -48,3 +48,11 @@ func FilenameMatchIndexPos(content string) (int, int) { } return 0, len(content) } + +func ParseKeywordAsPhrase(keyword string) (string, bool) { + if strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) && len(keyword) > 1 { + // only remove the prefix and suffix quotes, no need to decode the content at the moment + return keyword[1 : len(keyword)-1], true + } + return "", false +} diff --git a/modules/indexer/code/internal/util_test.go b/modules/indexer/code/internal/util_test.go new file mode 100644 index 0000000000..457936296b --- /dev/null +++ b/modules/indexer/code/internal/util_test.go @@ -0,0 +1,30 @@ +// Copyright 2025 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package internal + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestParseKeywordAsPhrase(t *testing.T) { + cases := []struct { + keyword string + phrase string + isPhrase bool + }{ + {``, "", false}, + {`a`, "", false}, + {`"`, "", false}, + {`"a`, "", false}, + {`"a"`, "a", true}, + {`""\"""`, `"\""`, true}, + } + for _, c := range cases { + phrase, isPhrase := ParseKeywordAsPhrase(c.keyword) + assert.Equal(t, c.phrase, phrase, "keyword=%q", c.keyword) + assert.Equal(t, c.isPhrase, isPhrase, "keyword=%q", c.keyword) + } +} diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go index bbbe5c1081..ea40e64bbb 100644 --- a/routers/web/repo/search.go +++ b/routers/web/repo/search.go @@ -5,11 +5,11 @@ package repo import ( "net/http" - "strings" "code.gitea.io/gitea/models/db" "code.gitea.io/gitea/modules/git" code_indexer "code.gitea.io/gitea/modules/indexer/code" + "code.gitea.io/gitea/modules/indexer/code/gitgrep" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/templates" "code.gitea.io/gitea/routers/common" @@ -18,16 +18,6 @@ import ( const tplSearch templates.TplName = "repo/search" -func indexSettingToGitGrepPathspecList() (list []string) { - for _, expr := range setting.Indexer.IncludePatterns { - list = append(list, ":(glob)"+expr.PatternString()) - } - for _, expr := range setting.Indexer.ExcludePatterns { - list = append(list, ":(glob,exclude)"+expr.PatternString()) - } - return list -} - // Search render repository search page func Search(ctx *context.Context) { ctx.Data["PageIsViewCode"] = true @@ -67,38 +57,14 @@ func Search(ctx *context.Context) { ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx) } } else { - searchRefName := git.RefNameFromBranch(ctx.Repo.Repository.DefaultBranch) // BranchName should be default branch or the first existing branch - res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{ - ContextLineNumber: 1, - IsFuzzy: prepareSearch.IsFuzzy, - RefName: searchRefName.String(), - PathspecList: indexSettingToGitGrepPathspecList(), - }) + var err error + // ref should be default branch or the first existing branch + searchRef := git.RefNameFromBranch(ctx.Repo.Repository.DefaultBranch) + searchResults, total, err = gitgrep.PerformSearch(ctx, page, ctx.Repo.Repository.ID, ctx.Repo.GitRepo, searchRef, prepareSearch.Keyword, prepareSearch.IsFuzzy) if err != nil { - // TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree. - ctx.ServerError("GrepSearch", err) + ctx.ServerError("gitgrep.PerformSearch", err) return } - commitID, err := ctx.Repo.GitRepo.GetRefCommitID(searchRefName.String()) - if err != nil { - ctx.ServerError("GetRefCommitID", err) - return - } - total = len(res) - pageStart := min((page-1)*setting.UI.RepoSearchPagingNum, len(res)) - pageEnd := min(page*setting.UI.RepoSearchPagingNum, len(res)) - res = res[pageStart:pageEnd] - for _, r := range res { - searchResults = append(searchResults, &code_indexer.Result{ - RepoID: ctx.Repo.Repository.ID, - Filename: r.Filename, - CommitID: commitID, - // UpdatedUnix: not supported yet - // Language: not supported yet - // Color: not supported yet - Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")), - }) - } } ctx.Data["Repo"] = ctx.Repo.Repository