mirror of
https://github.com/go-gitea/gitea
synced 2025-01-07 08:24:28 +00:00
900ac62251
This is a large and complex PR, so let me explain its changes in detail. First, I had to create new index mappings for Bleve and Elasticsearch, as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out of the box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also overhauled the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. In matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves #32096 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
293 lines
7.2 KiB
Go
293 lines
7.2 KiB
Go
// Copyright 2020 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package code
|
|
|
|
import (
|
|
"context"
|
|
"os"
|
|
"slices"
|
|
"testing"
|
|
|
|
"code.gitea.io/gitea/models/db"
|
|
"code.gitea.io/gitea/models/unittest"
|
|
"code.gitea.io/gitea/modules/git"
|
|
"code.gitea.io/gitea/modules/indexer/code/bleve"
|
|
"code.gitea.io/gitea/modules/indexer/code/elasticsearch"
|
|
"code.gitea.io/gitea/modules/indexer/code/internal"
|
|
|
|
_ "code.gitea.io/gitea/models"
|
|
_ "code.gitea.io/gitea/models/actions"
|
|
_ "code.gitea.io/gitea/models/activities"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
// codeSearchResult is the comparable projection of a search hit that the
// tests assert on. It holds only plain strings so values can be located with
// slices.Index.
type codeSearchResult struct {
	// Filename is copied from the hit's Filename field.
	Filename string
	// Content is copied from the hit's Content field.
	Content string
}
|
|
|
|
func TestMain(m *testing.M) {
|
|
unittest.MainTest(m)
|
|
}
|
|
|
|
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
|
|
t.Run(name, func(t *testing.T) {
|
|
assert.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
|
|
|
|
keywords := []struct {
|
|
RepoIDs []int64
|
|
Keyword string
|
|
Langs int
|
|
Results []codeSearchResult
|
|
}{
|
|
// Search for an exact match on the contents of a file
|
|
// This scenario yields a single result (the file README.md on the repo '1')
|
|
{
|
|
RepoIDs: nil,
|
|
Keyword: "Description",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "README.md",
|
|
Content: "# repo1\n\nDescription for repo1",
|
|
},
|
|
},
|
|
},
|
|
// Search for an exact match on the contents of a file within the repo '2'.
|
|
// This scenario yields no results
|
|
{
|
|
RepoIDs: []int64{2},
|
|
Keyword: "Description",
|
|
Langs: 0,
|
|
},
|
|
// Search for an exact match on the contents of a file
|
|
// This scenario yields a single result (the file README.md on the repo '1')
|
|
{
|
|
RepoIDs: nil,
|
|
Keyword: "repo1",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "README.md",
|
|
Content: "# repo1\n\nDescription for repo1",
|
|
},
|
|
},
|
|
},
|
|
// Search for an exact match on the contents of a file within the repo '2'.
|
|
// This scenario yields no results
|
|
{
|
|
RepoIDs: []int64{2},
|
|
Keyword: "repo1",
|
|
Langs: 0,
|
|
},
|
|
// Search for a non-existing term.
|
|
// This scenario yields no results
|
|
{
|
|
RepoIDs: nil,
|
|
Keyword: "non-exist",
|
|
Langs: 0,
|
|
},
|
|
// Search for an exact match on the contents of a file within the repo '62'.
|
|
// This scenario yields a single result (the file avocado.md on the repo '62')
|
|
{
|
|
RepoIDs: []int64{62},
|
|
Keyword: "pineaple",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "avocado.md",
|
|
Content: "# repo1\n\npineaple pie of cucumber juice",
|
|
},
|
|
},
|
|
},
|
|
// Search for an exact match on the filename within the repo '62'.
|
|
// This scenario yields a single result (the file avocado.md on the repo '62')
|
|
{
|
|
RepoIDs: []int64{62},
|
|
Keyword: "avocado.md",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "avocado.md",
|
|
Content: "# repo1\n\npineaple pie of cucumber juice",
|
|
},
|
|
},
|
|
},
|
|
// Search for an partial match on the filename within the repo '62'.
|
|
// This scenario yields a single result (the file avocado.md on the repo '62')
|
|
{
|
|
RepoIDs: []int64{62},
|
|
Keyword: "avo",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "avocado.md",
|
|
Content: "# repo1\n\npineaple pie of cucumber juice",
|
|
},
|
|
},
|
|
},
|
|
// Search for matches on both the contents and the filenames within the repo '62'.
|
|
// This scenario yields two results: the first result is baed on the file (cucumber.md) while the second is based on the contents
|
|
{
|
|
RepoIDs: []int64{62},
|
|
Keyword: "cucumber",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "cucumber.md",
|
|
Content: "Salad is good for your health",
|
|
},
|
|
{
|
|
Filename: "avocado.md",
|
|
Content: "# repo1\n\npineaple pie of cucumber juice",
|
|
},
|
|
},
|
|
},
|
|
// Search for matches on the filenames within the repo '62'.
|
|
// This scenario yields two results (both are based on filename, the first one is an exact match)
|
|
{
|
|
RepoIDs: []int64{62},
|
|
Keyword: "ham",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "ham.md",
|
|
Content: "This is also not cheese",
|
|
},
|
|
{
|
|
Filename: "potato/ham.md",
|
|
Content: "This is not cheese",
|
|
},
|
|
},
|
|
},
|
|
// Search for matches on the contents of files within the repo '62'.
|
|
// This scenario yields two results (both are based on contents, the first one is an exact match where as the second is a 'fuzzy' one)
|
|
{
|
|
RepoIDs: []int64{62},
|
|
Keyword: "This is not cheese",
|
|
Langs: 1,
|
|
Results: []codeSearchResult{
|
|
{
|
|
Filename: "potato/ham.md",
|
|
Content: "This is not cheese",
|
|
},
|
|
{
|
|
Filename: "ham.md",
|
|
Content: "This is also not cheese",
|
|
},
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, kw := range keywords {
|
|
t.Run(kw.Keyword, func(t *testing.T) {
|
|
total, res, langs, err := indexer.Search(context.TODO(), &internal.SearchOptions{
|
|
RepoIDs: kw.RepoIDs,
|
|
Keyword: kw.Keyword,
|
|
Paginator: &db.ListOptions{
|
|
Page: 1,
|
|
PageSize: 10,
|
|
},
|
|
IsKeywordFuzzy: true,
|
|
})
|
|
assert.NoError(t, err)
|
|
assert.Len(t, langs, kw.Langs)
|
|
|
|
hits := make([]codeSearchResult, 0, len(res))
|
|
|
|
if total > 0 {
|
|
assert.NotEmpty(t, kw.Results, "The given scenario does not provide any expected results")
|
|
}
|
|
|
|
for _, hit := range res {
|
|
hits = append(hits, codeSearchResult{
|
|
Filename: hit.Filename,
|
|
Content: hit.Content,
|
|
})
|
|
}
|
|
|
|
lastIndex := -1
|
|
|
|
for _, expected := range kw.Results {
|
|
index := slices.Index(hits, expected)
|
|
if index == -1 {
|
|
assert.Failf(t, "Result not found", "Expected %v in %v", expected, hits)
|
|
} else if lastIndex > index {
|
|
assert.Failf(t, "Result is out of order", "The order of %v within %v is wrong", expected, hits)
|
|
} else {
|
|
lastIndex = index
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
assert.NoError(t, tearDownRepositoryIndexes(indexer))
|
|
})
|
|
}
|
|
|
|
func TestBleveIndexAndSearch(t *testing.T) {
|
|
unittest.PrepareTestEnv(t)
|
|
|
|
dir := t.TempDir()
|
|
|
|
idx := bleve.NewIndexer(dir)
|
|
_, err := idx.Init(context.Background())
|
|
if err != nil {
|
|
if idx != nil {
|
|
idx.Close()
|
|
}
|
|
assert.FailNow(t, "Unable to create bleve indexer Error: %v", err)
|
|
}
|
|
defer idx.Close()
|
|
|
|
testIndexer("beleve", t, idx)
|
|
}
|
|
|
|
func TestESIndexAndSearch(t *testing.T) {
|
|
unittest.PrepareTestEnv(t)
|
|
|
|
u := os.Getenv("TEST_INDEXER_CODE_ES_URL")
|
|
if u == "" {
|
|
t.SkipNow()
|
|
return
|
|
}
|
|
|
|
indexer := elasticsearch.NewIndexer(u, "gitea_codes")
|
|
if _, err := indexer.Init(context.Background()); err != nil {
|
|
if indexer != nil {
|
|
indexer.Close()
|
|
}
|
|
assert.FailNow(t, "Unable to init ES indexer Error: %v", err)
|
|
}
|
|
|
|
defer indexer.Close()
|
|
|
|
testIndexer("elastic_search", t, indexer)
|
|
}
|
|
|
|
func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
|
|
for _, repoID := range repositoriesToSearch() {
|
|
if err := index(ctx, indexer, repoID); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func tearDownRepositoryIndexes(indexer internal.Indexer) error {
|
|
for _, repoID := range repositoriesToSearch() {
|
|
if err := indexer.Delete(context.Background(), repoID); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// repositoriesToSearch lists the IDs of the repositories that the tests index
// before searching and remove from the index afterwards.
func repositoriesToSearch() []int64 {
	repoIDs := []int64{1, 62}
	return repoIDs
}
|