mirror of
https://github.com/go-gitea/gitea
synced 2025-01-23 08:04:30 +00:00
f64fbd9b74
This PR improves the accuracy of Gitea's code search. Currently, Gitea does not consider statements such as `console.log("hello")` as hits when the user searches for `log`. The culprit is how both ES and Bleve are tokenizing the file contents (in both cases, `console.log` is a whole token). In ES' case, we changed the tokenizer to [simple_pattern_split](https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-simplepatternsplit-tokenizer.html#:~:text=The%20simple_pattern_split%20tokenizer%20uses%20a,the%20tokenization%20is%20generally%20faster.). In such a case, tokens are words formed by digits and letters. In Bleve's case, it employs a [letter](https://blevesearch.com/docs/Tokenizers/) tokenizer. Resolves #32220 --------- Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
54 lines
976 B
Go
54 lines
976 B
Go
// Copyright 2024 The Gitea Authors. All rights reserved.
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
package bleve
|
|
|
|
import (
|
|
"fmt"
|
|
"testing"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
)
|
|
|
|
func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
|
|
scenarios := []struct {
|
|
Input string
|
|
Fuzziness int // See util.go for the definition of fuzziness in this particular context
|
|
}{
|
|
{
|
|
Input: "",
|
|
Fuzziness: 0,
|
|
},
|
|
{
|
|
Input: "Avocado",
|
|
Fuzziness: 1,
|
|
},
|
|
{
|
|
Input: "Geschwindigkeit",
|
|
Fuzziness: 2,
|
|
},
|
|
{
|
|
Input: "non-exist",
|
|
Fuzziness: 0,
|
|
},
|
|
{
|
|
Input: "갃갃갃",
|
|
Fuzziness: 0,
|
|
},
|
|
{
|
|
Input: "repo1",
|
|
Fuzziness: 0,
|
|
},
|
|
{
|
|
Input: "avocado.md",
|
|
Fuzziness: 0,
|
|
},
|
|
}
|
|
|
|
for _, scenario := range scenarios {
|
|
t.Run(fmt.Sprintf("ensure fuzziness of '%s' is '%d'", scenario.Input, scenario.Fuzziness), func(t *testing.T) {
|
|
assert.Equal(t, scenario.Fuzziness, GuessFuzzinessByKeyword(scenario.Input))
|
|
})
|
|
}
|
|
}
|