1
1
mirror of https://github.com/go-gitea/gitea synced 2025-07-15 23:17:19 +00:00

Refactor Git Attribute & performance optimization (#34154)

This PR moved git attributes related code to `modules/git/attribute` sub
package and moved language stats related code to
`modules/git/languagestats` sub package to make it easier to maintain.

And it also introduced a performance improvement which use the `git
check-attr --source` which can be run in a bare git repository so that
we don't need to create a git index file. The new parameter need a git
version >= 2.40 . If git version less than 2.40, it will fall back to
previous implementation.

---------

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Co-authored-by: yp05327 <576951401@qq.com>
This commit is contained in:
Lunny Xiao
2025-04-11 06:41:29 -07:00
committed by GitHub
parent d725b78824
commit ae0af8ea5b
28 changed files with 875 additions and 587 deletions

View File

@@ -0,0 +1,114 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"strings"
"code.gitea.io/gitea/modules/optional"
)
type Attribute string
const (
LinguistVendored = "linguist-vendored"
LinguistGenerated = "linguist-generated"
LinguistDocumentation = "linguist-documentation"
LinguistDetectable = "linguist-detectable"
LinguistLanguage = "linguist-language"
GitlabLanguage = "gitlab-language"
Lockable = "lockable"
Filter = "filter"
)
var LinguistAttributes = []string{
LinguistVendored,
LinguistGenerated,
LinguistDocumentation,
LinguistDetectable,
LinguistLanguage,
GitlabLanguage,
}
func (a Attribute) IsUnspecified() bool {
return a == "" || a == "unspecified"
}
func (a Attribute) ToString() optional.Option[string] {
if !a.IsUnspecified() {
return optional.Some(string(a))
}
return optional.None[string]()
}
// ToBool converts the attribute value to optional boolean: true if "set"/"true", false if "unset"/"false", none otherwise
func (a Attribute) ToBool() optional.Option[bool] {
switch a {
case "set", "true":
return optional.Some(true)
case "unset", "false":
return optional.Some(false)
}
return optional.None[bool]()
}
type Attributes struct {
m map[string]Attribute
}
func NewAttributes() *Attributes {
return &Attributes{m: make(map[string]Attribute)}
}
func (attrs *Attributes) Get(name string) Attribute {
if value, has := attrs.m[name]; has {
return value
}
return ""
}
func (attrs *Attributes) GetVendored() optional.Option[bool] {
return attrs.Get(LinguistVendored).ToBool()
}
func (attrs *Attributes) GetGenerated() optional.Option[bool] {
return attrs.Get(LinguistGenerated).ToBool()
}
func (attrs *Attributes) GetDocumentation() optional.Option[bool] {
return attrs.Get(LinguistDocumentation).ToBool()
}
func (attrs *Attributes) GetDetectable() optional.Option[bool] {
return attrs.Get(LinguistDetectable).ToBool()
}
func (attrs *Attributes) GetLinguistLanguage() optional.Option[string] {
return attrs.Get(LinguistLanguage).ToString()
}
func (attrs *Attributes) GetGitlabLanguage() optional.Option[string] {
attrStr := attrs.Get(GitlabLanguage).ToString()
if attrStr.Has() {
raw := attrStr.Value()
// gitlab-language may have additional parameters after the language
// ignore them and just use the main language
// https://docs.gitlab.com/ee/user/project/highlighting.html#override-syntax-highlighting-for-a-file-type
if idx := strings.IndexByte(raw, '?'); idx >= 0 {
return optional.Some(raw[:idx])
}
}
return attrStr
}
func (attrs *Attributes) GetLanguage() optional.Option[string] {
// prefer linguist-language over gitlab-language
// if linguist-language is not set, use gitlab-language
// if both are not set, return none
language := attrs.GetLinguistLanguage()
if language.Value() == "" {
language = attrs.GetGitlabLanguage()
}
return language
}

View File

@@ -0,0 +1,37 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"testing"
"github.com/stretchr/testify/assert"
)
func Test_Attribute(t *testing.T) {
assert.Empty(t, Attribute("").ToString().Value())
assert.Empty(t, Attribute("unspecified").ToString().Value())
assert.Equal(t, "python", Attribute("python").ToString().Value())
assert.Equal(t, "Java", Attribute("Java").ToString().Value())
attributes := Attributes{
m: map[string]Attribute{
LinguistGenerated: "true",
LinguistDocumentation: "false",
LinguistDetectable: "set",
LinguistLanguage: "Python",
GitlabLanguage: "Java",
"filter": "unspecified",
"test": "",
},
}
assert.Empty(t, attributes.Get("test").ToString().Value())
assert.Empty(t, attributes.Get("filter").ToString().Value())
assert.Equal(t, "Python", attributes.Get(LinguistLanguage).ToString().Value())
assert.Equal(t, "Java", attributes.Get(GitlabLanguage).ToString().Value())
assert.True(t, attributes.Get(LinguistGenerated).ToBool().Value())
assert.False(t, attributes.Get(LinguistDocumentation).ToBool().Value())
assert.True(t, attributes.Get(LinguistDetectable).ToBool().Value())
}

View File

@@ -0,0 +1,216 @@
// Copyright 2019 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"bytes"
"context"
"fmt"
"os"
"path/filepath"
"time"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/log"
)
// BatchChecker provides a reader for check-attribute content that can be long running
type BatchChecker struct {
attributesNum int
repo *git.Repository
stdinWriter *os.File
stdOut *nulSeparatedAttributeWriter
ctx context.Context
cancel context.CancelFunc
cmd *git.Command
}
// NewBatchChecker creates a check attribute reader for the current repository and provided commit ID
// If treeish is empty, then it will use current working directory, otherwise it will use the provided treeish on the bare repo
func NewBatchChecker(repo *git.Repository, treeish string, attributes []string) (checker *BatchChecker, returnedErr error) {
ctx, cancel := context.WithCancel(repo.Ctx)
defer func() {
if returnedErr != nil {
cancel()
}
}()
cmd, envs, cleanup, err := checkAttrCommand(repo, treeish, nil, attributes)
if err != nil {
return nil, err
}
defer func() {
if returnedErr != nil {
cleanup()
}
}()
cmd.AddArguments("--stdin")
checker = &BatchChecker{
attributesNum: len(attributes),
repo: repo,
ctx: ctx,
cmd: cmd,
cancel: func() {
cancel()
cleanup()
},
}
stdinReader, stdinWriter, err := os.Pipe()
if err != nil {
return nil, err
}
checker.stdinWriter = stdinWriter
lw := new(nulSeparatedAttributeWriter)
lw.attributes = make(chan attributeTriple, len(attributes))
lw.closed = make(chan struct{})
checker.stdOut = lw
go func() {
defer func() {
_ = stdinReader.Close()
_ = lw.Close()
}()
stdErr := new(bytes.Buffer)
err := cmd.Run(ctx, &git.RunOpts{
Env: envs,
Dir: repo.Path,
Stdin: stdinReader,
Stdout: lw,
Stderr: stdErr,
})
if err != nil && !git.IsErrCanceledOrKilled(err) {
log.Error("Attribute checker for commit %s exits with error: %v", treeish, err)
}
checker.cancel()
}()
return checker, nil
}
// CheckPath check attr for given path
func (c *BatchChecker) CheckPath(path string) (rs *Attributes, err error) {
defer func() {
if err != nil && err != c.ctx.Err() {
log.Error("Unexpected error when checking path %s in %s, error: %v", path, filepath.Base(c.repo.Path), err)
}
}()
select {
case <-c.ctx.Done():
return nil, c.ctx.Err()
default:
}
if _, err = c.stdinWriter.Write([]byte(path + "\x00")); err != nil {
defer c.Close()
return nil, err
}
reportTimeout := func() error {
stdOutClosed := false
select {
case <-c.stdOut.closed:
stdOutClosed = true
default:
}
debugMsg := fmt.Sprintf("check path %q in repo %q", path, filepath.Base(c.repo.Path))
debugMsg += fmt.Sprintf(", stdOut: tmp=%q, pos=%d, closed=%v", string(c.stdOut.tmp), c.stdOut.pos, stdOutClosed)
if c.cmd != nil {
debugMsg += fmt.Sprintf(", process state: %q", c.cmd.ProcessState())
}
_ = c.Close()
return fmt.Errorf("CheckPath timeout: %s", debugMsg)
}
rs = NewAttributes()
for i := 0; i < c.attributesNum; i++ {
select {
case <-time.After(5 * time.Second):
// there is no "hang" problem now. This code is just used to catch other potential problems.
return nil, reportTimeout()
case attr, ok := <-c.stdOut.ReadAttribute():
if !ok {
return nil, c.ctx.Err()
}
rs.m[attr.Attribute] = Attribute(attr.Value)
case <-c.ctx.Done():
return nil, c.ctx.Err()
}
}
return rs, nil
}
func (c *BatchChecker) Close() error {
c.cancel()
err := c.stdinWriter.Close()
return err
}
type attributeTriple struct {
Filename string
Attribute string
Value string
}
type nulSeparatedAttributeWriter struct {
tmp []byte
attributes chan attributeTriple
closed chan struct{}
working attributeTriple
pos int
}
func (wr *nulSeparatedAttributeWriter) Write(p []byte) (n int, err error) {
l, read := len(p), 0
nulIdx := bytes.IndexByte(p, '\x00')
for nulIdx >= 0 {
wr.tmp = append(wr.tmp, p[:nulIdx]...)
switch wr.pos {
case 0:
wr.working = attributeTriple{
Filename: string(wr.tmp),
}
case 1:
wr.working.Attribute = string(wr.tmp)
case 2:
wr.working.Value = string(wr.tmp)
}
wr.tmp = wr.tmp[:0]
wr.pos++
if wr.pos > 2 {
wr.attributes <- wr.working
wr.pos = 0
}
read += nulIdx + 1
if l > read {
p = p[nulIdx+1:]
nulIdx = bytes.IndexByte(p, '\x00')
} else {
return l, nil
}
}
wr.tmp = append(wr.tmp, p...)
return l, nil
}
func (wr *nulSeparatedAttributeWriter) ReadAttribute() <-chan attributeTriple {
return wr.attributes
}
func (wr *nulSeparatedAttributeWriter) Close() error {
select {
case <-wr.closed:
return nil
default:
}
close(wr.attributes)
close(wr.closed)
return nil
}

View File

@@ -0,0 +1,172 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"path/filepath"
"testing"
"time"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func Test_nulSeparatedAttributeWriter_ReadAttribute(t *testing.T) {
wr := &nulSeparatedAttributeWriter{
attributes: make(chan attributeTriple, 5),
}
testStr := ".gitignore\"\n\x00linguist-vendored\x00unspecified\x00"
n, err := wr.Write([]byte(testStr))
assert.Len(t, testStr, n)
assert.NoError(t, err)
select {
case attr := <-wr.ReadAttribute():
assert.Equal(t, ".gitignore\"\n", attr.Filename)
assert.Equal(t, LinguistVendored, attr.Attribute)
assert.Equal(t, "unspecified", attr.Value)
case <-time.After(100 * time.Millisecond):
assert.FailNow(t, "took too long to read an attribute from the list")
}
// Write a second attribute again
n, err = wr.Write([]byte(testStr))
assert.Len(t, testStr, n)
assert.NoError(t, err)
select {
case attr := <-wr.ReadAttribute():
assert.Equal(t, ".gitignore\"\n", attr.Filename)
assert.Equal(t, LinguistVendored, attr.Attribute)
assert.Equal(t, "unspecified", attr.Value)
case <-time.After(100 * time.Millisecond):
assert.FailNow(t, "took too long to read an attribute from the list")
}
// Write a partial attribute
_, err = wr.Write([]byte("incomplete-file"))
assert.NoError(t, err)
_, err = wr.Write([]byte("name\x00"))
assert.NoError(t, err)
select {
case <-wr.ReadAttribute():
assert.FailNow(t, "There should not be an attribute ready to read")
case <-time.After(100 * time.Millisecond):
}
_, err = wr.Write([]byte("attribute\x00"))
assert.NoError(t, err)
select {
case <-wr.ReadAttribute():
assert.FailNow(t, "There should not be an attribute ready to read")
case <-time.After(100 * time.Millisecond):
}
_, err = wr.Write([]byte("value\x00"))
assert.NoError(t, err)
attr := <-wr.ReadAttribute()
assert.Equal(t, "incomplete-filename", attr.Filename)
assert.Equal(t, "attribute", attr.Attribute)
assert.Equal(t, "value", attr.Value)
_, err = wr.Write([]byte("shouldbe.vendor\x00linguist-vendored\x00set\x00shouldbe.vendor\x00linguist-generated\x00unspecified\x00shouldbe.vendor\x00linguist-language\x00unspecified\x00"))
assert.NoError(t, err)
attr = <-wr.ReadAttribute()
assert.NoError(t, err)
assert.Equal(t, attributeTriple{
Filename: "shouldbe.vendor",
Attribute: LinguistVendored,
Value: "set",
}, attr)
attr = <-wr.ReadAttribute()
assert.NoError(t, err)
assert.Equal(t, attributeTriple{
Filename: "shouldbe.vendor",
Attribute: LinguistGenerated,
Value: "unspecified",
}, attr)
attr = <-wr.ReadAttribute()
assert.NoError(t, err)
assert.Equal(t, attributeTriple{
Filename: "shouldbe.vendor",
Attribute: LinguistLanguage,
Value: "unspecified",
}, attr)
}
func expectedAttrs() *Attributes {
return &Attributes{
m: map[string]Attribute{
LinguistGenerated: "unspecified",
LinguistDetectable: "unspecified",
LinguistDocumentation: "unspecified",
LinguistVendored: "unspecified",
LinguistLanguage: "Python",
GitlabLanguage: "unspecified",
},
}
}
func Test_BatchChecker(t *testing.T) {
setting.AppDataPath = t.TempDir()
repoPath := "../tests/repos/language_stats_repo"
gitRepo, err := git.OpenRepository(t.Context(), repoPath)
require.NoError(t, err)
defer gitRepo.Close()
commitID := "8fee858da5796dfb37704761701bb8e800ad9ef3"
t.Run("Create index file to run git check-attr", func(t *testing.T) {
defer test.MockVariableValue(&git.DefaultFeatures().SupportCheckAttrOnBare, false)()
checker, err := NewBatchChecker(gitRepo, commitID, LinguistAttributes)
assert.NoError(t, err)
defer checker.Close()
attributes, err := checker.CheckPath("i-am-a-python.p")
assert.NoError(t, err)
assert.Equal(t, expectedAttrs(), attributes)
})
// run git check-attr on work tree
t.Run("Run git check-attr on git work tree", func(t *testing.T) {
dir := filepath.Join(t.TempDir(), "test-repo")
err := git.Clone(t.Context(), repoPath, dir, git.CloneRepoOptions{
Shared: true,
Branch: "master",
})
assert.NoError(t, err)
tempRepo, err := git.OpenRepository(t.Context(), dir)
assert.NoError(t, err)
defer tempRepo.Close()
checker, err := NewBatchChecker(tempRepo, "", LinguistAttributes)
assert.NoError(t, err)
defer checker.Close()
attributes, err := checker.CheckPath("i-am-a-python.p")
assert.NoError(t, err)
assert.Equal(t, expectedAttrs(), attributes)
})
if !git.DefaultFeatures().SupportCheckAttrOnBare {
t.Skip("git version 2.40 is required to support run check-attr on bare repo")
return
}
t.Run("Run git check-attr in bare repository", func(t *testing.T) {
checker, err := NewBatchChecker(gitRepo, commitID, LinguistAttributes)
assert.NoError(t, err)
defer checker.Close()
attributes, err := checker.CheckPath("i-am-a-python.p")
assert.NoError(t, err)
assert.Equal(t, expectedAttrs(), attributes)
})
}

View File

@@ -0,0 +1,96 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"bytes"
"context"
"errors"
"fmt"
"os"
"code.gitea.io/gitea/modules/git"
)
func checkAttrCommand(gitRepo *git.Repository, treeish string, filenames, attributes []string) (*git.Command, []string, func(), error) {
cancel := func() {}
envs := []string{"GIT_FLUSH=1"}
cmd := git.NewCommand("check-attr", "-z")
if len(attributes) == 0 {
cmd.AddArguments("--all")
}
// there is treeish, read from bare repo or temp index created by "read-tree"
if treeish != "" {
if git.DefaultFeatures().SupportCheckAttrOnBare {
cmd.AddArguments("--source")
cmd.AddDynamicArguments(treeish)
} else {
indexFilename, worktree, deleteTemporaryFile, err := gitRepo.ReadTreeToTemporaryIndex(treeish)
if err != nil {
return nil, nil, nil, err
}
cmd.AddArguments("--cached")
envs = append(envs,
"GIT_INDEX_FILE="+indexFilename,
"GIT_WORK_TREE="+worktree,
)
cancel = deleteTemporaryFile
}
} // else: no treeish, assume it is a not a bare repo, read from working directory
cmd.AddDynamicArguments(attributes...)
if len(filenames) > 0 {
cmd.AddDashesAndList(filenames...)
}
return cmd, envs, cancel, nil
}
type CheckAttributeOpts struct {
Filenames []string
Attributes []string
}
// CheckAttributes return the attributes of the given filenames and attributes in the given treeish.
// If treeish is empty, then it will use current working directory, otherwise it will use the provided treeish on the bare repo
func CheckAttributes(ctx context.Context, gitRepo *git.Repository, treeish string, opts CheckAttributeOpts) (map[string]*Attributes, error) {
cmd, envs, cancel, err := checkAttrCommand(gitRepo, treeish, opts.Filenames, opts.Attributes)
if err != nil {
return nil, err
}
defer cancel()
stdOut := new(bytes.Buffer)
stdErr := new(bytes.Buffer)
if err := cmd.Run(ctx, &git.RunOpts{
Env: append(os.Environ(), envs...),
Dir: gitRepo.Path,
Stdout: stdOut,
Stderr: stdErr,
}); err != nil {
return nil, fmt.Errorf("failed to run check-attr: %w\n%s\n%s", err, stdOut.String(), stdErr.String())
}
fields := bytes.Split(stdOut.Bytes(), []byte{'\000'})
if len(fields)%3 != 1 {
return nil, errors.New("wrong number of fields in return from check-attr")
}
attributesMap := make(map[string]*Attributes)
for i := 0; i < (len(fields) / 3); i++ {
filename := string(fields[3*i])
attribute := string(fields[3*i+1])
info := string(fields[3*i+2])
attribute2info, ok := attributesMap[filename]
if !ok {
attribute2info = NewAttributes()
attributesMap[filename] = attribute2info
}
attribute2info.m[attribute] = Attribute(info)
}
return attributesMap, nil
}

View File

@@ -0,0 +1,74 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"path/filepath"
"testing"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func Test_Checker(t *testing.T) {
setting.AppDataPath = t.TempDir()
repoPath := "../tests/repos/language_stats_repo"
gitRepo, err := git.OpenRepository(t.Context(), repoPath)
require.NoError(t, err)
defer gitRepo.Close()
commitID := "8fee858da5796dfb37704761701bb8e800ad9ef3"
t.Run("Create index file to run git check-attr", func(t *testing.T) {
defer test.MockVariableValue(&git.DefaultFeatures().SupportCheckAttrOnBare, false)()
attrs, err := CheckAttributes(t.Context(), gitRepo, commitID, CheckAttributeOpts{
Filenames: []string{"i-am-a-python.p"},
Attributes: LinguistAttributes,
})
assert.NoError(t, err)
assert.Len(t, attrs, 1)
assert.Equal(t, expectedAttrs(), attrs["i-am-a-python.p"])
})
// run git check-attr on work tree
t.Run("Run git check-attr on git work tree", func(t *testing.T) {
dir := filepath.Join(t.TempDir(), "test-repo")
err := git.Clone(t.Context(), repoPath, dir, git.CloneRepoOptions{
Shared: true,
Branch: "master",
})
assert.NoError(t, err)
tempRepo, err := git.OpenRepository(t.Context(), dir)
assert.NoError(t, err)
defer tempRepo.Close()
attrs, err := CheckAttributes(t.Context(), tempRepo, "", CheckAttributeOpts{
Filenames: []string{"i-am-a-python.p"},
Attributes: LinguistAttributes,
})
assert.NoError(t, err)
assert.Len(t, attrs, 1)
assert.Equal(t, expectedAttrs(), attrs["i-am-a-python.p"])
})
if !git.DefaultFeatures().SupportCheckAttrOnBare {
t.Skip("git version 2.40 is required to support run check-attr on bare repo")
return
}
t.Run("Run git check-attr in bare repository", func(t *testing.T) {
attrs, err := CheckAttributes(t.Context(), gitRepo, commitID, CheckAttributeOpts{
Filenames: []string{"i-am-a-python.p"},
Attributes: LinguistAttributes,
})
assert.NoError(t, err)
assert.Len(t, attrs, 1)
assert.Equal(t, expectedAttrs(), attrs["i-am-a-python.p"])
})
}

View File

@@ -0,0 +1,41 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package attribute
import (
"context"
"fmt"
"os"
"testing"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)
func testRun(m *testing.M) error {
gitHomePath, err := os.MkdirTemp(os.TempDir(), "git-home")
if err != nil {
return fmt.Errorf("unable to create temp dir: %w", err)
}
defer util.RemoveAll(gitHomePath)
setting.Git.HomePath = gitHomePath
if err = git.InitFull(context.Background()); err != nil {
return fmt.Errorf("failed to call Init: %w", err)
}
exitCode := m.Run()
if exitCode != 0 {
return fmt.Errorf("run test failed, ExitCode=%d", exitCode)
}
return nil
}
func TestMain(m *testing.M) {
if err := testRun(m); err != nil {
_, _ = fmt.Fprintf(os.Stderr, "Test failed: %v", err)
os.Exit(1)
}
}