1
1
mirror of https://github.com/go-gitea/gitea synced 2025-07-22 18:28:37 +00:00

Refactor Git Attribute & performance optimization (#34154)

This PR moved git attributes related code to `modules/git/attribute` sub
package and moved language stats related code to
`modules/git/languagestats` sub package to make it easier to maintain.

And it also introduced a performance improvement which use the `git
check-attr --source` which can be run in a bare git repository so that
we don't need to create a git index file. The new parameter need a git
version >= 2.40 . If git version less than 2.40, it will fall back to
previous implementation.

---------

Co-authored-by: wxiaoguang <wxiaoguang@gmail.com>
Co-authored-by: yp05327 <576951401@qq.com>
This commit is contained in:
Lunny Xiao
2025-04-11 06:41:29 -07:00
committed by GitHub
parent d725b78824
commit ae0af8ea5b
28 changed files with 875 additions and 587 deletions

View File

@@ -0,0 +1,65 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package languagestats
import (
"context"
"strings"
"unicode"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/git/attribute"
)
const (
fileSizeLimit int64 = 16 * 1024 // 16 KiB
bigFileSize int64 = 1024 * 1024 // 1 MiB
)
// mergeLanguageStats mergers language names with different cases. The name with most upper case letters is used.
func mergeLanguageStats(stats map[string]int64) map[string]int64 {
names := map[string]struct {
uniqueName string
upperCount int
}{}
countUpper := func(s string) (count int) {
for _, r := range s {
if unicode.IsUpper(r) {
count++
}
}
return count
}
for name := range stats {
cnt := countUpper(name)
lower := strings.ToLower(name)
if cnt >= names[lower].upperCount {
names[lower] = struct {
uniqueName string
upperCount int
}{uniqueName: name, upperCount: cnt}
}
}
res := make(map[string]int64, len(names))
for name, num := range stats {
res[names[strings.ToLower(name)].uniqueName] += num
}
return res
}
// GetFileLanguage tries to get the (linguist) language of the file content
func GetFileLanguage(ctx context.Context, gitRepo *git.Repository, treeish, treePath string) (string, error) {
attributesMap, err := attribute.CheckAttributes(ctx, gitRepo, treeish, attribute.CheckAttributeOpts{
Attributes: []string{attribute.LinguistLanguage, attribute.GitlabLanguage},
Filenames: []string{treePath},
})
if err != nil {
return "", err
}
return attributesMap[treePath].GetLanguage().Value(), nil
}

View File

@@ -0,0 +1,181 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build gogit
package languagestats
import (
"bytes"
"io"
"code.gitea.io/gitea/modules/analyze"
git_module "code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/git/attribute"
"code.gitea.io/gitea/modules/optional"
"github.com/go-enry/go-enry/v2"
"github.com/go-git/go-git/v5"
"github.com/go-git/go-git/v5/plumbing"
"github.com/go-git/go-git/v5/plumbing/object"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func GetLanguageStats(repo *git_module.Repository, commitID string) (map[string]int64, error) {
r, err := git.PlainOpen(repo.Path)
if err != nil {
return nil, err
}
rev, err := r.ResolveRevision(plumbing.Revision(commitID))
if err != nil {
return nil, err
}
commit, err := r.CommitObject(*rev)
if err != nil {
return nil, err
}
tree, err := commit.Tree()
if err != nil {
return nil, err
}
checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes)
if err != nil {
return nil, err
}
defer checker.Close()
// sizes contains the current calculated size of all files by language
sizes := make(map[string]int64)
// by default we will only count the sizes of programming languages or markup languages
// unless they are explicitly set using linguist-language
includedLanguage := map[string]bool{}
// or if there's only one language in the repository
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64(0)
err = tree.Files().ForEach(func(f *object.File) error {
if f.Size == 0 {
return nil
}
isVendored := optional.None[bool]()
isGenerated := optional.None[bool]()
isDocumentation := optional.None[bool]()
isDetectable := optional.None[bool]()
attrs, err := checker.CheckPath(f.Name)
if err == nil {
isVendored = attrs.GetVendored()
if isVendored.ValueOrDefault(false) {
return nil
}
isGenerated = attrs.GetGenerated()
if isGenerated.ValueOrDefault(false) {
return nil
}
isDocumentation = attrs.GetDocumentation()
if isDocumentation.ValueOrDefault(false) {
return nil
}
isDetectable = attrs.GetDetectable()
if !isDetectable.ValueOrDefault(true) {
return nil
}
hasLanguage := attrs.GetLanguage()
if hasLanguage.Value() != "" {
language := hasLanguage.Value()
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry.GetLanguageGroup(language)
if len(group) != 0 {
language = group
}
// this language will always be added to the size
sizes[language] += f.Size
return nil
}
}
if (!isVendored.Has() && analyze.IsVendor(f.Name)) ||
enry.IsDotFile(f.Name) ||
(!isDocumentation.Has() && enry.IsDocumentation(f.Name)) ||
enry.IsConfiguration(f.Name) {
return nil
}
// If content can not be read or file is too big just do detection by filename
var content []byte
if f.Size <= bigFileSize {
content, _ = readFile(f, fileSizeLimit)
}
if !isGenerated.Has() && enry.IsGenerated(f.Name, content) {
return nil
}
language := analyze.GetCodeLanguage(f.Name, content)
if language == enry.OtherLanguage || language == "" {
return nil
}
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry.GetLanguageGroup(language)
if group != "" {
language = group
}
included, checked := includedLanguage[language]
if !checked {
langtype := enry.GetLanguageType(language)
included = langtype == enry.Programming || langtype == enry.Markup
includedLanguage[language] = included
}
if included || isDetectable.ValueOrDefault(false) {
sizes[language] += f.Size
} else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) {
firstExcludedLanguage = language
firstExcludedLanguageSize += f.Size
}
return nil
})
if err != nil {
return nil, err
}
// If there are no included languages add the first excluded language
if len(sizes) == 0 && firstExcludedLanguage != "" {
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
}
return mergeLanguageStats(sizes), nil
}
func readFile(f *object.File, limit int64) ([]byte, error) {
r, err := f.Reader()
if err != nil {
return nil, err
}
defer r.Close()
if limit <= 0 {
return io.ReadAll(r)
}
size := f.Size
if limit > 0 && size > limit {
size = limit
}
buf := bytes.NewBuffer(nil)
buf.Grow(int(size))
_, err = io.Copy(buf, io.LimitReader(r, limit))
return buf.Bytes(), err
}

View File

@@ -0,0 +1,209 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !gogit
package languagestats
import (
"bytes"
"io"
"code.gitea.io/gitea/modules/analyze"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/git/attribute"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/optional"
"github.com/go-enry/go-enry/v2"
)
// GetLanguageStats calculates language stats for git repository at specified commit
func GetLanguageStats(repo *git.Repository, commitID string) (map[string]int64, error) {
// We will feed the commit IDs in order into cat-file --batch, followed by blobs as necessary.
// so let's create a batch stdin and stdout
batchStdinWriter, batchReader, cancel, err := repo.CatFileBatch(repo.Ctx)
if err != nil {
return nil, err
}
defer cancel()
writeID := func(id string) error {
_, err := batchStdinWriter.Write([]byte(id + "\n"))
return err
}
if err := writeID(commitID); err != nil {
return nil, err
}
shaBytes, typ, size, err := git.ReadBatchLine(batchReader)
if typ != "commit" {
log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, git.ErrNotExist{ID: commitID}
}
sha, err := git.NewIDFromString(string(shaBytes))
if err != nil {
log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, git.ErrNotExist{ID: commitID}
}
commit, err := git.CommitFromReader(repo, sha, io.LimitReader(batchReader, size))
if err != nil {
log.Debug("Unable to get commit for: %s. Err: %v", commitID, err)
return nil, err
}
if _, err = batchReader.Discard(1); err != nil {
return nil, err
}
tree := commit.Tree
entries, err := tree.ListEntriesRecursiveWithSize()
if err != nil {
return nil, err
}
checker, err := attribute.NewBatchChecker(repo, commitID, attribute.LinguistAttributes)
if err != nil {
return nil, err
}
defer checker.Close()
contentBuf := bytes.Buffer{}
var content []byte
// sizes contains the current calculated size of all files by language
sizes := make(map[string]int64)
// by default we will only count the sizes of programming languages or markup languages
// unless they are explicitly set using linguist-language
includedLanguage := map[string]bool{}
// or if there's only one language in the repository
firstExcludedLanguage := ""
firstExcludedLanguageSize := int64(0)
for _, f := range entries {
select {
case <-repo.Ctx.Done():
return sizes, repo.Ctx.Err()
default:
}
contentBuf.Reset()
content = contentBuf.Bytes()
if f.Size() == 0 {
continue
}
isVendored := optional.None[bool]()
isGenerated := optional.None[bool]()
isDocumentation := optional.None[bool]()
isDetectable := optional.None[bool]()
attrs, err := checker.CheckPath(f.Name())
if err == nil {
if isVendored = attrs.GetVendored(); isVendored.ValueOrDefault(false) {
continue
}
if isGenerated = attrs.GetGenerated(); isGenerated.ValueOrDefault(false) {
continue
}
if isDocumentation = attrs.GetDocumentation(); isDocumentation.ValueOrDefault(false) {
continue
}
if isDetectable = attrs.GetDetectable(); !isDetectable.ValueOrDefault(true) {
continue
}
if hasLanguage := attrs.GetLanguage(); hasLanguage.Value() != "" {
language := hasLanguage.Value()
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry.GetLanguageGroup(language)
if len(group) != 0 {
language = group
}
// this language will always be added to the size
sizes[language] += f.Size()
continue
}
}
if (!isVendored.Has() && analyze.IsVendor(f.Name())) ||
enry.IsDotFile(f.Name()) ||
(!isDocumentation.Has() && enry.IsDocumentation(f.Name())) ||
enry.IsConfiguration(f.Name()) {
continue
}
// If content can not be read or file is too big just do detection by filename
if f.Size() <= bigFileSize {
if err := writeID(f.ID.String()); err != nil {
return nil, err
}
_, _, size, err := git.ReadBatchLine(batchReader)
if err != nil {
log.Debug("Error reading blob: %s Err: %v", f.ID.String(), err)
return nil, err
}
sizeToRead := size
discard := int64(1)
if size > fileSizeLimit {
sizeToRead = fileSizeLimit
discard = size - fileSizeLimit + 1
}
_, err = contentBuf.ReadFrom(io.LimitReader(batchReader, sizeToRead))
if err != nil {
return nil, err
}
content = contentBuf.Bytes()
if err := git.DiscardFull(batchReader, discard); err != nil {
return nil, err
}
}
if !isGenerated.Has() && enry.IsGenerated(f.Name(), content) {
continue
}
// FIXME: Why can't we split this and the IsGenerated tests to avoid reading the blob unless absolutely necessary?
// - eg. do the all the detection tests using filename first before reading content.
language := analyze.GetCodeLanguage(f.Name(), content)
if language == "" {
continue
}
// group languages, such as Pug -> HTML; SCSS -> CSS
group := enry.GetLanguageGroup(language)
if group != "" {
language = group
}
included, checked := includedLanguage[language]
if !checked {
langType := enry.GetLanguageType(language)
included = langType == enry.Programming || langType == enry.Markup
includedLanguage[language] = included
}
if included || isDetectable.ValueOrDefault(false) {
sizes[language] += f.Size()
} else if len(sizes) == 0 && (firstExcludedLanguage == "" || firstExcludedLanguage == language) {
firstExcludedLanguage = language
firstExcludedLanguageSize += f.Size()
}
}
// If there are no included languages add the first excluded language
if len(sizes) == 0 && firstExcludedLanguage != "" {
sizes[firstExcludedLanguage] = firstExcludedLanguageSize
}
return mergeLanguageStats(sizes), nil
}

View File

@@ -0,0 +1,46 @@
// Copyright 2020 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
//go:build !gogit
package languagestats
import (
"testing"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
func TestRepository_GetLanguageStats(t *testing.T) {
setting.AppDataPath = t.TempDir()
repoPath := "../tests/repos/language_stats_repo"
gitRepo, err := git.OpenRepository(t.Context(), repoPath)
require.NoError(t, err)
defer gitRepo.Close()
stats, err := GetLanguageStats(gitRepo, "8fee858da5796dfb37704761701bb8e800ad9ef3")
require.NoError(t, err)
assert.Equal(t, map[string]int64{
"Python": 134,
"Java": 112,
}, stats)
}
func TestMergeLanguageStats(t *testing.T) {
assert.Equal(t, map[string]int64{
"PHP": 1,
"python": 10,
"JAVA": 700,
}, mergeLanguageStats(map[string]int64{
"PHP": 1,
"python": 10,
"Java": 100,
"java": 200,
"JAVA": 400,
}))
}

View File

@@ -0,0 +1,41 @@
// Copyright 2025 The Gitea Authors. All rights reserved.
// SPDX-License-Identifier: MIT
package languagestats
import (
"context"
"fmt"
"os"
"testing"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)
func testRun(m *testing.M) error {
gitHomePath, err := os.MkdirTemp(os.TempDir(), "git-home")
if err != nil {
return fmt.Errorf("unable to create temp dir: %w", err)
}
defer util.RemoveAll(gitHomePath)
setting.Git.HomePath = gitHomePath
if err = git.InitFull(context.Background()); err != nil {
return fmt.Errorf("failed to call Init: %w", err)
}
exitCode := m.Run()
if exitCode != 0 {
return fmt.Errorf("run test failed, ExitCode=%d", exitCode)
}
return nil
}
func TestMain(m *testing.M) {
if err := testRun(m); err != nil {
_, _ = fmt.Fprintf(os.Stderr, "Test failed: %v", err)
os.Exit(1)
}
}