From a34826b19f0f091f88b41a40ec196d64044f6554 Mon Sep 17 00:00:00 2001 From: Lauris BH Date: Tue, 2 Jun 2020 20:55:21 +0300 Subject: [PATCH] Change language statistics to save size instead of percentage (#11681) (#11690) * Change language statistics to save size instead of percentage (#11681) * Change language statistics to save size instead of percentage in database Co-Authored-By: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Do not exclude if only language * Fix edge cases with special languages Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> * Fix language stat calculation (#11692) * Fix language stat calculation * Group languages and ignore 0 size files * remove unneeded code Co-authored-by: Cirno the Strongest <1447794+CirnoT@users.noreply.github.com> --- models/migrations/migrations.go | 2 + models/migrations/v140.go | 56 ++++++++++++++++++++++ models/repo_language_stats.go | 69 +++++++++++++++++++-------- modules/git/repo_language_stats.go | 46 +++++++++++------- modules/indexer/stats/indexer_test.go | 7 +-- 5 files changed, 140 insertions(+), 40 deletions(-) create mode 100644 models/migrations/v140.go diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go index 00d84da2e8..869661aee4 100644 --- a/models/migrations/migrations.go +++ b/models/migrations/migrations.go @@ -212,6 +212,8 @@ var migrations = []Migration{ NewMigration("Add ResolveDoerID to Comment table", addResolveDoerIDCommentColumn), // v139 -> v140 NewMigration("prepend refs/heads/ to issue refs", prependRefsHeadsToIssueRefs), + // v140 -> v141 + NewMigration("Save detected language file size to database instead of percent", fixLanguageStatsToSaveSize), } // GetCurrentDBVersion returns the current db version diff --git a/models/migrations/v140.go b/models/migrations/v140.go new file mode 100644 index 0000000000..871d14b84e --- /dev/null +++ b/models/migrations/v140.go @@ -0,0 +1,56 @@ +// Copyright 2020 The Gitea Authors. 
All rights reserved. +// Use of this source code is governed by a MIT-style +// license that can be found in the LICENSE file. + +package migrations + +import ( + "fmt" + + "code.gitea.io/gitea/modules/setting" + + "xorm.io/xorm" +) + +func fixLanguageStatsToSaveSize(x *xorm.Engine) error { + // LanguageStat see models/repo_language_stats.go + type LanguageStat struct { + Size int64 `xorm:"NOT NULL DEFAULT 0"` + } + + // RepoIndexerType specifies the repository indexer type + type RepoIndexerType int + + const ( + // RepoIndexerTypeCode code indexer + RepoIndexerTypeCode RepoIndexerType = iota // 0 + // RepoIndexerTypeStats repository stats indexer + RepoIndexerTypeStats // 1 + ) + + // RepoIndexerStatus see models/repo_indexer.go + type RepoIndexerStatus struct { + IndexerType RepoIndexerType `xorm:"INDEX(s) NOT NULL DEFAULT 0"` + } + + if err := x.Sync2(new(LanguageStat)); err != nil { + return fmt.Errorf("Sync2: %v", err) + } + + x.Delete(&RepoIndexerStatus{IndexerType: RepoIndexerTypeStats}) + + // Delete language stat statuses + truncExpr := "TRUNCATE TABLE" + if setting.Database.UseSQLite3 { + truncExpr = "DELETE FROM" + } + + // Delete language stats + if _, err := x.Exec(fmt.Sprintf("%s language_stat", truncExpr)); err != nil { + return err + } + + sess := x.NewSession() + defer sess.Close() + return dropTableColumns(sess, "language_stat", "percentage") +} diff --git a/models/repo_language_stats.go b/models/repo_language_stats.go index 5f1aed1f30..a15063e25a 100644 --- a/models/repo_language_stats.go +++ b/models/repo_language_stats.go @@ -20,7 +20,8 @@ type LanguageStat struct { CommitID string IsPrimary bool Language string `xorm:"VARCHAR(30) UNIQUE(s) INDEX NOT NULL"` - Percentage float32 `xorm:"NUMERIC(5,2) NOT NULL DEFAULT 0"` + Percentage float32 `xorm:"-"` + Size int64 `xorm:"NOT NULL DEFAULT 0"` Color string `xorm:"-"` CreatedUnix timeutil.TimeStamp `xorm:"INDEX CREATED"` } @@ -34,12 +35,36 @@ func (stats LanguageStatList) loadAttributes() { } } 
+func (stats LanguageStatList) getLanguagePercentages() map[string]float32 { + langPerc := make(map[string]float32) + var otherPerc float32 = 100 + var total int64 + + for _, stat := range stats { + total += stat.Size + } + if total > 0 { + for _, stat := range stats { + perc := float32(math.Round(float64(stat.Size)/float64(total)*1000) / 10) + if perc <= 0.1 { + continue + } + otherPerc -= perc + langPerc[stat.Language] = perc + } + otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) + } + if otherPerc > 0 { + langPerc["other"] = otherPerc + } + return langPerc +} + func (repo *Repository) getLanguageStats(e Engine) (LanguageStatList, error) { stats := make(LanguageStatList, 0, 6) - if err := e.Where("`repo_id` = ?", repo.ID).Desc("`percentage`").Find(&stats); err != nil { + if err := e.Where("`repo_id` = ?", repo.ID).Desc("`size`").Find(&stats); err != nil { return nil, err } - stats.loadAttributes() return stats, nil } @@ -54,13 +79,18 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) if err != nil { return nil, err } + perc := stats.getLanguagePercentages() topstats := make(LanguageStatList, 0, limit) var other float32 for i := range stats { - if stats[i].Language == "other" || len(topstats) >= limit { - other += stats[i].Percentage + if _, ok := perc[stats[i].Language]; !ok { continue } + if stats[i].Language == "other" || len(topstats) >= limit { + other += perc[stats[i].Language] + continue + } + stats[i].Percentage = perc[stats[i].Language] topstats = append(topstats, stats[i]) } if other > 0 { @@ -71,11 +101,12 @@ func (repo *Repository) GetTopLanguageStats(limit int) (LanguageStatList, error) Percentage: float32(math.Round(float64(other)*10) / 10), }) } + topstats.loadAttributes() return topstats, nil } // UpdateLanguageStats updates the language statistics for repository -func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]float32) error { +func (repo *Repository) 
UpdateLanguageStats(commitID string, stats map[string]int64) error { sess := x.NewSession() if err := sess.Begin(); err != nil { return err @@ -87,15 +118,15 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl return err } var topLang string - var p float32 - for lang, perc := range stats { - if perc > p { - p = perc + var s int64 + for lang, size := range stats { + if size > s { + s = size topLang = strings.ToLower(lang) } } - for lang, perc := range stats { + for lang, size := range stats { upd := false llang := strings.ToLower(lang) for _, s := range oldstats { @@ -103,8 +134,8 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl if strings.ToLower(s.Language) == llang { s.CommitID = commitID s.IsPrimary = llang == topLang - s.Percentage = perc - if _, err := sess.ID(s.ID).Cols("`commit_id`", "`percentage`", "`is_primary`").Update(s); err != nil { + s.Size = size + if _, err := sess.ID(s.ID).Cols("`commit_id`", "`size`", "`is_primary`").Update(s); err != nil { return err } upd = true @@ -114,11 +145,11 @@ func (repo *Repository) UpdateLanguageStats(commitID string, stats map[string]fl // Insert new language if !upd { if _, err := sess.Insert(&LanguageStat{ - RepoID: repo.ID, - CommitID: commitID, - IsPrimary: llang == topLang, - Language: lang, - Percentage: perc, + RepoID: repo.ID, + CommitID: commitID, + IsPrimary: llang == topLang, + Language: lang, + Size: size, }); err != nil { return err } @@ -153,7 +184,7 @@ func CopyLanguageStat(originalRepo, destRepo *Repository) error { return err } RepoLang := make(LanguageStatList, 0, 6) - if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`percentage`").Find(&RepoLang); err != nil { + if err := sess.Where("`repo_id` = ?", originalRepo.ID).Desc("`size`").Find(&RepoLang); err != nil { return err } if len(RepoLang) > 0 { diff --git a/modules/git/repo_language_stats.go b/modules/git/repo_language_stats.go index 8ff8fa20c1..06d7d6aba0 100644 --- 
a/modules/git/repo_language_stats.go +++ b/modules/git/repo_language_stats.go @@ -8,7 +8,6 @@ import ( "bytes" "io" "io/ioutil" - "math" "code.gitea.io/gitea/modules/analyze" @@ -20,8 +19,22 @@ import ( const fileSizeLimit int64 = 16 * 1024 * 1024 +// specialLanguages defines list of languages that are excluded from the calculation +// unless they are the only language present in repository. Only languages which under +// normal circumstances are not considered to be code should be listed here. +var specialLanguages = []string{ + "XML", + "JSON", + "TOML", + "YAML", + "INI", + "SVG", + "Text", + "Markdown", +} + // GetLanguageStats calculates language stats for git repository at specified commit -func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, error) { +func (repo *Repository) GetLanguageStats(commitID string) (map[string]int64, error) { r, err := git.PlainOpen(repo.Path) if err != nil { return nil, err @@ -43,9 +56,8 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e } sizes := make(map[string]int64) - var total int64 err = tree.Files().ForEach(func(f *object.File) error { - if enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || + if f.Size == 0 || enry.IsVendor(f.Name) || enry.IsDotFile(f.Name) || enry.IsDocumentation(f.Name) || enry.IsConfiguration(f.Name) { return nil } @@ -63,8 +75,13 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e return nil } + // group languages, such as Pug -> HTML; SCSS -> CSS + group := enry.GetLanguageGroup(language) + if group != "" { + language = group + } + sizes[language] += f.Size - total += f.Size return nil }) @@ -72,21 +89,14 @@ func (repo *Repository) GetLanguageStats(commitID string) (map[string]float32, e return nil, err } - stats := make(map[string]float32) - var otherPerc float32 = 100 - for language, size := range sizes { - perc := float32(math.Round(float64(size)/float64(total)*1000) / 10) - if perc <= 0.1 { - continue + // 
filter special languages unless they are the only language + if len(sizes) > 1 { + for _, language := range specialLanguages { + delete(sizes, language) } - otherPerc -= perc - stats[language] = perc } - otherPerc = float32(math.Round(float64(otherPerc)*10) / 10) - if otherPerc > 0 { - stats["other"] = otherPerc - } - return stats, nil + + return sizes, nil } func readFile(f *object.File, limit int64) ([]byte, error) { diff --git a/modules/indexer/stats/indexer_test.go b/modules/indexer/stats/indexer_test.go index 29d0f6dbe4..4bcbaa9423 100644 --- a/modules/indexer/stats/indexer_test.go +++ b/modules/indexer/stats/indexer_test.go @@ -34,9 +34,10 @@ func TestRepoStatsIndex(t *testing.T) { repo, err := models.GetRepositoryByID(1) assert.NoError(t, err) + status, err := repo.GetIndexerStatus(models.RepoIndexerTypeStats) + assert.NoError(t, err) + assert.Equal(t, "65f1bf27bc3bf70f64657658635e66094edbcb4d", status.CommitSha) langs, err := repo.GetTopLanguageStats(5) assert.NoError(t, err) - assert.Len(t, langs, 1) - assert.Equal(t, "other", langs[0].Language) - assert.Equal(t, float32(100), langs[0].Percentage) + assert.Empty(t, langs) }