From 033d92997fc16baee097d2b25f08e0984e628abd Mon Sep 17 00:00:00 2001 From: techknowlogick Date: Thu, 25 May 2023 04:13:47 -0400 Subject: [PATCH] Allow skipping forks and mirrors from being indexed (#23187) This PR adds two new options to disable repo/code search indexing of both forks and mirrors. Related: #22842 --- custom/conf/app.example.ini | 4 +++ .../config-cheat-sheet.en-us.md | 1 + modules/indexer/code/indexer.go | 27 ++++++++++++++ modules/setting/indexer.go | 35 ++++++++++--------- 4 files changed, 51 insertions(+), 16 deletions(-) diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini index 344f92be49..c2b721b0cf 100644 --- a/custom/conf/app.example.ini +++ b/custom/conf/app.example.ini @@ -1362,6 +1362,10 @@ LEVEL = Info ;; repo indexer by default disabled, since it uses a lot of disk space ;REPO_INDEXER_ENABLED = false ;; +;; repo indexer units: the items to index could be `sources`, `forks`, `mirrors`, `templates` or any combination of them separated by a comma. +;; If empty, it defaults to `sources` only; to disable code indexing entirely, see REPO_INDEXER_ENABLED. +;REPO_INDEXER_REPO_TYPES = sources,forks,mirrors,templates +;; ;; Code search engine type, could be `bleve` or `elasticsearch`. ;REPO_INDEXER_TYPE = bleve ;; diff --git a/docs/content/doc/administration/config-cheat-sheet.en-us.md b/docs/content/doc/administration/config-cheat-sheet.en-us.md index 9bb2533dac..9616be586d 100644 --- a/docs/content/doc/administration/config-cheat-sheet.en-us.md +++ b/docs/content/doc/administration/config-cheat-sheet.en-us.md @@ -465,6 +465,7 @@ relation to port exhaustion. - `ISSUE_INDEXER_PATH`: **indexers/issues.bleve**: Index file used for issue search; available when ISSUE_INDEXER_TYPE is bleve and elasticsearch. Relative paths will be made absolute against _`AppWorkPath`_. - `REPO_INDEXER_ENABLED`: **false**: Enables code search (uses a lot of disk space, about 6 times more than the repository size). 
+- `REPO_INDEXER_REPO_TYPES`: **sources,forks,mirrors,templates**: Repo indexer units. The items to index could be `sources`, `forks`, `mirrors`, `templates` or any combination of them separated by a comma. If empty, it defaults to `sources` only; to disable code indexing entirely, see `REPO_INDEXER_ENABLED`. - `REPO_INDEXER_TYPE`: **bleve**: Code search engine type, could be `bleve` or `elasticsearch`. - `REPO_INDEXER_PATH`: **indexers/repos.bleve**: Index file used for code search. - `REPO_INDEXER_CONN_STR`: ****: Code indexer connection string, available when `REPO_INDEXER_TYPE` is elasticsearch. i.e. http://elastic:changeme@localhost:9200 diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go index a5e40b52c1..e9b8e76500 100644 --- a/modules/indexer/code/indexer.go +++ b/modules/indexer/code/indexer.go @@ -19,6 +19,7 @@ import ( "code.gitea.io/gitea/modules/queue" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/timeutil" + "code.gitea.io/gitea/modules/util" ) // SearchResult result of performing a search in a repo @@ -91,6 +92,32 @@ func index(ctx context.Context, indexer Indexer, repoID int64) error { return err } + repoTypes := setting.Indexer.RepoIndexerRepoTypes + + if len(repoTypes) == 0 { + repoTypes = []string{"sources"} + } + + // skip forks from being indexed if unit is not present + if !util.SliceContains(repoTypes, "forks") && repo.IsFork { + return nil + } + + // skip mirrors from being indexed if unit is not present + if !util.SliceContains(repoTypes, "mirrors") && repo.IsMirror { + return nil + } + + // skip templates from being indexed if unit is not present + if !util.SliceContains(repoTypes, "templates") && repo.IsTemplate { + return nil + } + + // skip regular repos from being indexed if unit is not present + if !util.SliceContains(repoTypes, "sources") && !repo.IsFork && !repo.IsMirror && !repo.IsTemplate { + return nil + } + sha, err := getDefaultBranchSha(ctx, repo) if err != nil { return 
err diff --git a/modules/setting/indexer.go b/modules/setting/indexer.go index 6836e62311..16f3d80168 100644 --- a/modules/setting/indexer.go +++ b/modules/setting/indexer.go @@ -23,15 +23,16 @@ var Indexer = struct { IssueIndexerName string StartupTimeout time.Duration - RepoIndexerEnabled bool - RepoType string - RepoPath string - RepoConnStr string - RepoIndexerName string - MaxIndexerFileSize int64 - IncludePatterns []glob.Glob - ExcludePatterns []glob.Glob - ExcludeVendored bool + RepoIndexerEnabled bool + RepoIndexerRepoTypes []string + RepoType string + RepoPath string + RepoConnStr string + RepoIndexerName string + MaxIndexerFileSize int64 + IncludePatterns []glob.Glob + ExcludePatterns []glob.Glob + ExcludeVendored bool }{ IssueType: "bleve", IssuePath: "indexers/issues.bleve", @@ -39,13 +40,14 @@ var Indexer = struct { IssueConnAuth: "", IssueIndexerName: "gitea_issues", - RepoIndexerEnabled: false, - RepoType: "bleve", - RepoPath: "indexers/repos.bleve", - RepoConnStr: "", - RepoIndexerName: "gitea_codes", - MaxIndexerFileSize: 1024 * 1024, - ExcludeVendored: true, + RepoIndexerEnabled: false, + RepoIndexerRepoTypes: []string{"sources", "forks", "mirrors", "templates"}, + RepoType: "bleve", + RepoPath: "indexers/repos.bleve", + RepoConnStr: "", + RepoIndexerName: "gitea_codes", + MaxIndexerFileSize: 1024 * 1024, + ExcludeVendored: true, } func loadIndexerFrom(rootCfg ConfigProvider) { @@ -71,6 +73,7 @@ func loadIndexerFrom(rootCfg ConfigProvider) { Indexer.IssueIndexerName = sec.Key("ISSUE_INDEXER_NAME").MustString(Indexer.IssueIndexerName) Indexer.RepoIndexerEnabled = sec.Key("REPO_INDEXER_ENABLED").MustBool(false) + Indexer.RepoIndexerRepoTypes = strings.Split(sec.Key("REPO_INDEXER_REPO_TYPES").MustString("sources,forks,mirrors,templates"), ",") Indexer.RepoType = sec.Key("REPO_INDEXER_TYPE").MustString("bleve") Indexer.RepoPath = filepath.ToSlash(sec.Key("REPO_INDEXER_PATH").MustString(filepath.ToSlash(filepath.Join(AppDataPath, 
"indexers/repos.bleve")))) if !filepath.IsAbs(Indexer.RepoPath) {