From 22eeede885327fca0328b7d5b153e7a6c4211ffa Mon Sep 17 00:00:00 2001 From: wxiaoguang Date: Wed, 12 Jul 2023 17:58:27 +0800 Subject: [PATCH] Do not "guess" the file encoding/BOM when using API to upload files (#25828) Related issue: #18368 It doesn't seem right to "guess" the file encoding/BOM when using API to upload files. The API should save the uploaded content as-is. --- services/repository/files/update.go | 105 +--------------------------- 1 file changed, 3 insertions(+), 102 deletions(-) diff --git a/services/repository/files/update.go b/services/repository/files/update.go index 737f914dd6..1d5f10a3f2 100644 --- a/services/repository/files/update.go +++ b/services/repository/files/update.go @@ -4,7 +4,6 @@ package files import ( - "bytes" "context" "fmt" "path" @@ -12,21 +11,15 @@ import ( "time" "code.gitea.io/gitea/models" - "code.gitea.io/gitea/models/db" git_model "code.gitea.io/gitea/models/git" repo_model "code.gitea.io/gitea/models/repo" user_model "code.gitea.io/gitea/models/user" - "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" "code.gitea.io/gitea/modules/structs" - "code.gitea.io/gitea/modules/util" asymkey_service "code.gitea.io/gitea/services/asymkey" - - stdcharset "golang.org/x/net/html/charset" - "golang.org/x/text/transform" ) // IdentityOptions for a person's identity like an author or committer @@ -66,78 +59,9 @@ type ChangeRepoFilesOptions struct { type RepoFileOptions struct { treePath string fromTreePath string - encoding string - bom bool executable bool } -func detectEncodingAndBOM(entry *git.TreeEntry, repo *repo_model.Repository) (string, bool) { - reader, err := entry.Blob().DataAsync() - if err != nil { - // return default - return "UTF-8", false - } - defer reader.Close() - buf := make([]byte, 1024) - n, err := util.ReadAtMost(reader, buf) - if err != nil { - // return default - return "UTF-8", false - } - buf = buf[:n] - - if setting.LFS.StartServer { - pointer, _ := lfs.ReadPointerFromBuffer(buf) - if pointer.IsValid() { - meta, err := git_model.GetLFSMetaObjectByOid(db.DefaultContext, repo.ID, pointer.Oid) - if err != nil && err != git_model.ErrLFSObjectNotExist { - // return default - return "UTF-8", false - } - if meta != nil { - dataRc, err := lfs.ReadMetaObject(pointer) - if err != nil { - // return default - return "UTF-8", false - } - defer dataRc.Close() - buf = make([]byte, 1024) - n, err = util.ReadAtMost(dataRc, buf) - if err != nil { - // return default - return "UTF-8", false - } - buf = buf[:n] - } - } - } - - encoding, err := charset.DetectEncoding(buf) - if err != nil { - // just default to utf-8 and no bom - return "UTF-8", false - } - if encoding == "UTF-8" { - return encoding, bytes.Equal(buf[0:3], charset.UTF8BOM) - } - charsetEncoding, _ := stdcharset.Lookup(encoding) - if charsetEncoding == nil { - return "UTF-8", false - } - - result, n, err := transform.String(charsetEncoding.NewDecoder(), string(buf)) - if err != nil { - // return default - return "UTF-8", false - } - - if n > 2 { - return encoding, bytes.Equal([]byte(result)[0:3], charset.UTF8BOM) - } - - return encoding, false -} - // ChangeRepoFiles adds, updates or removes multiple files in the given repository func ChangeRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *user_model.User, opts *ChangeRepoFilesOptions) (*structs.FilesResponse, error) { // If no branch name is set, assume default branch @@ -184,8 +108,6 @@ func ChangeRepoFiles(ctx context.Context, repo *repo_model.Repository, doer *use file.Options = &RepoFileOptions{ treePath: treePath, fromTreePath: fromTreePath, - encoding: "UTF-8", - bom: false, executable: false, } treePaths = append(treePaths, treePath) @@ -381,7 +303,6 @@ func handleCheckErrors(file *ChangeRepoFile, commit *git.Commit, opts *ChangeRep // haven't been made. We throw an error if one wasn't provided. return models.ErrSHAOrCommitIDNotProvided{} } - file.Options.encoding, file.Options.bom = detectEncodingAndBOM(fromEntry, repo) file.Options.executable = fromEntry.IsExecutable() } if file.Operation == "create" || file.Operation == "update" { @@ -466,28 +387,8 @@ func CreateOrUpdateFile(ctx context.Context, t *TemporaryUploadRepository, file } } - content := file.Content - if file.Options.bom { - content = string(charset.UTF8BOM) + content - } - if file.Options.encoding != "UTF-8" { - charsetEncoding, _ := stdcharset.Lookup(file.Options.encoding) - if charsetEncoding != nil { - result, _, err := transform.String(charsetEncoding.NewEncoder(), content) - if err != nil { - // Look if we can't encode back in to the original we should just stick with utf-8 - log.Error("Error re-encoding %s (%s) as %s - will stay as UTF-8: %v", file.TreePath, file.FromTreePath, file.Options.encoding, err) - result = content - } - content = result - } else { - log.Error("Unknown encoding: %s", file.Options.encoding) - } - } - // Reset the opts.Content to our adjusted content to ensure that LFS gets the correct content - file.Content = content + treeObjectContent := file.Content var lfsMetaObject *git_model.LFSMetaObject - if setting.LFS.StartServer && hasOldBranch { // Check there is no way this can return multiple infos filename2attribute2info, err := t.gitRepo.CheckAttribute(git.CheckAttributeOpts{ @@ -506,12 +407,12 @@ func CreateOrUpdateFile(ctx context.Context, t *TemporaryUploadRepository, file return err } lfsMetaObject = &git_model.LFSMetaObject{Pointer: pointer, RepositoryID: repoID} - content = pointer.StringContent() + treeObjectContent = pointer.StringContent() } } // Add the object to the database - objectHash, err := t.HashObject(strings.NewReader(content)) + objectHash, err := t.HashObject(strings.NewReader(treeObjectContent)) if err != nil { return err }