package archiver import ( "archive/tar" "bytes" "fmt" "io" "log" "os" "path" "path/filepath" "strconv" "strings" ) // Tar provides facilities for operating TAR archives. // See http://www.gnu.org/software/tar/manual/html_node/Standard.html. type Tar struct { // Whether to overwrite existing files; if false, // an error is returned if the file exists. OverwriteExisting bool // Whether to make all the directories necessary // to create a tar archive in the desired path. MkdirAll bool // A single top-level folder can be implicitly // created by the Archive or Unarchive methods // if the files to be added to the archive // or the files to be extracted from the archive // do not all have a common root. This roughly // mimics the behavior of archival tools integrated // into OS file browsers which create a subfolder // to avoid unexpectedly littering the destination // folder with potentially many files, causing a // problematic cleanup/organization situation. // This feature is available for both creation // and extraction of archives, but may be slightly // inefficient with lots and lots of files, // especially on extraction. ImplicitTopLevelFolder bool // Strip number of leading paths. This feature is available // only during unpacking of the entire archive. StripComponents int // If true, errors encountered during reading // or writing a single file will be logged and // the operation will continue on remaining files. ContinueOnError bool tw *tar.Writer tr *tar.Reader readerWrapFn func(io.Reader) (io.Reader, error) writerWrapFn func(io.Writer) (io.Writer, error) cleanupWrapFn func() } // CheckExt ensures the file extension matches the format. func (*Tar) CheckExt(filename string) error { if !strings.HasSuffix(filename, ".tar") { return fmt.Errorf("filename must have a .tar extension") } return nil } // CheckPath ensures that the filename has not been crafted to perform path traversal attacks func (*Tar) CheckPath(to, filename string) error { to, _ = filepath.Abs(to) //explicit the destination folder to prevent that 'string.HasPrefix' check can be 'bypassed' when no destination folder is supplied in input dest := filepath.Join(to, filename) //prevent path traversal attacks if !strings.HasPrefix(dest, to) { return &IllegalPathError{AbsolutePath: dest, Filename: filename} } return nil } // Archive creates a tarball file at destination containing // the files listed in sources. The destination must end with // ".tar". File paths can be those of regular files or // directories; directories will be recursively added. func (t *Tar) Archive(sources []string, destination string) error { err := t.CheckExt(destination) if t.writerWrapFn == nil && err != nil { return fmt.Errorf("checking extension: %v", err) } if !t.OverwriteExisting && fileExists(destination) { return fmt.Errorf("file already exists: %s", destination) } // make the folder to contain the resulting archive // if it does not already exist destDir := filepath.Dir(destination) if t.MkdirAll && !fileExists(destDir) { err := mkdir(destDir, 0755) if err != nil { return fmt.Errorf("making folder for destination: %v", err) } } out, err := os.Create(destination) if err != nil { return fmt.Errorf("creating %s: %v", destination, err) } defer out.Close() err = t.Create(out) if err != nil { return fmt.Errorf("creating tar: %v", err) } defer t.Close() var topLevelFolder string if t.ImplicitTopLevelFolder && multipleTopLevels(sources) { topLevelFolder = folderNameFromFileName(destination) } for _, source := range sources { err := t.writeWalk(source, topLevelFolder, destination) if err != nil { return fmt.Errorf("walking %s: %v", source, err) } } return nil } // Unarchive unpacks the .tar file at source to destination. // Destination will be treated as a folder name. func (t *Tar) Unarchive(source, destination string) error { if !fileExists(destination) && t.MkdirAll { err := mkdir(destination, 0755) if err != nil { return fmt.Errorf("preparing destination: %v", err) } } // if the files in the archive do not all share a common // root, then make sure we extract to a single subfolder // rather than potentially littering the destination... if t.ImplicitTopLevelFolder { var err error destination, err = t.addTopLevelFolder(source, destination) if err != nil { return fmt.Errorf("scanning source archive: %v", err) } } file, err := os.Open(source) if err != nil { return fmt.Errorf("opening source archive: %v", err) } defer file.Close() err = t.Open(file, 0) if err != nil { return fmt.Errorf("opening tar archive for reading: %v", err) } defer t.Close() for { err := t.untarNext(destination) if err == io.EOF { break } if err != nil { if t.ContinueOnError || IsIllegalPathError(err) { log.Printf("[ERROR] Reading file in tar archive: %v", err) continue } return fmt.Errorf("reading file in tar archive: %v", err) } } return nil } // addTopLevelFolder scans the files contained inside // the tarball named sourceArchive and returns a modified // destination if all the files do not share the same // top-level folder. func (t *Tar) addTopLevelFolder(sourceArchive, destination string) (string, error) { file, err := os.Open(sourceArchive) if err != nil { return "", fmt.Errorf("opening source archive: %v", err) } defer file.Close() // if the reader is to be wrapped, ensure we do that now // or we will not be able to read the archive successfully reader := io.Reader(file) if t.readerWrapFn != nil { reader, err = t.readerWrapFn(reader) if err != nil { return "", fmt.Errorf("wrapping reader: %v", err) } } if t.cleanupWrapFn != nil { defer t.cleanupWrapFn() } tr := tar.NewReader(reader) var files []string for { hdr, err := tr.Next() if err == io.EOF { break } if err != nil { return "", fmt.Errorf("scanning tarball's file listing: %v", err) } files = append(files, hdr.Name) } if multipleTopLevels(files) { destination = filepath.Join(destination, folderNameFromFileName(sourceArchive)) } return destination, nil } func (t *Tar) untarNext(destination string) error { f, err := t.Read() if err != nil { return err // don't wrap error; calling loop must break on io.EOF } defer f.Close() header, ok := f.Header.(*tar.Header) if !ok { return fmt.Errorf("expected header to be *tar.Header but was %T", f.Header) } errPath := t.CheckPath(destination, header.Name) if errPath != nil { return fmt.Errorf("checking path traversal attempt: %v", errPath) } if t.StripComponents > 0 { if strings.Count(header.Name, "/") < t.StripComponents { return nil // skip path with fewer components } for i := 0; i < t.StripComponents; i++ { slash := strings.Index(header.Name, "/") header.Name = header.Name[slash+1:] } } return t.untarFile(f, destination, header) } func (t *Tar) untarFile(f File, destination string, hdr *tar.Header) error { to := filepath.Join(destination, hdr.Name) // do not overwrite existing files, if configured if !f.IsDir() && !t.OverwriteExisting && fileExists(to) { return fmt.Errorf("file already exists: %s", to) } switch hdr.Typeflag { case tar.TypeDir: return mkdir(to, f.Mode()) case tar.TypeReg, tar.TypeRegA, tar.TypeChar, tar.TypeBlock, tar.TypeFifo, tar.TypeGNUSparse: return writeNewFile(to, f, f.Mode()) case tar.TypeSymlink: return writeNewSymbolicLink(to, hdr.Linkname) case tar.TypeLink: return writeNewHardLink(to, filepath.Join(destination, hdr.Linkname)) case tar.TypeXGlobalHeader: return nil // ignore the pax global header from git-generated tarballs default: return fmt.Errorf("%s: unknown type flag: %c", hdr.Name, hdr.Typeflag) } } func (t *Tar) writeWalk(source, topLevelFolder, destination string) error { sourceInfo, err := os.Stat(source) if err != nil { return fmt.Errorf("%s: stat: %v", source, err) } destAbs, err := filepath.Abs(destination) if err != nil { return fmt.Errorf("%s: getting absolute path of destination %s: %v", source, destination, err) } return filepath.Walk(source, func(fpath string, info os.FileInfo, err error) error { handleErr := func(err error) error { if t.ContinueOnError { log.Printf("[ERROR] Walking %s: %v", fpath, err) return nil } return err } if err != nil { return handleErr(fmt.Errorf("traversing %s: %v", fpath, err)) } if info == nil { return handleErr(fmt.Errorf("no file info")) } // make sure we do not copy our output file into itself fpathAbs, err := filepath.Abs(fpath) if err != nil { return handleErr(fmt.Errorf("%s: getting absolute path: %v", fpath, err)) } if within(fpathAbs, destAbs) { return nil } // build the name to be used within the archive nameInArchive, err := makeNameInArchive(sourceInfo, source, topLevelFolder, fpath) if err != nil { return handleErr(err) } var file io.ReadCloser if info.Mode().IsRegular() { file, err = os.Open(fpath) if err != nil { return handleErr(fmt.Errorf("%s: opening: %v", fpath, err)) } defer file.Close() } err = t.Write(File{ FileInfo: FileInfo{ FileInfo: info, CustomName: nameInArchive, }, ReadCloser: file, }) if err != nil { return handleErr(fmt.Errorf("%s: writing: %s", fpath, err)) } return nil }) } // Create opens t for writing a tar archive to out. func (t *Tar) Create(out io.Writer) error { if t.tw != nil { return fmt.Errorf("tar archive is already created for writing") } // wrapping writers allows us to output // compressed tarballs, for example if t.writerWrapFn != nil { var err error out, err = t.writerWrapFn(out) if err != nil { return fmt.Errorf("wrapping writer: %v", err) } } t.tw = tar.NewWriter(out) return nil } // Write writes f to t, which must have been opened for writing first. func (t *Tar) Write(f File) error { if t.tw == nil { return fmt.Errorf("tar archive was not created for writing first") } if f.FileInfo == nil { return fmt.Errorf("no file info") } if f.FileInfo.Name() == "" { return fmt.Errorf("missing file name") } var linkTarget string if isSymlink(f) { var err error linkTarget, err = os.Readlink(f.Name()) if err != nil { return fmt.Errorf("%s: readlink: %v", f.Name(), err) } } hdr, err := tar.FileInfoHeader(f, filepath.ToSlash(linkTarget)) if err != nil { return fmt.Errorf("%s: making header: %v", f.Name(), err) } err = t.tw.WriteHeader(hdr) if err != nil { return fmt.Errorf("%s: writing header: %v", hdr.Name, err) } if f.IsDir() { return nil // directories have no contents } if hdr.Typeflag == tar.TypeReg { if f.ReadCloser == nil { return fmt.Errorf("%s: no way to read file contents", f.Name()) } _, err := io.Copy(t.tw, f) if err != nil { return fmt.Errorf("%s: copying contents: %v", f.Name(), err) } } return nil } // Open opens t for reading an archive from // in. The size parameter is not used. func (t *Tar) Open(in io.Reader, size int64) error { if t.tr != nil { return fmt.Errorf("tar archive is already open for reading") } // wrapping readers allows us to open compressed tarballs if t.readerWrapFn != nil { var err error in, err = t.readerWrapFn(in) if err != nil { return fmt.Errorf("wrapping file reader: %v", err) } } t.tr = tar.NewReader(in) return nil } // Read reads the next file from t, which must have // already been opened for reading. If there are no // more files, the error is io.EOF. The File must // be closed when finished reading from it. func (t *Tar) Read() (File, error) { if t.tr == nil { return File{}, fmt.Errorf("tar archive is not open") } hdr, err := t.tr.Next() if err != nil { return File{}, err // don't wrap error; preserve io.EOF } file := File{ FileInfo: hdr.FileInfo(), Header: hdr, ReadCloser: ReadFakeCloser{t.tr}, } return file, nil } // Close closes the tar archive(s) opened by Create and Open. func (t *Tar) Close() error { var err error if t.tr != nil { t.tr = nil } if t.tw != nil { tw := t.tw t.tw = nil err = tw.Close() } // make sure cleanup of "Reader/Writer wrapper" // (say that ten times fast) happens AFTER the // underlying stream is closed if t.cleanupWrapFn != nil { t.cleanupWrapFn() } return err } // Walk calls walkFn for each visited item in archive. func (t *Tar) Walk(archive string, walkFn WalkFunc) error { file, err := os.Open(archive) if err != nil { return fmt.Errorf("opening archive file: %v", err) } defer file.Close() err = t.Open(file, 0) if err != nil { return fmt.Errorf("opening archive: %v", err) } defer t.Close() for { f, err := t.Read() if err == io.EOF { break } if err != nil { if t.ContinueOnError { log.Printf("[ERROR] Opening next file: %v", err) continue } return fmt.Errorf("opening next file: %v", err) } err = walkFn(f) if err != nil { if err == ErrStopWalk { break } if t.ContinueOnError { log.Printf("[ERROR] Walking %s: %v", f.Name(), err) continue } return fmt.Errorf("walking %s: %v", f.Name(), err) } } return nil } // Extract extracts a single file from the tar archive. // If the target is a directory, the entire folder will // be extracted into destination. func (t *Tar) Extract(source, target, destination string) error { // target refers to a path inside the archive, which should be clean also target = path.Clean(target) // if the target ends up being a directory, then // we will continue walking and extracting files // until we are no longer within that directory var targetDirPath string return t.Walk(source, func(f File) error { th, ok := f.Header.(*tar.Header) if !ok { return fmt.Errorf("expected header to be *tar.Header but was %T", f.Header) } // importantly, cleaning the path strips tailing slash, // which must be appended to folders within the archive name := path.Clean(th.Name) if f.IsDir() && target == name { targetDirPath = path.Dir(name) } if within(target, th.Name) { // either this is the exact file we want, or is // in the directory we want to extract // build the filename we will extract to end, err := filepath.Rel(targetDirPath, th.Name) if err != nil { return fmt.Errorf("relativizing paths: %v", err) } th.Name = end // relativize any hardlink names if th.Typeflag == tar.TypeLink { th.Linkname = filepath.Join(filepath.Base(filepath.Dir(th.Linkname)), filepath.Base(th.Linkname)) } err = t.untarFile(f, destination, th) if err != nil { return fmt.Errorf("extracting file %s: %v", th.Name, err) } // if our target was not a directory, stop walk if targetDirPath == "" { return ErrStopWalk } } else if targetDirPath != "" { // finished walking the entire directory return ErrStopWalk } return nil }) } // Match returns true if the format of file matches this // type's format. It should not affect reader position. func (*Tar) Match(file io.ReadSeeker) (bool, error) { currentPos, err := file.Seek(0, io.SeekCurrent) if err != nil { return false, err } _, err = file.Seek(0, 0) if err != nil { return false, err } defer func() { _, _ = file.Seek(currentPos, io.SeekStart) }() buf := make([]byte, tarBlockSize) if _, err = io.ReadFull(file, buf); err != nil { return false, nil } return hasTarHeader(buf), nil } // hasTarHeader checks passed bytes has a valid tar header or not. buf must // contain at least 512 bytes and if not, it always returns false. func hasTarHeader(buf []byte) bool { if len(buf) < tarBlockSize { return false } b := buf[148:156] b = bytes.Trim(b, " \x00") // clean up all spaces and null bytes if len(b) == 0 { return false // unknown format } hdrSum, err := strconv.ParseUint(string(b), 8, 64) if err != nil { return false } // According to the go official archive/tar, Sun tar uses signed byte // values so this calcs both signed and unsigned var usum uint64 var sum int64 for i, c := range buf { if 148 <= i && i < 156 { c = ' ' // checksum field itself is counted as branks } usum += uint64(uint8(c)) sum += int64(int8(c)) } if hdrSum != usum && int64(hdrSum) != sum { return false // invalid checksum } return true } func (t *Tar) String() string { return "tar" } // NewTar returns a new, default instance ready to be customized and used. func NewTar() *Tar { return &Tar{ MkdirAll: true, } } const tarBlockSize = 512 // Compile-time checks to ensure type implements desired interfaces. var ( _ = Reader(new(Tar)) _ = Writer(new(Tar)) _ = Archiver(new(Tar)) _ = Unarchiver(new(Tar)) _ = Walker(new(Tar)) _ = Extractor(new(Tar)) _ = Matcher(new(Tar)) _ = ExtensionChecker(new(Tar)) _ = FilenameChecker(new(Tar)) ) // DefaultTar is a default instance that is conveniently ready to use. var DefaultTar = NewTar()