
Vendor Update Go Libs (#13444)

* denisenkom/go-mssqldb untagged -> v0.9.0

* github.com/editorconfig/editorconfig-core-go v2.3.7 -> v2.3.8

* github.com/go-testfixtures/testfixtures v3.4.0 -> v3.4.1

* github.com/mholt/archiver v3.3.2 -> v3.5.0

* github.com/olivere/elastic v7.0.20 -> v7.0.21

* github.com/urfave/cli v1.22.4 -> v1.22.5

* github.com/xanzy/go-gitlab v0.38.1 -> v0.39.0

* github.com/yuin/goldmark-meta untagged -> v1.0.0

* github.com/ethantkoenig/rupture 0a76f03a811a -> c3b3b810dc77

* github.com/jaytaylor/html2text 8fb95d837f7d -> 3577fbdbcff7

* github.com/kballard/go-shellquote cd60e84ee657 -> 95032a82bc51

* github.com/msteinert/pam 02ccfbfaf0cc -> 913b8f8cdf8b

* github.com/unknwon/paginater 7748a72e0141 -> 042474bd0eae

* CI.restart()

Co-authored-by: techknowlogick <techknowlogick@gitea.io>
Authored by 6543 on 2020-11-06 19:41:42 +01:00, committed by GitHub
parent eebaa81f43
commit 30ce3731a1
184 changed files with 12387 additions and 2975 deletions


@@ -0,0 +1,469 @@
package lz4block
import (
"encoding/binary"
"math/bits"
"sync"
"github.com/pierrec/lz4/v4/internal/lz4errors"
)
const (
// The following constants are used to setup the compression algorithm.
minMatch = 4 // the minimum size of a match sequence (4 bytes)
winSizeLog = 16 // LZ4 64Kb window size limit
winSize = 1 << winSizeLog
winMask = winSize - 1 // 64Kb window of previous data for dependent blocks
// hashLog determines the size of the hash table used to quickly find a previous match position.
// Its value influences the compression speed and memory usage, the lower the faster,
// but at the expense of the compression ratio.
// 16 seems to be the best compromise for fast compression.
hashLog = 16
htSize = 1 << hashLog
mfLimit = 10 + minMatch // The last match cannot start within the last 14 bytes.
)
func recoverBlock(e *error) {
if r := recover(); r != nil && *e == nil {
*e = lz4errors.ErrInvalidSourceShortBuffer
}
}
// blockHash hashes the lower 6 bytes into a value < htSize.
func blockHash(x uint64) uint32 {
const prime6bytes = 227718039650203
return uint32(((x << (64 - 48)) * prime6bytes) >> (64 - hashLog))
}
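// CompressBlockBound returns the maximum size an LZ4-compressed block of n
// bytes can take, e.g. n=65536 gives 65536 + 65536/255 + 16 = 65809.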
func CompressBlockBound(n int) int {
return n + n/255 + 16
}
func UncompressBlock(src, dst []byte) (int, error) {
if len(src) == 0 {
return 0, nil
}
if di := decodeBlock(dst, src); di >= 0 {
return di, nil
}
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
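// Taken together, CompressBlockBound, CompressBlock (below) and UncompressBlock
// form the block-level round trip. A minimal sketch, written as if inside this
// package (the exported lz4 v4 package offers equivalent wrappers);
// roundTripExample is an illustrative name, not part of the library:
func roundTripExample(src []byte) ([]byte, error) {
	// With a destination sized to the worst-case bound, CompressBlock never
	// signals "incompressible" by returning 0 (see the comment in CompressBlock).
	comp := make([]byte, CompressBlockBound(len(src)))
	n, err := CompressBlock(src, comp)
	if err != nil {
		return nil, err
	}
	// Decompression needs a destination at least as large as the original
	// data, so the uncompressed size must be tracked by the caller.
	out := make([]byte, len(src))
	m, err := UncompressBlock(comp[:n], out)
	if err != nil {
		return nil, err
	}
	return out[:m], nil
}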
type Compressor struct {
// Offsets are at most 64kiB, so we can store only the lower 16 bits of
// match positions: effectively, an offset from some 64kiB block boundary.
//
// When we retrieve such an offset, we interpret it as relative to the last
// block boundary si &^ 0xffff, or the one before, (si &^ 0xffff) - 0x10000,
// depending on which of these is inside the current window. If a table
// entry was generated more than 64kiB back in the input, we find out by
// inspecting the input stream.
table [htSize]uint16
needsReset bool
}
// Get returns the position of a presumptive match for the hash h.
// The match may be a false positive due to a hash collision or an old entry.
// If si < winSize, the return value may be negative.
func (c *Compressor) get(h uint32, si int) int {
h &= htSize - 1
i := int(c.table[h])
i += si &^ winMask
if i >= si {
// Try previous 64kiB block (negative when in first block).
i -= winSize
}
return i
}
func (c *Compressor) put(h uint32, si int) {
h &= htSize - 1
c.table[h] = uint16(si)
}
var compressorPool = sync.Pool{New: func() interface{} { return new(Compressor) }}
func CompressBlock(src, dst []byte) (int, error) {
c := compressorPool.Get().(*Compressor)
n, err := c.CompressBlock(src, dst)
compressorPool.Put(c)
return n, err
}
func (c *Compressor) CompressBlock(src, dst []byte) (int, error) {
if c.needsReset {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.table = [htSize]uint16{}
}
c.needsReset = true // Only false on first call.
// Return 0, nil only if the destination buffer size is < CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))
// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7
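// For example, 640 bytes since the last match means skipping 1 + (640>>7) = 6 bytes ahead.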
// si: Current position of the search.
// anchor: Position of the current literals.
var si, di, anchor int
sn := len(src) - mfLimit
if sn <= 0 {
goto lastLiterals
}
// Fast scan strategy: the hash table only stores the last 4-byte sequences.
for si < sn {
// Hash the next 6 bytes (sequence)...
match := binary.LittleEndian.Uint64(src[si:])
h := blockHash(match)
h2 := blockHash(match >> 8)
// We check a match at s, s+1 and s+2 and pick the first one we get.
// Checking all 3 only requires us to load the source once.
ref := c.get(h, si)
ref2 := c.get(h2, si)
c.put(h, si)
c.put(h2, si+1)
offset := si - ref
if offset <= 0 || offset >= winSize || uint32(match) != binary.LittleEndian.Uint32(src[ref:]) {
// No match. Start calculating another hash.
// The processor can usually do this out-of-order.
h = blockHash(match >> 16)
ref3 := c.get(h, si+2)
// Check the second match at si+1
si += 1
offset = si - ref2
if offset <= 0 || offset >= winSize || uint32(match>>8) != binary.LittleEndian.Uint32(src[ref2:]) {
// No match. Check the third match at si+2
si += 1
offset = si - ref3
c.put(h, si)
if offset <= 0 || offset >= winSize || uint32(match>>16) != binary.LittleEndian.Uint32(src[ref3:]) {
// Skip one extra byte (at si+3) before we check 3 matches again.
si += 2 + (si-anchor)>>adaptSkipLog
continue
}
}
}
// Match found.
lLen := si - anchor // Literal length.
// We already matched 4 bytes.
mLen := 4
// Extend backwards if we can, reducing literals.
tOff := si - offset - 1
for lLen > 0 && tOff >= 0 && src[si-1] == src[tOff] {
si--
tOff--
lLen--
mLen++
}
// Add the match length, so we continue search at the end.
// Use mLen to store the offset base.
si, mLen = si+mLen, si+minMatch
// Find the longest match by looking by batches of 8 bytes.
for si+8 < sn {
x := binary.LittleEndian.Uint64(src[si:]) ^ binary.LittleEndian.Uint64(src[si-offset:])
if x == 0 {
si += 8
} else {
// Stop at the first non-zero byte.
si += bits.TrailingZeros64(x) >> 3
break
}
}
mLen = si - mLen
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}
// Encode literals length.
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF; l -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(l)
}
di++
// Literals.
if di+lLen > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
di += lLen + 2
anchor = si
// Encode offset.
if di > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF && di < len(dst); mLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(mLen)
di++
}
// Check if we can load next values.
if si >= sn {
break
}
// Hash match end-2
h = blockHash(binary.LittleEndian.Uint64(src[si-2:]))
c.put(h, si-2)
}
lastLiterals:
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}
// Last literals.
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
lLen := len(src) - anchor
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
for lLen -= 0xF; lLen >= 0xFF && di < len(dst); lLen -= 0xFF {
dst[di] = 0xFF
di++
}
if di >= len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
dst[di] = byte(lLen)
}
di++
// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
if di+len(src)-anchor > len(dst) {
return 0, lz4errors.ErrInvalidSourceShortBuffer
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}
// blockHashHC hashes 4 bytes into a value < winSize.
func blockHashHC(x uint32) uint32 {
const hasher uint32 = 2654435761 // Knuth multiplicative hash.
return x * hasher >> (32 - winSizeLog)
}
type CompressorHC struct {
// hashTable: stores the last position found for a given hash
// chainTable: stores previous positions for a given hash
hashTable, chainTable [htSize]int
needsReset bool
}
var compressorHCPool = sync.Pool{New: func() interface{} { return new(CompressorHC) }}
func CompressBlockHC(src, dst []byte, depth CompressionLevel) (int, error) {
c := compressorHCPool.Get().(*CompressorHC)
n, err := c.CompressBlock(src, dst, depth)
compressorHCPool.Put(c)
return n, err
}
func (c *CompressorHC) CompressBlock(src, dst []byte, depth CompressionLevel) (_ int, err error) {
if c.needsReset {
// Zero out reused table to avoid non-deterministic output (issue #65).
c.hashTable = [htSize]int{}
c.chainTable = [htSize]int{}
}
c.needsReset = true // Only false on first call.
defer recoverBlock(&err)
// Return 0, nil only if the destination buffer size is < CompressBlockBound.
isNotCompressible := len(dst) < CompressBlockBound(len(src))
// adaptSkipLog sets how quickly the compressor begins skipping blocks when data is incompressible.
// This significantly speeds up incompressible data and usually has very small impact on compression.
// bytes to skip = 1 + (bytes since last match >> adaptSkipLog)
const adaptSkipLog = 7
var si, di, anchor int
sn := len(src) - mfLimit
if sn <= 0 {
goto lastLiterals
}
if depth == 0 {
depth = winSize
}
for si < sn {
// Hash the next 4 bytes (sequence).
match := binary.LittleEndian.Uint32(src[si:])
h := blockHashHC(match)
// Follow the chain until out of window and give the longest match.
mLen := 0
offset := 0
for next, try := c.hashTable[h], depth; try > 0 && next > 0 && si-next < winSize; next, try = c.chainTable[next&winMask], try-1 {
// The first (mLen==0) or next byte (mLen>=minMatch) at current match length
// must match to improve on the match length.
if src[next+mLen] != src[si+mLen] {
continue
}
ml := 0
// Compare the current position with a previous with the same hash.
for ml < sn-si {
x := binary.LittleEndian.Uint64(src[next+ml:]) ^ binary.LittleEndian.Uint64(src[si+ml:])
if x == 0 {
ml += 8
} else {
// Stop at the first non-zero byte.
ml += bits.TrailingZeros64(x) >> 3
break
}
}
if ml < minMatch || ml <= mLen {
// Match too small (<minMatch) or smaller than the current match.
continue
}
// Found a longer match, keep its position and length.
mLen = ml
offset = si - next
// Try another previous position with the same hash.
}
c.chainTable[si&winMask] = c.hashTable[h]
c.hashTable[h] = si
// No match found.
if mLen == 0 {
si += 1 + (si-anchor)>>adaptSkipLog
continue
}
// Match found.
// Update hash/chain tables with overlapping bytes:
// si already hashed, add everything from si+1 up to the match length.
winStart := si + 1
if ws := si + mLen - winSize; ws > winStart {
winStart = ws
}
for si, ml := winStart, si+mLen; si < ml; {
match >>= 8
match |= uint32(src[si+3]) << 24
h := blockHashHC(match)
c.chainTable[si&winMask] = c.hashTable[h]
c.hashTable[h] = si
si++
}
lLen := si - anchor
si += mLen
mLen -= minMatch // Match length does not include minMatch.
if mLen < 0xF {
dst[di] = byte(mLen)
} else {
dst[di] = 0xF
}
// Encode literals length.
if lLen < 0xF {
dst[di] |= byte(lLen << 4)
} else {
dst[di] |= 0xF0
di++
l := lLen - 0xF
for ; l >= 0xFF; l -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(l)
}
di++
// Literals.
copy(dst[di:di+lLen], src[anchor:anchor+lLen])
di += lLen
anchor = si
// Encode offset.
di += 2
dst[di-2], dst[di-1] = byte(offset), byte(offset>>8)
// Encode match length part 2.
if mLen >= 0xF {
for mLen -= 0xF; mLen >= 0xFF; mLen -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(mLen)
di++
}
}
if isNotCompressible && anchor == 0 {
// Incompressible.
return 0, nil
}
// Last literals.
lastLiterals:
lLen := len(src) - anchor
if lLen < 0xF {
dst[di] = byte(lLen << 4)
} else {
dst[di] = 0xF0
di++
lLen -= 0xF
for ; lLen >= 0xFF; lLen -= 0xFF {
dst[di] = 0xFF
di++
}
dst[di] = byte(lLen)
}
di++
// Write the last literals.
if isNotCompressible && di >= anchor {
// Incompressible.
return 0, nil
}
di += copy(dst[di:di+len(src)-anchor], src[anchor:])
return di, nil
}


@@ -0,0 +1,88 @@
// Package lz4block provides LZ4 BlockSize types and pools of buffers.
package lz4block
import "sync"
const (
Block64Kb uint32 = 1 << (16 + iota*2)
Block256Kb
Block1Mb
Block4Mb
Block8Mb = 2 * Block4Mb
legacyBlockSize = Block8Mb + Block8Mb/255 + 16 // CompressBound(Block8Mb)
)
var (
BlockPool64K = sync.Pool{New: func() interface{} { return make([]byte, Block64Kb) }}
BlockPool256K = sync.Pool{New: func() interface{} { return make([]byte, Block256Kb) }}
BlockPool1M = sync.Pool{New: func() interface{} { return make([]byte, Block1Mb) }}
BlockPool4M = sync.Pool{New: func() interface{} { return make([]byte, Block4Mb) }}
BlockPool8M = sync.Pool{New: func() interface{} { return make([]byte, legacyBlockSize) }}
)
func Index(b uint32) BlockSizeIndex {
switch b {
case Block64Kb:
return 4
case Block256Kb:
return 5
case Block1Mb:
return 6
case Block4Mb:
return 7
case Block8Mb: // only valid in legacy mode
return 3
}
return 0
}
func IsValid(b uint32) bool {
return Index(b) > 0
}
type BlockSizeIndex uint8
func (b BlockSizeIndex) IsValid() bool {
switch b {
case 4, 5, 6, 7:
return true
}
return false
}
func (b BlockSizeIndex) Get() []byte {
var buf interface{}
switch b {
case 4:
buf = BlockPool64K.Get()
case 5:
buf = BlockPool256K.Get()
case 6:
buf = BlockPool1M.Get()
case 7:
buf = BlockPool4M.Get()
case 3:
buf = BlockPool8M.Get()
}
return buf.([]byte)
}
func Put(buf []byte) {
// Safeguard: do not allow invalid buffers.
switch c := cap(buf); uint32(c) {
case Block64Kb:
BlockPool64K.Put(buf[:c])
case Block256Kb:
BlockPool256K.Put(buf[:c])
case Block1Mb:
BlockPool1M.Put(buf[:c])
case Block4Mb:
BlockPool4M.Put(buf[:c])
case legacyBlockSize:
BlockPool8M.Put(buf[:c])
}
}
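// A sketch of how these pools are meant to be used together: map a block size
// to its pool index once, then borrow and return buffers through it. The
// helper name and callback are illustrative only, not part of this package:
func withBlockBuffer(blockSize uint32, fn func(buf []byte)) {
	idx := Index(blockSize) // 0 means blockSize is not a supported block size
	if idx == 0 {
		return
	}
	buf := idx.Get() // borrow a buffer of at least blockSize bytes from its pool
	defer Put(buf)   // Put matches the buffer to its pool by capacity
	fn(buf)
}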
type CompressionLevel uint32
const Fast CompressionLevel = 0


@@ -0,0 +1,369 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
// AX scratch
// BX scratch
// CX scratch
// DX token
//
// DI &dst
// SI &src
// R8 &dst + len(dst)
// R9 &src + len(src)
// R11 &dst
// R12 short output end
// R13 short input end
// func decodeBlock(dst, src []byte) int
// using 50 bytes of stack currently
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
MOVQ dst_base+0(FP), DI
MOVQ DI, R11
MOVQ dst_len+8(FP), R8
ADDQ DI, R8
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R9
CMPQ R9, $0
JE err_corrupt
ADDQ SI, R9
// shortcut ends
// short output end
MOVQ R8, R12
SUBQ $32, R12
// short input end
MOVQ R9, R13
SUBQ $16, R13
loop:
// for si < len(src)
CMPQ SI, R9
JGE end
// token := uint32(src[si])
MOVBQZX (SI), DX
INCQ SI
// lit_len = token >> 4
// if lit_len > 0
// CX = lit_len
MOVQ DX, CX
SHRQ $4, CX
// if lit_len != 0xF
CMPQ CX, $0xF
JEQ lit_len_loop_pre
CMPQ DI, R12
JGE lit_len_loop_pre
CMPQ SI, R13
JGE lit_len_loop_pre
// copy shortcut
// A two-stage shortcut for the most common case:
// 1) If the literal length is 0..14, and there is enough space,
// enter the shortcut and copy 16 bytes on behalf of the literals
// (in the fast mode, only 8 bytes can be safely copied this way).
// 2) Further if the match length is 4..18, copy 18 bytes in a similar
// manner; but we ensure that there's enough space in the output for
// those 18 bytes earlier, upon entering the shortcut (in other words,
// there is a combined check for both stages).
// copy literal
MOVOU (SI), X0
MOVOU X0, (DI)
ADDQ CX, DI
ADDQ CX, SI
MOVQ DX, CX
ANDQ $0xF, CX
// The second stage: prepare for match copying, decode full info.
// If it doesn't work out, the info won't be wasted.
// offset := uint16(data[:2])
MOVWQZX (SI), DX
ADDQ $2, SI
MOVQ DI, AX
SUBQ DX, AX
CMPQ AX, DI
JGT err_short_buf
// if we can't do the second stage then jump straight to read the
// match length, we already have the offset.
CMPQ CX, $0xF
JEQ match_len_loop_pre
CMPQ DX, $8
JLT match_len_loop_pre
CMPQ AX, R11
JLT err_short_buf
// memcpy(op + 0, match + 0, 8);
MOVQ (AX), BX
MOVQ BX, (DI)
// memcpy(op + 8, match + 8, 8);
MOVQ 8(AX), BX
MOVQ BX, 8(DI)
// memcpy(op +16, match +16, 2);
MOVW 16(AX), BX
MOVW BX, 16(DI)
LEAQ 4(DI)(CX*1), DI // minmatch
// shortcut complete, load next token
JMP loop
lit_len_loop_pre:
// if lit_len > 0
CMPQ CX, $0
JEQ offset
CMPQ CX, $0xF
JNE copy_literal
lit_len_loop:
// for src[si] == 0xFF
CMPB (SI), $0xFF
JNE lit_len_finalise
// bounds check src[si+1]
LEAQ 1(SI), AX
CMPQ AX, R9
JGT err_short_buf
// lit_len += 0xFF
ADDQ $0xFF, CX
INCQ SI
JMP lit_len_loop
lit_len_finalise:
// lit_len += int(src[si])
// si++
MOVBQZX (SI), AX
ADDQ AX, CX
INCQ SI
copy_literal:
// bounds check src and dst
LEAQ (SI)(CX*1), AX
CMPQ AX, R9
JGT err_short_buf
LEAQ (DI)(CX*1), AX
CMPQ AX, R8
JGT err_short_buf
// what's a good cutoff to call memmove?
CMPQ CX, $16
JGT memmove_lit
// if len(dst[di:]) < 16
MOVQ R8, AX
SUBQ DI, AX
CMPQ AX, $16
JLT memmove_lit
// if len(src[si:]) < 16
MOVQ R9, AX
SUBQ SI, AX
CMPQ AX, $16
JLT memmove_lit
MOVOU (SI), X0
MOVOU X0, (DI)
JMP finish_lit_copy
memmove_lit:
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ SI, 8(SP)
MOVQ CX, 16(SP)
// spill
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVQ CX, 40(SP) // need len to inc SI, DI after
MOVB DX, 48(SP)
CALL runtime·memmove(SB)
// restore registers
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVQ 40(SP), CX
MOVB 48(SP), DX
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
finish_lit_copy:
ADDQ CX, SI
ADDQ CX, DI
CMPQ SI, R9
JGE end
offset:
// CX := mLen
// free up DX to use for offset
MOVQ DX, CX
LEAQ 2(SI), AX
CMPQ AX, R9
JGT err_short_buf
// offset
// DX := int(src[si]) | int(src[si+1])<<8
MOVWQZX (SI), DX
ADDQ $2, SI
// 0 offset is invalid
CMPQ DX, $0
JEQ err_corrupt
ANDB $0xF, CX
match_len_loop_pre:
// if mlen != 0xF
CMPB CX, $0xF
JNE copy_match
match_len_loop:
// for src[si] == 0xFF
// lit_len += 0xFF
CMPB (SI), $0xFF
JNE match_len_finalise
// bounds check src[si+1]
LEAQ 1(SI), AX
CMPQ AX, R9
JGT err_short_buf
ADDQ $0xFF, CX
INCQ SI
JMP match_len_loop
match_len_finalise:
// lit_len += int(src[si])
// si++
MOVBQZX (SI), AX
ADDQ AX, CX
INCQ SI
copy_match:
// mLen += minMatch
ADDQ $4, CX
// check we have match_len bytes left in dst
// di+match_len < len(dst)
LEAQ (DI)(CX*1), AX
CMPQ AX, R8
JGT err_short_buf
// DX = offset
// CX = match_len
// BX = &dst + (di - offset)
MOVQ DI, BX
SUBQ DX, BX
// check BX is within dst
// if BX < &dst
CMPQ BX, R11
JLT err_short_buf
// if offset + match_len < di
LEAQ (BX)(CX*1), AX
CMPQ DI, AX
JGT copy_interior_match
// AX := len(dst[:di])
// MOVQ DI, AX
// SUBQ R11, AX
// copy 16 bytes at a time
// if di-offset < 16 copy 16-(di-offset) bytes to di
// then do the remaining
copy_match_loop:
// for match_len >= 0
// dst[di] = dst[i]
// di++
// i++
MOVB (BX), AX
MOVB AX, (DI)
INCQ DI
INCQ BX
DECQ CX
CMPQ CX, $0
JGT copy_match_loop
JMP loop
copy_interior_match:
CMPQ CX, $16
JGT memmove_match
// if len(dst[di:]) < 16
MOVQ R8, AX
SUBQ DI, AX
CMPQ AX, $16
JLT memmove_match
MOVOU (BX), X0
MOVOU X0, (DI)
ADDQ CX, DI
JMP loop
memmove_match:
// memmove(to, from, len)
MOVQ DI, 0(SP)
MOVQ BX, 8(SP)
MOVQ CX, 16(SP)
// spill
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVQ CX, 40(SP) // need len to inc SI, DI after
CALL runtime·memmove(SB)
// restore registers
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVQ 40(SP), CX
// recalc initial values
MOVQ dst_base+0(FP), R8
MOVQ R8, R11 // TODO: make these sensible numbers
ADDQ dst_len+8(FP), R8
MOVQ src_base+24(FP), R9
ADDQ src_len+32(FP), R9
MOVQ R8, R12
SUBQ $32, R12
MOVQ R9, R13
SUBQ $16, R13
ADDQ CX, DI
JMP loop
err_corrupt:
MOVQ $-1, ret+48(FP)
RET
err_short_buf:
MOVQ $-2, ret+48(FP)
RET
end:
SUBQ R11, DI
MOVQ DI, ret+48(FP)
RET


@@ -0,0 +1,201 @@
// +build gc
// +build !noasm
#include "textflag.h"
// Register allocation.
#define dst R0
#define dstorig R1
#define src R2
#define dstend R3
#define srcend R4
#define match R5 // Match address.
#define token R6
#define len R7 // Literal and match lengths.
#define offset R6 // Match offset; overlaps with token.
#define tmp1 R8
#define tmp2 R9
#define tmp3 R12
#define minMatch $4
// func decodeBlock(dst, src []byte) int
TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-28
MOVW dst_base +0(FP), dst
MOVW dst_len +4(FP), dstend
MOVW src_base+12(FP), src
MOVW src_len +16(FP), srcend
CMP $0, srcend
BEQ shortSrc
ADD dst, dstend
ADD src, srcend
MOVW dst, dstorig
loop:
// Read token. Extract literal length.
MOVBU.P 1(src), token
MOVW token >> 4, len
CMP $15, len
BNE readLitlenDone
readLitlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADD tmp1, len
CMP $255, tmp1
BEQ readLitlenLoop
readLitlenDone:
CMP $0, len
BEQ copyLiteralDone
// Bounds check dst+len and src+len.
ADD dst, len, tmp1
CMP dstend, tmp1
//BHI shortDst // Uncomment for distinct error codes.
ADD src, len, tmp2
CMP.LS srcend, tmp2
BHI shortSrc
// Copy literal.
CMP $4, len
BLO copyLiteralFinish
// Copy 0-3 bytes until src is aligned.
TST $1, src
MOVBU.NE.P 1(src), tmp1
MOVB.NE.P tmp1, 1(dst)
SUB.NE $1, len
TST $2, src
MOVHU.NE.P 2(src), tmp2
MOVB.NE.P tmp2, 1(dst)
MOVW.NE tmp2 >> 8, tmp1
MOVB.NE.P tmp1, 1(dst)
SUB.NE $2, len
B copyLiteralLoopCond
copyLiteralLoop:
// Aligned load, unaligned write.
MOVW.P 4(src), tmp1
MOVW tmp1 >> 8, tmp2
MOVB tmp2, 1(dst)
MOVW tmp1 >> 16, tmp3
MOVB tmp3, 2(dst)
MOVW tmp1 >> 24, tmp2
MOVB tmp2, 3(dst)
MOVB.P tmp1, 4(dst)
copyLiteralLoopCond:
// Loop until len-4 < 0.
SUB.S $4, len
BPL copyLiteralLoop
// Restore len, which is now negative.
ADD $4, len
copyLiteralFinish:
// Copy remaining 0-3 bytes.
TST $2, len
MOVHU.NE.P 2(src), tmp2
MOVB.NE.P tmp2, 1(dst)
MOVW.NE tmp2 >> 8, tmp1
MOVB.NE.P tmp1, 1(dst)
TST $1, len
MOVBU.NE.P 1(src), tmp1
MOVB.NE.P tmp1, 1(dst)
copyLiteralDone:
CMP src, srcend
BEQ end
// Initial part of match length.
// This frees up the token register for reuse as offset.
AND $15, token, len
// Read offset.
ADD $2, src
CMP srcend, src
BHI shortSrc
MOVBU -2(src), offset
MOVBU -1(src), tmp1
ORR tmp1 << 8, offset
CMP $0, offset
BEQ corrupt
// Read rest of match length.
CMP $15, len
BNE readMatchlenDone
readMatchlenLoop:
CMP src, srcend
BEQ shortSrc
MOVBU.P 1(src), tmp1
ADD tmp1, len
CMP $255, tmp1
BEQ readMatchlenLoop
readMatchlenDone:
ADD minMatch, len
// Bounds check dst+len and match = dst-offset.
ADD dst, len, tmp1
CMP dstend, tmp1
//BHI shortDst // Uncomment for distinct error codes.
SUB offset, dst, match
CMP.LS match, dstorig
BHI corrupt
// If the offset is at least four (len is, because of minMatch),
// do a four-way unrolled byte copy loop. Using MOVD instead of four
// byte loads is much faster, but to remain portable we'd have to
// align match first, which in turn is too expensive.
CMP $4, offset
BLO copyMatch
SUB $4, len
copyMatch4:
MOVBU.P 4(match), tmp1
MOVB.P tmp1, 4(dst)
MOVBU -3(match), tmp2
MOVB tmp2, -3(dst)
MOVBU -2(match), tmp3
MOVB tmp3, -2(dst)
MOVBU -1(match), tmp1
MOVB tmp1, -1(dst)
SUB.S $4, len
BPL copyMatch4
// Restore len, which is now negative.
ADD.S $4, len
BEQ copyMatchDone
copyMatch:
// Simple byte-at-a-time copy.
SUB.S $1, len
MOVBU.P 1(match), tmp2
MOVB.P tmp2, 1(dst)
BNE copyMatch
copyMatchDone:
CMP src, srcend
BNE loop
end:
SUB dstorig, dst, tmp1
MOVW tmp1, ret+24(FP)
RET
// The three error cases have distinct labels so we can put different
// return codes here when debugging, or if the error returns need to
// be changed.
shortDst:
shortSrc:
corrupt:
MOVW $-1, tmp1
MOVW tmp1, ret+24(FP)
RET


@@ -0,0 +1,9 @@
// +build amd64 arm
// +build !appengine
// +build gc
// +build !noasm
package lz4block
//go:noescape
func decodeBlock(dst, src []byte) int


@@ -0,0 +1,100 @@
// +build !amd64,!arm appengine !gc noasm
package lz4block
func decodeBlock(dst, src []byte) (ret int) {
const hasError = -2
defer func() {
if recover() != nil {
ret = hasError
}
}()
var si, di uint
for {
// Literals and match lengths (token).
b := uint(src[si])
si++
// Literals.
if lLen := b >> 4; lLen > 0 {
switch {
case lLen < 0xF && si+16 < uint(len(src)):
// Shortcut 1
// if we have enough room in src and dst, and the literals length
// is small enough (0..14) then copy all 16 bytes, even if not all
// are part of the literals.
copy(dst[di:], src[si:si+16])
si += lLen
di += lLen
if mLen := b & 0xF; mLen < 0xF {
// Shortcut 2
// if the match length (4..18) fits within the literals, then copy
// all 18 bytes, even if not all are part of the literals.
mLen += 4
if offset := uint(src[si]) | uint(src[si+1])<<8; mLen <= offset {
i := di - offset
end := i + 18
if end > uint(len(dst)) {
// The remaining buffer may not hold 18 bytes.
// See https://github.com/pierrec/lz4/issues/51.
end = uint(len(dst))
}
copy(dst[di:], dst[i:end])
si += 2
di += mLen
continue
}
}
case lLen == 0xF:
for src[si] == 0xFF {
lLen += 0xFF
si++
}
lLen += uint(src[si])
si++
fallthrough
default:
copy(dst[di:di+lLen], src[si:si+lLen])
si += lLen
di += lLen
}
}
if si == uint(len(src)) {
return int(di)
} else if si > uint(len(src)) {
return hasError
}
offset := uint(src[si]) | uint(src[si+1])<<8
if offset == 0 {
return hasError
}
si += 2
// Match.
mLen := b & 0xF
if mLen == 0xF {
for src[si] == 0xFF {
mLen += 0xFF
si++
}
mLen += uint(src[si])
si++
}
mLen += minMatch
// Copy the match.
expanded := dst[di-offset:]
if mLen > offset {
// Efficiently copy the match dst[di-offset:di] into the dst slice.
bytesToCopy := offset * (mLen / offset)
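// e.g. offset=2, mLen=7: copy 6 bytes here by repeatedly doubling the 2-byte period; the remaining byte is copied below.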
for n := offset; n <= bytesToCopy+offset; n *= 2 {
copy(expanded[n:], expanded[:n])
}
di += bytesToCopy
mLen -= bytesToCopy
}
di += uint(copy(dst[di:di+mLen], expanded[:mLen]))
}
}


@@ -0,0 +1,19 @@
package lz4errors
type Error string
func (e Error) Error() string { return string(e) }
const (
ErrInvalidSourceShortBuffer Error = "lz4: invalid source or destination buffer too short"
ErrInvalidFrame Error = "lz4: bad magic number"
ErrInternalUnhandledState Error = "lz4: unhandled state"
ErrInvalidHeaderChecksum Error = "lz4: invalid header checksum"
ErrInvalidBlockChecksum Error = "lz4: invalid block checksum"
ErrInvalidFrameChecksum Error = "lz4: invalid frame checksum"
ErrOptionInvalidCompressionLevel Error = "lz4: invalid compression level"
ErrOptionClosedOrError Error = "lz4: cannot apply options on closed or in error object"
ErrOptionInvalidBlockSize Error = "lz4: invalid block size"
ErrOptionNotApplicable Error = "lz4: option not applicable"
ErrWriterNotClosed Error = "lz4: writer not closed"
)
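// Since the stream code wraps these constants with %w (see the block checksum
// handling in lz4stream further down), callers can match them with errors.Is.
// A sketch, assuming the error value has been surfaced to the caller:
//
//	if errors.Is(err, ErrInvalidBlockChecksum) {
//		// the block data was read fully but its checksum did not match
//	}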


@@ -0,0 +1,331 @@
package lz4stream
import (
"encoding/binary"
"fmt"
"io"
"sync"
"github.com/pierrec/lz4/v4/internal/lz4block"
"github.com/pierrec/lz4/v4/internal/lz4errors"
"github.com/pierrec/lz4/v4/internal/xxh32"
)
type Blocks struct {
Block *FrameDataBlock
Blocks chan chan *FrameDataBlock
mu sync.Mutex
err error
}
func (b *Blocks) initW(f *Frame, dst io.Writer, num int) {
if num == 1 {
b.Blocks = nil
b.Block = NewFrameDataBlock(f)
return
}
b.Block = nil
if cap(b.Blocks) != num {
b.Blocks = make(chan chan *FrameDataBlock, num)
}
// goroutine managing concurrent block compression goroutines.
go func() {
// Process next block compression item.
for c := range b.Blocks {
// Read the next compressed block result.
// Waiting here ensures that the blocks are output in the order they were sent.
// The incoming channel is always closed as it indicates to the caller that
// the block has been processed.
block := <-c
if block == nil {
// Notify the block compression routine that we are done with its result.
// This is used when a sentinel block is sent to terminate the compression.
close(c)
return
}
// Do not attempt to write the block upon any previous failure.
if b.err == nil {
// Write the block.
if err := block.Write(f, dst); err != nil {
// Keep the first error.
b.err = err
// All pending compression goroutines need to shut down, so we need to keep going.
}
}
close(c)
}
}()
}
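// The pattern above -- one result channel per submitted block, drained by a
// single goroutine in submission order -- keeps compression concurrent while
// preserving output order. A standalone sketch of the same idea, with all
// names invented for illustration:
func orderedPipeline(jobs [][]byte, process func([]byte) []byte, emit func([]byte)) {
	results := make(chan chan []byte, len(jobs))
	done := make(chan struct{})
	go func() {
		defer close(done)
		// Drain result channels in the order the jobs were submitted.
		for c := range results {
			emit(<-c)
		}
	}()
	for _, job := range jobs {
		c := make(chan []byte, 1)
		results <- c // enqueue the result slot before starting the worker
		go func(job []byte, c chan []byte) {
			c <- process(job) // workers may finish out of order
		}(job, c)
	}
	close(results)
	<-done
}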
func (b *Blocks) close(f *Frame, num int) error {
if num == 1 {
if b.Block != nil {
b.Block.Close(f)
}
err := b.err
b.err = nil
return err
}
if b.Blocks == nil {
// Not initialized yet.
return nil
}
c := make(chan *FrameDataBlock)
b.Blocks <- c
c <- nil
<-c
err := b.err
b.err = nil
return err
}
// ErrorR returns any error set while uncompressing a stream.
func (b *Blocks) ErrorR() error {
b.mu.Lock()
defer b.mu.Unlock()
return b.err
}
// initR returns a channel that streams the uncompressed blocks if in concurrent
// mode and no error. When the channel is closed, check for any error with b.ErrorR.
//
// If not in concurrent mode, the uncompressed block is b.Block and the returned error
// needs to be checked.
func (b *Blocks) initR(f *Frame, num int, src io.Reader) (chan []byte, error) {
size := f.Descriptor.Flags.BlockSizeIndex()
if num == 1 {
b.Blocks = nil
b.Block = NewFrameDataBlock(f)
return nil, nil
}
b.Block = nil
blocks := make(chan chan []byte, num)
// data receives the uncompressed blocks.
data := make(chan []byte)
// Read blocks from the source sequentially
// and uncompress them concurrently.
// In legacy mode, accrue the uncompressed sizes in cum.
var cum uint32
go func() {
var cumx uint32
var err error
for b.ErrorR() == nil {
block := NewFrameDataBlock(f)
cumx, err = block.Read(f, src, 0)
if err != nil {
break
}
// Recheck for an error as reading may be slow and uncompressing is expensive.
if b.ErrorR() != nil {
break
}
c := make(chan []byte)
blocks <- c
go func() {
data, err := block.Uncompress(f, size.Get(), false)
if err != nil {
b.closeR(err)
} else {
c <- data
}
}()
}
// End the collection loop and the data channel.
c := make(chan []byte)
blocks <- c
c <- nil // signal the collection loop that we are done
<-c // wait for the collect loop to complete
if f.isLegacy() && cum == cumx {
err = io.EOF
}
b.closeR(err)
close(data)
}()
// Collect the uncompressed blocks and make them available
// on the returned channel.
go func(leg bool) {
defer close(blocks)
for c := range blocks {
buf := <-c
if buf == nil {
// Signal to end the loop.
close(c)
return
}
// Perform checksum now as the blocks are received in order.
if f.Descriptor.Flags.ContentChecksum() {
_, _ = f.checksum.Write(buf)
}
if leg {
cum += uint32(len(buf))
}
data <- buf
close(c)
}
}(f.isLegacy())
return data, nil
}
// closeR safely sets the error on b if not already set.
func (b *Blocks) closeR(err error) {
b.mu.Lock()
if b.err == nil {
b.err = err
}
b.mu.Unlock()
}
func NewFrameDataBlock(f *Frame) *FrameDataBlock {
buf := f.Descriptor.Flags.BlockSizeIndex().Get()
return &FrameDataBlock{Data: buf, data: buf}
}
type FrameDataBlock struct {
Size DataBlockSize
Data []byte // compressed or uncompressed data (.data or .src)
Checksum uint32
data []byte // buffer for compressed data
src []byte // uncompressed data
err error // used in concurrent mode
}
func (b *FrameDataBlock) Close(f *Frame) {
b.Size = 0
b.Checksum = 0
b.err = nil
if b.data != nil {
// Block was not already closed.
lz4block.Put(b.data)
b.Data = nil
b.data = nil
b.src = nil
}
}
// Block compression errors are ignored since the buffer is sized appropriately.
func (b *FrameDataBlock) Compress(f *Frame, src []byte, level lz4block.CompressionLevel) *FrameDataBlock {
data := b.data
if f.isLegacy() {
data = data[:cap(data)]
} else {
data = data[:len(src)] // trigger the incompressible flag in CompressBlock
}
var n int
switch level {
case lz4block.Fast:
n, _ = lz4block.CompressBlock(src, data)
default:
n, _ = lz4block.CompressBlockHC(src, data, level)
}
if n == 0 {
b.Size.UncompressedSet(true)
b.Data = src
} else {
b.Size.UncompressedSet(false)
b.Data = data[:n]
}
b.Size.sizeSet(len(b.Data))
b.src = src // keep track of the source for content checksum
if f.Descriptor.Flags.BlockChecksum() {
b.Checksum = xxh32.ChecksumZero(src)
}
return b
}
func (b *FrameDataBlock) Write(f *Frame, dst io.Writer) error {
// Write is called in the same order as blocks are compressed,
// so content checksum must be done here.
if f.Descriptor.Flags.ContentChecksum() {
_, _ = f.checksum.Write(b.src)
}
buf := f.buf[:]
binary.LittleEndian.PutUint32(buf, uint32(b.Size))
if _, err := dst.Write(buf[:4]); err != nil {
return err
}
if _, err := dst.Write(b.Data); err != nil {
return err
}
if b.Checksum == 0 {
return nil
}
binary.LittleEndian.PutUint32(buf, b.Checksum)
_, err := dst.Write(buf[:4])
return err
}
// Read updates b with the next block data, size and checksum if available.
func (b *FrameDataBlock) Read(f *Frame, src io.Reader, cum uint32) (uint32, error) {
x, err := f.readUint32(src)
if err != nil {
return 0, err
}
if f.isLegacy() {
switch x {
case frameMagicLegacy:
// Concatenated legacy frame.
return b.Read(f, src, cum)
case cum:
// Only works in non concurrent mode, for concurrent mode
// it is handled separately.
// Linux kernel format appends the total uncompressed size at the end.
return 0, io.EOF
}
} else if x == 0 {
// Marker for end of stream.
return 0, io.EOF
}
b.Size = DataBlockSize(x)
size := b.Size.size()
if size > cap(b.data) {
return x, lz4errors.ErrOptionInvalidBlockSize
}
b.data = b.data[:size]
if _, err := io.ReadFull(src, b.data); err != nil {
return x, err
}
if f.Descriptor.Flags.BlockChecksum() {
sum, err := f.readUint32(src)
if err != nil {
return 0, err
}
b.Checksum = sum
}
return x, nil
}
func (b *FrameDataBlock) Uncompress(f *Frame, dst []byte, sum bool) ([]byte, error) {
if b.Size.Uncompressed() {
n := copy(dst, b.data)
dst = dst[:n]
} else {
n, err := lz4block.UncompressBlock(b.data, dst)
if err != nil {
return nil, err
}
dst = dst[:n]
}
if f.Descriptor.Flags.BlockChecksum() {
if c := xxh32.ChecksumZero(dst); c != b.Checksum {
err := fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidBlockChecksum, c, b.Checksum)
return nil, err
}
}
if sum && f.Descriptor.Flags.ContentChecksum() {
_, _ = f.checksum.Write(dst)
}
return dst, nil
}
func (f *Frame) readUint32(r io.Reader) (x uint32, err error) {
if _, err = io.ReadFull(r, f.buf[:4]); err != nil {
return
}
x = binary.LittleEndian.Uint32(f.buf[:4])
return
}


@@ -0,0 +1,200 @@
// Package lz4stream provides the types that support reading and writing LZ4 data streams.
package lz4stream
import (
"encoding/binary"
"fmt"
"io"
"io/ioutil"
"github.com/pierrec/lz4/v4/internal/lz4block"
"github.com/pierrec/lz4/v4/internal/lz4errors"
"github.com/pierrec/lz4/v4/internal/xxh32"
)
//go:generate go run gen.go
const (
frameMagic uint32 = 0x184D2204
frameSkipMagic uint32 = 0x184D2A50
frameMagicLegacy uint32 = 0x184C2102
)
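// For reference, how the first four little-endian bytes of a stream map onto
// these magic numbers (mirroring the switch in InitR below; the helper name
// is illustrative, and b is expected to hold at least 4 bytes):
func classifyMagic(b []byte) string {
	switch m := binary.LittleEndian.Uint32(b); {
	case m == frameMagic:
		return "lz4 frame"
	case m == frameMagicLegacy:
		return "legacy lz4 frame"
	case m>>8 == frameSkipMagic>>8: // all 16 values 0x184D2A50..0x184D2A5F are skippable
		return "skippable frame"
	default:
		return "not an lz4 stream"
	}
}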
func NewFrame() *Frame {
return &Frame{}
}
type Frame struct {
buf [15]byte // frame descriptor needs at most 4(magic)+4+8+1=11 bytes
Magic uint32
Descriptor FrameDescriptor
Blocks Blocks
Checksum uint32
checksum xxh32.XXHZero
}
// Reset allows reusing the Frame.
// The Descriptor configuration is not modified.
func (f *Frame) Reset(num int) {
f.Magic = 0
f.Descriptor.Checksum = 0
f.Descriptor.ContentSize = 0
_ = f.Blocks.close(f, num)
f.Checksum = 0
}
func (f *Frame) InitW(dst io.Writer, num int, legacy bool) {
if legacy {
f.Magic = frameMagicLegacy
idx := lz4block.Index(lz4block.Block8Mb)
f.Descriptor.Flags.BlockSizeIndexSet(idx)
} else {
f.Magic = frameMagic
f.Descriptor.initW()
}
f.Blocks.initW(f, dst, num)
f.checksum.Reset()
}
func (f *Frame) CloseW(dst io.Writer, num int) error {
if err := f.Blocks.close(f, num); err != nil {
return err
}
if f.isLegacy() {
return nil
}
buf := f.buf[:0]
// End mark (data block size of uint32(0)).
buf = append(buf, 0, 0, 0, 0)
if f.Descriptor.Flags.ContentChecksum() {
buf = f.checksum.Sum(buf)
}
_, err := dst.Write(buf)
return err
}
func (f *Frame) isLegacy() bool {
return f.Magic == frameMagicLegacy
}
func (f *Frame) InitR(src io.Reader, num int) (chan []byte, error) {
if f.Magic > 0 {
// Header already read.
return nil, nil
}
newFrame:
var err error
if f.Magic, err = f.readUint32(src); err != nil {
return nil, err
}
switch m := f.Magic; {
case m == frameMagic || m == frameMagicLegacy:
// All 16 values of frameSkipMagic are valid.
case m>>8 == frameSkipMagic>>8:
skip, err := f.readUint32(src)
if err != nil {
return nil, err
}
if _, err := io.CopyN(ioutil.Discard, src, int64(skip)); err != nil {
return nil, err
}
goto newFrame
default:
return nil, lz4errors.ErrInvalidFrame
}
if err := f.Descriptor.initR(f, src); err != nil {
return nil, err
}
f.checksum.Reset()
return f.Blocks.initR(f, num, src)
}
func (f *Frame) CloseR(src io.Reader) (err error) {
if f.isLegacy() {
return nil
}
if !f.Descriptor.Flags.ContentChecksum() {
return nil
}
if f.Checksum, err = f.readUint32(src); err != nil {
return err
}
if c := f.checksum.Sum32(); c != f.Checksum {
return fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidFrameChecksum, c, f.Checksum)
}
return nil
}
type FrameDescriptor struct {
Flags DescriptorFlags
ContentSize uint64
Checksum uint8
}
func (fd *FrameDescriptor) initW() {
fd.Flags.VersionSet(1)
fd.Flags.BlockIndependenceSet(true)
}
func (fd *FrameDescriptor) Write(f *Frame, dst io.Writer) error {
if fd.Checksum > 0 {
// Header already written.
return nil
}
buf := f.buf[:4]
// Write the magic number here even though it belongs to the Frame.
binary.LittleEndian.PutUint32(buf, f.Magic)
if !f.isLegacy() {
buf = buf[:4+2]
binary.LittleEndian.PutUint16(buf[4:], uint16(fd.Flags))
if fd.Flags.Size() {
buf = buf[:4+2+8]
binary.LittleEndian.PutUint64(buf[4+2:], fd.ContentSize)
}
fd.Checksum = descriptorChecksum(buf[4:])
buf = append(buf, fd.Checksum)
}
_, err := dst.Write(buf)
return err
}
func (fd *FrameDescriptor) initR(f *Frame, src io.Reader) error {
if f.isLegacy() {
idx := lz4block.Index(lz4block.Block8Mb)
f.Descriptor.Flags.BlockSizeIndexSet(idx)
return nil
}
// Read the flags and the checksum, hoping that there is no content size.
buf := f.buf[:3]
if _, err := io.ReadFull(src, buf); err != nil {
return err
}
descr := binary.LittleEndian.Uint16(buf)
fd.Flags = DescriptorFlags(descr)
if fd.Flags.Size() {
// Append the 8 missing bytes.
buf = buf[:3+8]
if _, err := io.ReadFull(src, buf[3:]); err != nil {
return err
}
fd.ContentSize = binary.LittleEndian.Uint64(buf[2:])
}
fd.Checksum = buf[len(buf)-1] // the checksum is the last byte
buf = buf[:len(buf)-1] // all descriptor fields except checksum
if c := descriptorChecksum(buf); fd.Checksum != c {
return fmt.Errorf("%w: got %x; expected %x", lz4errors.ErrInvalidHeaderChecksum, c, fd.Checksum)
}
// Validate the elements that can be.
if idx := fd.Flags.BlockSizeIndex(); !idx.IsValid() {
return lz4errors.ErrOptionInvalidBlockSize
}
return nil
}
func descriptorChecksum(buf []byte) byte {
return byte(xxh32.ChecksumZero(buf) >> 8)
}


@@ -0,0 +1,103 @@
// Code generated by `gen.exe`. DO NOT EDIT.
package lz4stream
import "github.com/pierrec/lz4/v4/internal/lz4block"
// DescriptorFlags is defined as follows:
// field bits
// ----- ----
// _ 2
// ContentChecksum 1
// Size 1
// BlockChecksum 1
// BlockIndependence 1
// Version 2
// _ 4
// BlockSizeIndex 3
// _ 1
type DescriptorFlags uint16
// Getters.
func (x DescriptorFlags) ContentChecksum() bool { return x>>2&1 != 0 }
func (x DescriptorFlags) Size() bool { return x>>3&1 != 0 }
func (x DescriptorFlags) BlockChecksum() bool { return x>>4&1 != 0 }
func (x DescriptorFlags) BlockIndependence() bool { return x>>5&1 != 0 }
func (x DescriptorFlags) Version() uint16 { return uint16(x >> 6 & 0x3) }
func (x DescriptorFlags) BlockSizeIndex() lz4block.BlockSizeIndex {
return lz4block.BlockSizeIndex(x >> 12 & 0x7)
}
// Setters.
func (x *DescriptorFlags) ContentChecksumSet(v bool) *DescriptorFlags {
const b = 1 << 2
if v {
*x = *x&^b | b
} else {
*x &^= b
}
return x
}
func (x *DescriptorFlags) SizeSet(v bool) *DescriptorFlags {
const b = 1 << 3
if v {
*x = *x&^b | b
} else {
*x &^= b
}
return x
}
func (x *DescriptorFlags) BlockChecksumSet(v bool) *DescriptorFlags {
const b = 1 << 4
if v {
*x = *x&^b | b
} else {
*x &^= b
}
return x
}
func (x *DescriptorFlags) BlockIndependenceSet(v bool) *DescriptorFlags {
const b = 1 << 5
if v {
*x = *x&^b | b
} else {
*x &^= b
}
return x
}
func (x *DescriptorFlags) VersionSet(v uint16) *DescriptorFlags {
*x = *x&^(0x3<<6) | (DescriptorFlags(v) & 0x3 << 6)
return x
}
func (x *DescriptorFlags) BlockSizeIndexSet(v lz4block.BlockSizeIndex) *DescriptorFlags {
*x = *x&^(0x7<<12) | (DescriptorFlags(v) & 0x7 << 12)
return x
}
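// A sketch of how these generated accessors compose; the resulting value
// follows directly from the bit layout documented above (the function is
// illustrative only):
func exampleDescriptorFlags() DescriptorFlags {
	var f DescriptorFlags
	f.VersionSet(1)                                        // bits 6-7
	f.BlockIndependenceSet(true)                           // bit 5
	f.BlockSizeIndexSet(lz4block.Index(lz4block.Block4Mb)) // bits 12-14 (index 7)
	// f is now 0x7060: version 1, independent blocks, 4 MB maximum block size.
	return f
}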
// Code generated by `gen.exe`. DO NOT EDIT.
// DataBlockSize is defined as follows:
// field bits
// ----- ----
// size 31
// Uncompressed 1
type DataBlockSize uint32
// Getters.
func (x DataBlockSize) size() int { return int(x & 0x7FFFFFFF) }
func (x DataBlockSize) Uncompressed() bool { return x>>31&1 != 0 }
// Setters.
func (x *DataBlockSize) sizeSet(v int) *DataBlockSize {
*x = *x&^0x7FFFFFFF | DataBlockSize(v)&0x7FFFFFFF
return x
}
func (x *DataBlockSize) UncompressedSet(v bool) *DataBlockSize {
const b = 1 << 31
if v {
*x = *x&^b | b
} else {
*x &^= b
}
return x
}


@@ -0,0 +1,212 @@
// Package xxh32 implements the very fast XXH hashing algorithm (32 bits version).
// (https://github.com/Cyan4973/XXH/)
package xxh32
import (
"encoding/binary"
)
const (
prime1 uint32 = 2654435761
prime2 uint32 = 2246822519
prime3 uint32 = 3266489917
prime4 uint32 = 668265263
prime5 uint32 = 374761393
primeMask = 0xFFFFFFFF
prime1plus2 = uint32((uint64(prime1) + uint64(prime2)) & primeMask) // 606290984
prime1minus = uint32((-int64(prime1)) & primeMask) // 1640531535
)
// XXHZero represents an xxhash32 object with seed 0.
type XXHZero struct {
v [4]uint32
totalLen uint64
buf [16]byte
bufused int
}
// Sum appends the current hash to b and returns the resulting slice.
// It does not change the underlying hash state.
func (xxh XXHZero) Sum(b []byte) []byte {
h32 := xxh.Sum32()
return append(b, byte(h32), byte(h32>>8), byte(h32>>16), byte(h32>>24))
}
// Reset resets the Hash to its initial state.
func (xxh *XXHZero) Reset() {
xxh.v[0] = prime1plus2
xxh.v[1] = prime2
xxh.v[2] = 0
xxh.v[3] = prime1minus
xxh.totalLen = 0
xxh.bufused = 0
}
// Size returns the number of bytes returned by Sum().
func (xxh *XXHZero) Size() int {
return 4
}
// BlockSize gives the minimum number of bytes accepted by Write().
func (xxh *XXHZero) BlockSize() int {
return 1
}
// Write adds input bytes to the Hash.
// It never returns an error.
func (xxh *XXHZero) Write(input []byte) (int, error) {
if xxh.totalLen == 0 {
xxh.Reset()
}
n := len(input)
m := xxh.bufused
xxh.totalLen += uint64(n)
r := len(xxh.buf) - m
if n < r {
copy(xxh.buf[m:], input)
xxh.bufused += len(input)
return n, nil
}
var buf *[16]byte
if m != 0 {
// some data left from previous update
buf = &xxh.buf
c := copy(buf[m:], input)
n -= c
input = input[c:]
}
update(&xxh.v, buf, input)
xxh.bufused = copy(xxh.buf[:], input[n-n%16:])
return n, nil
}
// Portable version of update. This updates v by processing all of buf
// (if not nil) and all full 16-byte blocks of input.
func updateGo(v *[4]uint32, buf *[16]byte, input []byte) {
// Causes compiler to work directly from registers instead of stack:
v1, v2, v3, v4 := v[0], v[1], v[2], v[3]
if buf != nil {
v1 = rol13(v1+binary.LittleEndian.Uint32(buf[:])*prime2) * prime1
v2 = rol13(v2+binary.LittleEndian.Uint32(buf[4:])*prime2) * prime1
v3 = rol13(v3+binary.LittleEndian.Uint32(buf[8:])*prime2) * prime1
v4 = rol13(v4+binary.LittleEndian.Uint32(buf[12:])*prime2) * prime1
}
for ; len(input) >= 16; input = input[16:] {
sub := input[:16] //BCE hint for compiler
v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
}
v[0], v[1], v[2], v[3] = v1, v2, v3, v4
}
// Sum32 returns the 32-bit hash value.
func (xxh *XXHZero) Sum32() uint32 {
h32 := uint32(xxh.totalLen)
if h32 >= 16 {
h32 += rol1(xxh.v[0]) + rol7(xxh.v[1]) + rol12(xxh.v[2]) + rol18(xxh.v[3])
} else {
h32 += prime5
}
p := 0
n := xxh.bufused
buf := xxh.buf
for n := n - 4; p <= n; p += 4 {
h32 += binary.LittleEndian.Uint32(buf[p:p+4]) * prime3
h32 = rol17(h32) * prime4
}
for ; p < n; p++ {
h32 += uint32(buf[p]) * prime5
h32 = rol11(h32) * prime1
}
h32 ^= h32 >> 15
h32 *= prime2
h32 ^= h32 >> 13
h32 *= prime3
h32 ^= h32 >> 16
return h32
}
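// The streaming interface (Write then Sum32) and the one-shot ChecksumZero
// declared in this package's assembly/portable variants produce the same
// value. A sketch, as if written inside this package:
func checksumBothWays(data []byte) (oneShot, streamed uint32) {
	oneShot = ChecksumZero(data)
	var h XXHZero
	_, _ = h.Write(data[:len(data)/2]) // Write resets the state on first use
	_, _ = h.Write(data[len(data)/2:])
	streamed = h.Sum32()
	return // both results are identical for any input
}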
// Portable version of ChecksumZero.
func checksumZeroGo(input []byte) uint32 {
n := len(input)
h32 := uint32(n)
if n < 16 {
h32 += prime5
} else {
v1 := prime1plus2
v2 := prime2
v3 := uint32(0)
v4 := prime1minus
p := 0
for n := n - 16; p <= n; p += 16 {
sub := input[p:][:16] //BCE hint for compiler
v1 = rol13(v1+binary.LittleEndian.Uint32(sub[:])*prime2) * prime1
v2 = rol13(v2+binary.LittleEndian.Uint32(sub[4:])*prime2) * prime1
v3 = rol13(v3+binary.LittleEndian.Uint32(sub[8:])*prime2) * prime1
v4 = rol13(v4+binary.LittleEndian.Uint32(sub[12:])*prime2) * prime1
}
input = input[p:]
n -= p
h32 += rol1(v1) + rol7(v2) + rol12(v3) + rol18(v4)
}
p := 0
for n := n - 4; p <= n; p += 4 {
h32 += binary.LittleEndian.Uint32(input[p:p+4]) * prime3
h32 = rol17(h32) * prime4
}
for p < n {
h32 += uint32(input[p]) * prime5
h32 = rol11(h32) * prime1
p++
}
h32 ^= h32 >> 15
h32 *= prime2
h32 ^= h32 >> 13
h32 *= prime3
h32 ^= h32 >> 16
return h32
}
func rol1(u uint32) uint32 {
return u<<1 | u>>31
}
func rol7(u uint32) uint32 {
return u<<7 | u>>25
}
func rol11(u uint32) uint32 {
return u<<11 | u>>21
}
func rol12(u uint32) uint32 {
return u<<12 | u>>20
}
func rol13(u uint32) uint32 {
return u<<13 | u>>19
}
func rol17(u uint32) uint32 {
return u<<17 | u>>15
}
func rol18(u uint32) uint32 {
return u<<18 | u>>14
}


@@ -0,0 +1,11 @@
// +build !noasm
package xxh32
// ChecksumZero returns the 32-bit hash of input.
//
//go:noescape
func ChecksumZero(input []byte) uint32
//go:noescape
func update(v *[4]uint32, buf *[16]byte, input []byte)


@@ -0,0 +1,259 @@
// +build !noasm
#include "textflag.h"
#define prime1 $2654435761
#define prime2 $2246822519
#define prime3 $3266489917
#define prime4 $668265263
#define prime5 $374761393
#define prime1plus2 $606290984
#define prime1minus $1640531535
// Register allocation.
#define p R0
#define n R1
#define h R2
#define v1 R2 // Alias for h.
#define v2 R3
#define v3 R4
#define v4 R5
#define x1 R6
#define x2 R7
#define x3 R8
#define x4 R9
// We need the primes in registers. The 16-byte loop only uses prime{1,2}.
#define prime1r R11
#define prime2r R12
#define prime3r R3 // The rest can alias v{2-4}.
#define prime4r R4
#define prime5r R5
// Update round macros. These read from and increment p.
#define round16aligned \
MOVM.IA.W (p), [x1, x2, x3, x4] \
\
MULA x1, prime2r, v1, v1 \
MULA x2, prime2r, v2, v2 \
MULA x3, prime2r, v3, v3 \
MULA x4, prime2r, v4, v4 \
\
MOVW v1 @> 19, v1 \
MOVW v2 @> 19, v2 \
MOVW v3 @> 19, v3 \
MOVW v4 @> 19, v4 \
\
MUL prime1r, v1 \
MUL prime1r, v2 \
MUL prime1r, v3 \
MUL prime1r, v4 \
#define round16unaligned \
MOVBU.P 16(p), x1 \
MOVBU -15(p), x2 \
ORR x2 << 8, x1 \
MOVBU -14(p), x3 \
MOVBU -13(p), x4 \
ORR x4 << 8, x3 \
ORR x3 << 16, x1 \
\
MULA x1, prime2r, v1, v1 \
MOVW v1 @> 19, v1 \
MUL prime1r, v1 \
\
MOVBU -12(p), x1 \
MOVBU -11(p), x2 \
ORR x2 << 8, x1 \
MOVBU -10(p), x3 \
MOVBU -9(p), x4 \
ORR x4 << 8, x3 \
ORR x3 << 16, x1 \
\
MULA x1, prime2r, v2, v2 \
MOVW v2 @> 19, v2 \
MUL prime1r, v2 \
\
MOVBU -8(p), x1 \
MOVBU -7(p), x2 \
ORR x2 << 8, x1 \
MOVBU -6(p), x3 \
MOVBU -5(p), x4 \
ORR x4 << 8, x3 \
ORR x3 << 16, x1 \
\
MULA x1, prime2r, v3, v3 \
MOVW v3 @> 19, v3 \
MUL prime1r, v3 \
\
MOVBU -4(p), x1 \
MOVBU -3(p), x2 \
ORR x2 << 8, x1 \
MOVBU -2(p), x3 \
MOVBU -1(p), x4 \
ORR x4 << 8, x3 \
ORR x3 << 16, x1 \
\
MULA x1, prime2r, v4, v4 \
MOVW v4 @> 19, v4 \
MUL prime1r, v4 \
// func ChecksumZero([]byte) uint32
TEXT ·ChecksumZero(SB), NOFRAME|NOSPLIT, $-4-16
MOVW input_base+0(FP), p
MOVW input_len+4(FP), n
MOVW prime1, prime1r
MOVW prime2, prime2r
// Set up h for n < 16. It's tempting to say {ADD prime5, n, h}
// here, but that's a pseudo-op that generates a load through R11.
MOVW prime5, prime5r
ADD prime5r, n, h
CMP $0, n
BEQ end
// We let n go negative so we can do comparisons with SUB.S
// instead of separate CMP.
SUB.S $16, n
BMI loop16done
MOVW prime1plus2, v1
MOVW prime2, v2
MOVW $0, v3
MOVW prime1minus, v4
TST $3, p
BNE loop16unaligned
loop16aligned:
SUB.S $16, n
round16aligned
BPL loop16aligned
B loop16finish
loop16unaligned:
SUB.S $16, n
round16unaligned
BPL loop16unaligned
loop16finish:
MOVW v1 @> 31, h
ADD v2 @> 25, h
ADD v3 @> 20, h
ADD v4 @> 14, h
// h += len(input) with v2 as temporary.
MOVW input_len+4(FP), v2
ADD v2, h
loop16done:
ADD $16, n // Restore number of bytes left.
SUB.S $4, n
MOVW prime3, prime3r
BMI loop4done
MOVW prime4, prime4r
TST $3, p
BNE loop4unaligned
loop4aligned:
SUB.S $4, n
MOVW.P 4(p), x1
MULA prime3r, x1, h, h
MOVW h @> 15, h
MUL prime4r, h
BPL loop4aligned
B loop4done
loop4unaligned:
SUB.S $4, n
MOVBU.P 4(p), x1
MOVBU -3(p), x2
ORR x2 << 8, x1
MOVBU -2(p), x3
ORR x3 << 16, x1
MOVBU -1(p), x4
ORR x4 << 24, x1
MULA prime3r, x1, h, h
MOVW h @> 15, h
MUL prime4r, h
BPL loop4unaligned
loop4done:
ADD.S $4, n // Restore number of bytes left.
BEQ end
MOVW prime5, prime5r
loop1:
SUB.S $1, n
MOVBU.P 1(p), x1
MULA prime5r, x1, h, h
MOVW h @> 21, h
MUL prime1r, h
BNE loop1
end:
MOVW prime3, prime3r
EOR h >> 15, h
MUL prime2r, h
EOR h >> 13, h
MUL prime3r, h
EOR h >> 16, h
MOVW h, ret+12(FP)
RET
// func update(v *[4]uint32, buf *[16]byte, p []byte)
TEXT ·update(SB), NOFRAME|NOSPLIT, $-4-20
MOVW v+0(FP), p
MOVM.IA (p), [v1, v2, v3, v4]
MOVW prime1, prime1r
MOVW prime2, prime2r
// Process buf, if not nil.
MOVW buf+4(FP), p
CMP $0, p
BEQ noBuffered
round16aligned
noBuffered:
MOVW input_base +8(FP), p
MOVW input_len +12(FP), n
SUB.S $16, n
BMI end
TST $3, p
BNE loop16unaligned
loop16aligned:
SUB.S $16, n
round16aligned
BPL loop16aligned
B end
loop16unaligned:
SUB.S $16, n
round16unaligned
BPL loop16unaligned
end:
MOVW v+0(FP), p
MOVM.IA [v1, v2, v3, v4], (p)
RET


@@ -0,0 +1,10 @@
// +build !arm noasm
package xxh32
// ChecksumZero returns the 32-bit hash of input.
func ChecksumZero(input []byte) uint32 { return checksumZeroGo(input) }
func update(v *[4]uint32, buf *[16]byte, input []byte) {
updateGo(v, buf, input)
}