// +build gc
// +build !noasm

#include "textflag.h"

// Register allocation.
//
// These aliases are used by decodeBlock below. token and offset share R6
// on purpose: the token's low nibble is moved into len before the offset
// is loaded, so the token value is no longer needed at that point.
#define dst	R0	// Write cursor into the destination buffer.
#define dstorig	R1	// Initial value of dst; used for match bounds and the return value.
#define src	R2	// Read cursor into the source buffer.
#define dstend	R3	// Loaded as dst_len, then turned into dst_base+dst_len.
#define srcend	R4	// Loaded as src_len, then turned into src_base+src_len.
#define match	R5	// Match address.
#define token	R6	// Current token byte (literal length in the high nibble, match length in the low nibble).
#define len	R7	// Literal and match lengths.
#define offset	R6	// Match offset; overlaps with token.
#define tmp1	R8	// Scratch.
#define tmp2	R9	// Scratch.
#define tmp3	R12	// Scratch.

// Added to every decoded match length before the match is copied.
#define minMatch	$4

// func decodeBlock(dst, src []byte) int
//
// decodeBlock decompresses the block in src into dst and returns the number
// of bytes written to dst, or -1 on any error (empty or truncated src,
// dst too small, zero or out-of-range match offset, length overflow).
//
// The input is a sequence of: a token byte (high nibble = literal length,
// low nibble = match length - minMatch, 15 meaning "extension bytes
// follow, each adding 0..255, terminated by a byte != 255"), the literal
// bytes, a 16-bit little-endian match offset, and the optional match length
// extension. The final sequence ends right after its literals. This is the
// layout of an LZ4 block.
//
// Arguments on the stack (32-bit words):
//	dst_base+0(FP)  dst_len+4(FP)   (dst_cap+8(FP) unused)
//	src_base+12(FP) src_len+16(FP)  (src_cap+20(FP) unused)
//	ret+24(FP)
//
// Uses R0-R9 and R12 as described by the register aliases at the top of
// the file; no stack frame is allocated.
//
// All length arithmetic is checked for unsigned wrap-around so that a
// hostile input cannot make a pointer sum wrap past 2^32 and slip through
// the bounds checks.
TEXT ·decodeBlock(SB), NOFRAME|NOSPLIT, $-4-28
	MOVW dst_base +0(FP), dst
	MOVW dst_len  +4(FP), dstend
	MOVW src_base+12(FP), src
	MOVW src_len +16(FP), srcend

	// An empty source is an error: there is not even a token to read.
	CMP $0, srcend
	BEQ shortSrc

	// Turn the two lengths into one-past-the-end pointers.
	ADD dst, dstend
	ADD src, srcend

	MOVW dst, dstorig

loop:
	// Read token. Extract literal length.
	// Invariant: src < srcend here.
	MOVBU.P 1(src), token
	MOVW    token >> 4, len
	CMP     $15, len
	BNE     readLitlenDone

readLitlenLoop:
	CMP     src, srcend
	BEQ     shortSrc
	MOVBU.P 1(src), tmp1
	ADD.S   tmp1, len
	BCS     corrupt	// Literal length wrapped around 2^32.
	CMP     $255, tmp1
	BEQ     readLitlenLoop

readLitlenDone:
	CMP $0, len
	BEQ copyLiteralDone

	// Bounds check dst+len and src+len.
	// The sums are formed with ADD.S so that a carry out (address
	// wrap-around caused by a huge len) is rejected before the pointer
	// comparisons, which would otherwise see a small, in-range value.
	ADD.S  dst, len, tmp1
	BCS    shortDst
	ADD.S  src, len, tmp2
	BCS    shortSrc
	CMP    dstend, tmp1
	//BHI  shortDst	// Uncomment for distinct error codes.
	CMP.LS srcend, tmp2
	BHI    shortSrc

	// Copy literal.
	CMP $4, len
	BLO copyLiteralFinish

	// Copy 0-3 bytes until src is aligned.
	// len >= 4 here, so len stays positive.
	TST        $1, src
	MOVBU.NE.P 1(src), tmp1
	MOVB.NE.P  tmp1, 1(dst)
	SUB.NE     $1, len

	// src is now halfword-aligned, so the halfword load below is safe.
	TST        $2, src
	MOVHU.NE.P 2(src), tmp2
	MOVB.NE.P  tmp2, 1(dst)
	MOVW.NE    tmp2 >> 8, tmp1
	MOVB.NE.P  tmp1, 1(dst)
	SUB.NE     $2, len

	B copyLiteralLoopCond

copyLiteralLoop:
	// Aligned load, unaligned write.
	MOVW.P 4(src), tmp1
	MOVW   tmp1 >>  8, tmp2
	MOVB   tmp2, 1(dst)
	MOVW   tmp1 >> 16, tmp3
	MOVB   tmp3, 2(dst)
	MOVW   tmp1 >> 24, tmp2
	MOVB   tmp2, 3(dst)
	MOVB.P tmp1, 4(dst)
copyLiteralLoopCond:
	// Loop until len-4 < 0.
	SUB.S  $4, len
	BPL    copyLiteralLoop

	// Restore len, which is now negative.
	ADD $4, len

copyLiteralFinish:
	// Copy remaining 0-3 bytes.
	// This point is also reached directly when the whole literal is
	// shorter than four bytes, in which case src has not been aligned.
	// Only byte loads are used so no alignment is assumed.
	TST        $2, len
	MOVBU.NE.P 1(src), tmp1
	MOVB.NE.P  tmp1, 1(dst)
	MOVBU.NE.P 1(src), tmp1
	MOVB.NE.P  tmp1, 1(dst)
	TST        $1, len
	MOVBU.NE.P 1(src), tmp1
	MOVB.NE.P  tmp1, 1(dst)

copyLiteralDone:
	// A block legitimately ends right after a literal run.
	CMP src, srcend
	BEQ end

	// Initial part of match length.
	// This frees up the token register for reuse as offset.
	AND $15, token, len

	// Read offset (16-bit little-endian). A zero offset is invalid.
	ADD   $2, src
	CMP   srcend, src
	BHI   shortSrc
	MOVBU -2(src), offset
	MOVBU -1(src), tmp1
	ORR   tmp1 << 8, offset
	CMP   $0, offset
	BEQ   corrupt

	// Read rest of match length.
	CMP $15, len
	BNE readMatchlenDone

readMatchlenLoop:
	CMP     src, srcend
	BEQ     shortSrc
	MOVBU.P 1(src), tmp1
	ADD.S   tmp1, len
	BCS     corrupt	// Match length wrapped around 2^32.
	CMP     $255, tmp1
	BEQ     readMatchlenLoop

readMatchlenDone:
	ADD.S minMatch, len
	BCS   corrupt

	// Bounds check dst+len, rejecting address wrap-around first.
	ADD.S dst, len, tmp1
	BCS   shortDst
	CMP   dstend, tmp1
	BHI   shortDst

	// The match must start inside what has already been written:
	// offset <= dst-dstorig. Comparing distances rather than the
	// pointer dst-offset avoids relying on that subtraction not
	// wrapping below zero.
	SUB dstorig, dst, tmp2
	CMP offset, tmp2
	BLO corrupt
	SUB offset, dst, match

	// If the offset is at least four (len is, because of minMatch),
	// do a four-way unrolled byte copy loop. Using a word load (MOVW)
	// instead of four byte loads is much faster, but to remain portable
	// we'd have to align match first, which in turn is too expensive.
	CMP $4, offset
	BLO copyMatch

	SUB $4, len
copyMatch4:
	MOVBU.P 4(match), tmp1
	MOVB.P  tmp1, 4(dst)
	MOVBU   -3(match), tmp2
	MOVB    tmp2, -3(dst)
	MOVBU   -2(match), tmp3
	MOVB    tmp3, -2(dst)
	MOVBU   -1(match), tmp1
	MOVB    tmp1, -1(dst)
	SUB.S   $4, len
	BPL     copyMatch4

	// Restore len, which is now negative.
	ADD.S $4, len
	BEQ   copyMatchDone

copyMatch:
	// Simple byte-at-a-time copy.
	// The loads and stores do not touch the flags, so BNE still sees
	// the result of SUB.S.
	SUB.S   $1, len
	MOVBU.P 1(match), tmp2
	MOVB.P  tmp2, 1(dst)
	BNE     copyMatch

copyMatchDone:
	CMP src, srcend
	BNE loop

end:
	// Return the number of bytes written.
	SUB  dstorig, dst, tmp1
	MOVW tmp1, ret+24(FP)
	RET

	// The three error cases have distinct labels so we can put different
	// return codes here when debugging, or if the error returns need to
	// be changed.
shortDst:
shortSrc:
corrupt:
	MOVW $-1, tmp1
	MOVW tmp1, ret+24(FP)
	RET