Macaron 1.5 (#12596)

* update macaron to v1.5 of fork * update macaron to v1.5 of fork * test gzip PR * add push method impl to context_tests * use proper gzip commit Co-authored-by: zeripath <art27@cantab.net> Co-authored-by: Lunny Xiao <xiaolunwen@gmail.com>
2025-12-07 13:28:25 +00:00 · 2020-08-27 22:47:17 -04:00
parent 211321fb93
commit c5d5d63c9c
53 changed files with 2622 additions and 665 deletions
@@ -11,10 +11,10 @@ require (
 	gitea.com/macaron/captcha v0.0.0-20190822015246-daa973478bae
 	gitea.com/macaron/cors v0.0.0-20190826180238-95aec09ea8b4
 	gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439
-	gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae
+	gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5
 	gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223
 	gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a
-	gitea.com/macaron/macaron v1.4.0
+	gitea.com/macaron/macaron v1.5.0
 	gitea.com/macaron/session v0.0.0-20191207215012-613cebf0674d
 	gitea.com/macaron/toolbox v0.0.0-20190822013122-05ff0fc766b7
 	github.com/BurntSushi/toml v0.3.1
@@ -61,7 +61,7 @@ require (
 	github.com/jmhodges/levigo v1.0.0 // indirect
 	github.com/kballard/go-shellquote v0.0.0-20170619183022-cd60e84ee657
 	github.com/keybase/go-crypto v0.0.0-20200123153347-de78d2cb44f4
-	github.com/klauspost/compress v1.10.2
+	github.com/klauspost/compress v1.10.11
 	github.com/lafriks/xormstore v1.3.2
 	github.com/lib/pq v1.7.0
 	github.com/lunny/dingtalk_webhook v0.0.0-20171025031554-e3534c89ef96
@@ -104,7 +104,7 @@ require (
 	github.com/yuin/goldmark v1.2.1
 	github.com/yuin/goldmark-highlighting v0.0.0-20200307114337-60d527fdb691
 	github.com/yuin/goldmark-meta v0.0.0-20191126180153-f0638e958b60
-	golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de
+	golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a
 	golang.org/x/net v0.0.0-20200707034311-ab3426394381
 	golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d
 	golang.org/x/sys v0.0.0-20200625212154-ddb9806d33ae
@@ -115,7 +115,7 @@ require (
 	gopkg.in/alexcesaro/quotedprintable.v3 v3.0.0-20150716171945-2caba252f4dc // indirect
 	gopkg.in/asn1-ber.v1 v1.0.0-20150924051756-4e86f4367175 // indirect
 	gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
-	gopkg.in/ini.v1 v1.57.0
+	gopkg.in/ini.v1 v1.60.2
 	gopkg.in/ldap.v3 v3.0.2
 	gopkg.in/yaml.v2 v2.3.0
 	mvdan.cc/xurls/v2 v2.1.0
@@ -23,8 +23,8 @@ gitea.com/macaron/cors v0.0.0-20190826180238-95aec09ea8b4 h1:e2rAFDejB0qN8OrY4xP
 gitea.com/macaron/cors v0.0.0-20190826180238-95aec09ea8b4/go.mod h1:rtOK4J20kpMD9XcNsnO5YA843YSTe/MUMbDj/TJ/Q7A=
 gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439 h1:88c34YM29a1GlWLrLBaG/GTT2htDdJz1u3n9+lmPolg=
 gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439/go.mod h1:IsQPHx73HnnqFBYiVHjg87q4XBZyGXXu77xANukvZuk=
-gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae h1:OXxYwGmGNfYrC0/sUUL9KSvr2Sfvmzwgd2YD65vIjGE=
+gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5 h1:6rbhThlqfOb+sSmhrsVFz3bZoAeoloe7TZqyeiPbbWI=
-gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae/go.mod h1:jGHtoovArcQj+sw7NJxyPgjuRxOSG9a/oFu3VkLRTKQ=
+gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5/go.mod h1:z8vCjuhqDfvzPUJDowGqbsgoeYBvDbl95S5k6y43Pxo=
 gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223 h1:iZWwQif/LHMjBgfY/ua8CFVa4XMDfbbs7EZ0Q1dYguU=
 gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223/go.mod h1:+qsc10s4hBsHKU/9luGGumFh4m5FFVc7uih+8/mM1NY=
 gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
@@ -33,8 +33,8 @@ gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a/go.mod h1:h6E4kLao1Y
 gitea.com/macaron/macaron v1.3.3-0.20190803174002-53e005ff4827/go.mod h1:/rvxMjIkOq4BM8uPUb+VHuU02ZfAO6R4+wD//tiCiRw=
 gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
 gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
-gitea.com/macaron/macaron v1.4.0 h1:FY1QDGqyuUzs21K6ChkbYbRUfwL7v2aUrhNEJ0IgsAw=
+gitea.com/macaron/macaron v1.5.0 h1:TvWEcHw1/zaHlo0GTuKEukLh3A99+QsU2mjBrXLXjVQ=
-gitea.com/macaron/macaron v1.4.0/go.mod h1:P7hfDbQjcW22lkYkXlxdRIfWOXxH2+K4EogN4Q0UlLY=
+gitea.com/macaron/macaron v1.5.0/go.mod h1:P7hfDbQjcW22lkYkXlxdRIfWOXxH2+K4EogN4Q0UlLY=
 gitea.com/macaron/session v0.0.0-20190821211443-122c47c5f705 h1:mvkQGAlON1Z6Y8pqa/+FpYIskk54mazuECUfZK5oTg0=
 gitea.com/macaron/session v0.0.0-20190821211443-122c47c5f705/go.mod h1:1ujH0jD6Ca4iK9NL0Q2a7fG2chvXx5hVa7hBfABwpkA=
 gitea.com/macaron/session v0.0.0-20191207215012-613cebf0674d h1:XLww3CvnFZkXVwauN67fniDaIpIqsE+9KVcxlZKlvLU=
@@ -489,8 +489,8 @@ github.com/kisielk/errcheck v1.1.0/go.mod h1:EZBBE59ingxPouuu3KfxchcWSUPOHkagtvW
 github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck=
 github.com/klauspost/compress v1.4.1/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
 github.com/klauspost/compress v1.9.2/go.mod h1:RyIbtBH6LamlWaDj8nUwkbUhJ87Yi3uG0guNDohfE1A=
-github.com/klauspost/compress v1.10.2 h1:Znfn6hXZAHaLPNnlqUYRrBSReFHYybslgv4PTiyz6P0=
+github.com/klauspost/compress v1.10.11 h1:K9z59aO18Aywg2b/WSgBaUX99mHy2BES18Cr5lBKZHk=
-github.com/klauspost/compress v1.10.2/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
+github.com/klauspost/compress v1.10.11/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
 github.com/klauspost/cpuid v1.2.0/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
 github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs=
 github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
@@ -819,8 +819,8 @@ golang.org/x/crypto v0.0.0-20200302210943-78000ba7a073/go.mod h1:LzIPMQfyMNhhGPh
 golang.org/x/crypto v0.0.0-20200323165209-0ec3e9974c59/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/crypto v0.0.0-20200709230013-948cd5f35899/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
-golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de h1:ikNHVSjEfnvz6sxdSPCaPt572qowuyMDMJLLm3Db3ig=
+golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a h1:vclmkQCjlDX5OydZ9wv8rBCcS0QyQY66Mpf/7BZbInM=
-golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
+golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA=
 golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8=
 golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js=
@@ -1014,6 +1014,10 @@ gopkg.in/ini.v1 v1.44.2/go.mod h1:M3Cogqpuv0QCi3ExAY5V4uOt4qb/R3xZubo9m8lK5wg=
 gopkg.in/ini.v1 v1.46.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ini.v1 v1.57.0 h1:9unxIsFcTt4I55uWluz+UmL95q4kdJ0buvQ1ZIqVQww=
 gopkg.in/ini.v1 v1.57.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ini.v1 v1.60.1 h1:P5y5shSkb0CFe44qEeMBgn8JLow09MP17jlJHanke5g=
 gopkg.in/ini.v1 v1.60.1/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ini.v1 v1.60.2 h1:7i8mqModL63zqi8nQn8Q3+0zvSCZy1AxhBgthKfi4WU=
 gopkg.in/ini.v1 v1.60.2/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ldap.v3 v3.0.2 h1:R6RBtabK6e1GO0eQKtkyOFbAHO73QesLzI2w2DZ6b9w=
 gopkg.in/ldap.v3 v3.0.2/go.mod h1:oxD7NyBuxchC+SgJDE1Q5Od05eGt29SDQVBmV+HYbzw=
 gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo=
@@ -117,6 +117,10 @@ func (rw *mockResponseWriter) Before(b macaron.BeforeFunc) {
 	b(rw)
 }
 // Push is a stub with the http.Pusher method signature. It ignores both
 // arguments and always returns nil.
 func (rw *mockResponseWriter) Push(target string, opts *http.PushOptions) error {
 	return nil
 }
 // mockRender embeds an http.ResponseWriter, so the embedded writer's methods
 // are promoted onto it.
 type mockRender struct {
 	http.ResponseWriter
 }
@@ -1,9 +1,9 @@
 kind: pipeline
-name: go1-1-2
+name: go1-14
 steps:
 - name: test
-  image: golang:1.12
+  image: golang:1.14
  environment:
    GOPROXY: https://goproxy.cn
  commands:
@@ -12,11 +12,11 @@ steps:
 ---
 kind: pipeline
-name: go1-1-3
+name: go1-15
 steps:
 - name: test
-  image: golang:1.13
+  image: golang:1.15
  environment:
    GOPROXY: https://goproxy.cn
  commands:
@@ -3,7 +3,9 @@ module gitea.com/macaron/gzip
 go 1.12
 require (
-	gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb
+	gitea.com/macaron/macaron v1.5.0
 	github.com/klauspost/compress v1.9.2
 	github.com/stretchr/testify v1.4.0
 	golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a // indirect
 	gopkg.in/ini.v1 v1.60.1 // indirect
 )
@@ -1,7 +1,7 @@
-gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591 h1:UbCTjPcLrNxR9LzKDjQBMT2zoxZuEnca1pZCpgeMuhQ=
+gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a h1:aOKEXkDTnh4euoH0so/THLXeHtQuqHmDPb1xEk6Ehok=
-gitea.com/macaron/inject v0.0.0-20190803172902-8375ba841591/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
+gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a/go.mod h1:h6E4kLao1Yko6DOU6QDnQPcuoNzvbZqzj2mtPcEn1aM=
-gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb h1:amL0md6orTj1tXY16ANzVU9FmzQB+W7aJwp8pVDbrmA=
+gitea.com/macaron/macaron v1.5.0 h1:TvWEcHw1/zaHlo0GTuKEukLh3A99+QsU2mjBrXLXjVQ=
-gitea.com/macaron/macaron v1.3.3-0.20190821202302-9646c0587edb/go.mod h1:0coI+mSPSwbsyAbOuFllVS38awuk9mevhLD52l50Gjs=
+gitea.com/macaron/macaron v1.5.0/go.mod h1:P7hfDbQjcW22lkYkXlxdRIfWOXxH2+K4EogN4Q0UlLY=
 github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
@@ -17,17 +17,21 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
 github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
 github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304 h1:Jpy1PXuP99tXNrhbq2BaPz9B+jNAvH1JPQQpG/9GCXY=
 github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
 github.com/smartystreets/assertions v1.0.1 h1:voD4ITNjPL5jjBfgR/r8fPIIBrliWrWHeiJApdr3r4w=
 github.com/smartystreets/assertions v1.0.1/go.mod h1:kHHU4qYBaI3q23Pp3VPrmWhuIUrLW/7eUrw0BU5VaoM=
 github.com/smartystreets/goconvey v0.0.0-20181108003508-044398e4856c/go.mod h1:XDJAKZRPZ1CvBcN2aX5YOUTYGHki24fSF0Iv48Ibg0s=
 github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337 h1:WN9BUFbdyOsSH/XohnWpXOlq9NBD5sGAB2FciQMUEe8=
 github.com/smartystreets/goconvey v0.0.0-20190731233626-505e41936337/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
 github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
-github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6xOX5DbxZEXolK+nBSvmsQwRjM=
+github.com/unknwon/com v1.0.1 h1:3d1LTxD+Lnf3soQiD4Cp/0BRB+Rsa/+RTvz8GMMzIXs=
-github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
+github.com/unknwon/com v1.0.1/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4 h1:HuIa8hRrWRSrqYzx1qI49NNxhdi2PrY7gxVSq1JjLDc=
 golang.org/x/crypto v0.0.0-20190701094942-4def268fd1a4/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
 golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a h1:vclmkQCjlDX5OydZ9wv8rBCcS0QyQY66Mpf/7BZbInM=
 golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
 golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
@@ -38,5 +42,7 @@ gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/ini.v1 v1.44.0 h1:YRJzTUp0kSYWUVFF5XAbDFfyiqwsl0Vb9R8TVP5eRi0=
 gopkg.in/ini.v1 v1.44.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/ini.v1 v1.60.1 h1:P5y5shSkb0CFe44qEeMBgn8JLow09MP17jlJHanke5g=
 gopkg.in/ini.v1 v1.60.1/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
 gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
 gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
@@ -6,6 +6,7 @@ package gzip
 import (
 	"bufio"
 	"errors"
 	"fmt"
 	"io"
 	"net"
@@ -331,6 +332,15 @@ func (proxy *ProxyResponseWriter) Flush() {
 	proxy.internal.Flush()
 }
 // Push implements http.Pusher for HTTP/2 Push purposes. The call is forwarded
 // to proxy.internal when that writer implements http.Pusher; otherwise an
 // error is returned and nothing is pushed.
 func (proxy *ProxyResponseWriter) Push(target string, opts *http.PushOptions) error {
 	pusher, ok := proxy.internal.(http.Pusher)
 	if !ok {
 		return errors.New("the ResponseWriter doesn't support the Pusher interface")
 	}
 	return pusher.Push(target, opts)
 }
 // Hijack implements http.Hijacker. If the underlying ResponseWriter is a
 // Hijacker, its Hijack method is returned. Otherwise an error is returned.
 func (proxy *ProxyResponseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
@@ -3,7 +3,9 @@ name: default
 steps:
 - name: test
-  image: golang:1.12
+  image: golang:1.13
  environment:
    GOPROXY: https://goproxy.cn
  commands:
  - go get -u
  - go build -v
@@ -1,5 +1,9 @@
-Macaron [![Build Status](https://travis-ci.org/go-macaron/macaron.svg?branch=v1)](https://travis-ci.org/go-macaron/macaron)
+# Macaron
-=======================
+
 [![GitHub Workflow Status](https://img.shields.io/github/workflow/status/go-macaron/macaron/Go?logo=github&style=for-the-badge)](https://github.com/go-macaron/macaron/actions?query=workflow%3AGo)
 [![codecov](https://img.shields.io/codecov/c/github/go-macaron/macaron/master?logo=codecov&style=for-the-badge)](https://codecov.io/gh/go-macaron/macaron)
 [![GoDoc](https://img.shields.io/badge/GoDoc-Reference-blue?style=for-the-badge&logo=go)](https://pkg.go.dev/gopkg.in/macaron.v1?tab=doc)
 [![Sourcegraph](https://img.shields.io/badge/view%20on-Sourcegraph-brightgreen.svg?style=for-the-badge&logo=sourcegraph)](https://sourcegraph.com/github.com/go-macaron/macaron)
 ![Macaron Logo](https://raw.githubusercontent.com/go-macaron/macaron/v1/macaronlogo.png)
@@ -193,9 +193,9 @@ func (ctx *Context) parseForm() {
 	contentType := ctx.Req.Header.Get(_CONTENT_TYPE)
 	if (ctx.Req.Method == "POST" || ctx.Req.Method == "PUT") &&
 		len(contentType) > 0 && strings.Contains(contentType, "multipart/form-data") {
-		ctx.Req.ParseMultipartForm(MaxMemory)
+		_ = ctx.Req.ParseMultipartForm(MaxMemory)
 	} else {
-		ctx.Req.ParseForm()
+		_ = ctx.Req.ParseForm()
 	}
 }
@@ -260,6 +260,11 @@ func (ctx *Context) Params(name string) string {
 	return ctx.params[name]
 }
 // AllParams returns all params. The value returned is ctx.params itself, not
 // a copy.
 func (ctx *Context) AllParams() Params {
 	return ctx.params
 }
 // SetParams sets value of param with given name.
 func (ctx *Context) SetParams(name, val string) {
 	if name != "*" && !strings.HasPrefix(name, ":") {
@@ -153,7 +153,7 @@ func Recovery() Handler {
 				res.WriteHeader(http.StatusInternalServerError)
 				if nil != body {
-					res.Write(body)
+					_, _ = res.Write(body)
 				}
 			}
 		}()
@@ -36,7 +36,6 @@ import (
 const (
 	_CONTENT_TYPE    = "Content-Type"
 	_CONTENT_LENGTH  = "Content-Length"
 	_CONTENT_BINARY  = "application/octet-stream"
 	_CONTENT_JSON    = "application/json"
 	_CONTENT_HTML    = "text/html"
@@ -200,7 +199,7 @@ func NewTemplateFileSystem(opt RenderOptions, omitData bool) TplFileSystem {
 	lastDir := dirs[len(dirs)-1]
 	// We still walk the last (original) directory because it makes no sense to load templates that do not exist in the original directory.
-	if err = filepath.Walk(lastDir, func(path string, info os.FileInfo, err error) error {
+	if err = filepath.Walk(lastDir, func(path string, info os.FileInfo, _ error) error {
 		r, err := filepath.Rel(lastDir, path)
 		if err != nil {
 			return err
@@ -458,9 +457,9 @@ func (r *TplRender) JSON(status int, v interface{}) {
 	r.Header().Set(_CONTENT_TYPE, _CONTENT_JSON+r.CompiledCharset)
 	r.WriteHeader(status)
 	if len(r.Opt.PrefixJSON) > 0 {
-		r.Write(r.Opt.PrefixJSON)
+		_, _ = r.Write(r.Opt.PrefixJSON)
 	}
-	r.Write(result)
+	_, _ = r.Write(result)
 }
 func (r *TplRender) JSONString(v interface{}) (string, error) {
@@ -494,9 +493,9 @@ func (r *TplRender) XML(status int, v interface{}) {
 	r.Header().Set(_CONTENT_TYPE, _CONTENT_XML+r.CompiledCharset)
 	r.WriteHeader(status)
 	if len(r.Opt.PrefixXML) > 0 {
-		r.Write(r.Opt.PrefixXML)
+		_, _ = r.Write(r.Opt.PrefixXML)
 	}
-	r.Write(result)
+	_, _ = r.Write(result)
 }
 func (r *TplRender) data(status int, contentType string, v []byte) {
@@ -504,7 +503,7 @@ func (r *TplRender) data(status int, contentType string, v []byte) {
 		r.Header().Set(_CONTENT_TYPE, contentType)
 	}
 	r.WriteHeader(status)
-	r.Write(v)
+	_, _ = r.Write(v)
 }
 func (r *TplRender) RawData(status int, v []byte) {
@@ -612,7 +611,7 @@ func (r *TplRender) HTMLString(name string, data interface{}, htmlOpt ...HTMLOpt
 func (r *TplRender) Error(status int, message ...string) {
 	r.WriteHeader(status)
 	if len(message) > 0 {
-		r.Write([]byte(message[0]))
+		_, _ = r.Write([]byte(message[0]))
 	}
 }
@@ -16,7 +16,7 @@ package macaron
 import (
 	"bufio"
-	"fmt"
+	"errors"
 	"net"
 	"net/http"
 )
@@ -27,6 +27,7 @@ import (
 type ResponseWriter interface {
 	http.ResponseWriter
 	http.Flusher
 	http.Pusher
 	// Status returns the status code of the response or 0 if the response has not been written.
 	Status() int
 	// Written returns whether or not the ResponseWriter has been written.
@@ -91,11 +92,12 @@ func (rw *responseWriter) Before(before BeforeFunc) {
 func (rw *responseWriter) Hijack() (net.Conn, *bufio.ReadWriter, error) {
 	hijacker, ok := rw.ResponseWriter.(http.Hijacker)
 	if !ok {
-		return nil, nil, fmt.Errorf("the ResponseWriter doesn't support the Hijacker interface")
+		return nil, nil, errors.New("the ResponseWriter doesn't support the Hijacker interface")
 	}
 	return hijacker.Hijack()
 }
 // CloseNotify forwards to the CloseNotify method of the wrapped
 // ResponseWriter. The type assertion is unchecked, so this panics when the
 // wrapped writer does not implement http.CloseNotifier.
 //nolint
 func (rw *responseWriter) CloseNotify() <-chan bool {
 	return rw.ResponseWriter.(http.CloseNotifier).CloseNotify()
 }
@@ -112,3 +114,11 @@ func (rw *responseWriter) Flush() {
 		flusher.Flush()
 	}
 }
 // Push implements http.Pusher. The call is forwarded to the wrapped
 // ResponseWriter when it implements http.Pusher; otherwise an error is
 // returned and nothing is pushed.
 func (rw *responseWriter) Push(target string, opts *http.PushOptions) error {
 	pusher, ok := rw.ResponseWriter.(http.Pusher)
 	if !ok {
 		return errors.New("the ResponseWriter doesn't support the Pusher interface")
 	}
 	return pusher.Push(target, opts)
 }
@@ -68,9 +68,9 @@ func defaultReturnHandler() ReturnHandler {
 			respVal = respVal.Elem()
 		}
 		if isByteSlice(respVal) {
-			resp.Write(respVal.Bytes())
+			_, _ = resp.Write(respVal.Bytes())
 		} else {
-			resp.Write([]byte(respVal.String()))
+			_, _ = resp.Write([]byte(respVal.String()))
 		}
 	}
 }
@@ -17,6 +17,7 @@ package macaron
 import (
 	"encoding/base64"
 	"fmt"
 	"log"
 	"net/http"
 	"path"
@@ -148,9 +149,15 @@ func staticHandler(ctx *Context, log *log.Logger, opt StaticOptions) bool {
 	// Try to serve index file
 	if fi.IsDir() {
 		redirPath := path.Clean(ctx.Req.URL.Path)
 		// path.Clean removes the trailing slash, so we need to add it back when
 		// the original path has it.
 		if strings.HasSuffix(ctx.Req.URL.Path, "/") {
 			redirPath = redirPath + "/"
 		}
 		// Redirect if missing trailing slash.
-		if !strings.HasSuffix(ctx.Req.URL.Path, "/") {
+		if !strings.HasSuffix(redirPath, "/") {
-			http.Redirect(ctx.Resp, ctx.Req.Request, ctx.Req.URL.Path+"/", http.StatusFound)
+			http.Redirect(ctx.Resp, ctx.Req.Request, redirPath+"/", http.StatusFound)
 			return true
 		}
@@ -177,8 +184,12 @@ func staticHandler(ctx *Context, log *log.Logger, opt StaticOptions) bool {
 	}
 	if opt.ETag {
-		tag := GenerateETag(string(fi.Size()), fi.Name(), fi.ModTime().UTC().Format(http.TimeFormat))
+		tag := `"` + GenerateETag(fmt.Sprintf("%d", fi.Size()), fi.Name(), fi.ModTime().UTC().Format(http.TimeFormat)) + `"`
 		ctx.Resp.Header().Set("ETag", tag)
 		if ctx.Req.Header.Get("If-None-Match") == tag {
 			ctx.Resp.WriteHeader(http.StatusNotModified)
 			return true
 		}
 	}
 	http.ServeContent(ctx.Resp, ctx.Req.Request, file, fi.ModTime(), f)
@@ -80,9 +80,7 @@ type advancedState struct {
 	// deflate state
 	length         int
 	offset         int
 	hash           uint32
 	maxInsertIndex int
 	ii             uint16 // position of last match, intended to overflow to reset.
 	// Input hash chains
 	// hashHead[hashValue] contains the largest inputIndex with the specified hash value
@@ -97,6 +95,9 @@ type advancedState struct {
 	// input window: unprocessed data is window[index:windowEnd]
 	index     int
 	hashMatch [maxMatchLength + minMatchLength]uint32
 	hash uint32
 	ii   uint16 // position of last match, intended to overflow to reset.
 }
 type compressor struct {
@@ -107,18 +108,19 @@ type compressor struct {
 	// compression algorithm
 	fill func(*compressor, []byte) int // copy data to window
 	step func(*compressor)             // process window
 	sync bool                          // requesting flush
-	window        []byte
+	window     []byte
-	windowEnd     int
+	windowEnd  int
-	blockStart    int  // window index where current tokens start
+	blockStart int // window index where current tokens start
-	byteAvailable bool // if true, still need to process window[index-1].
+	err        error
 	err           error
 	// queued output tokens
 	tokens tokens
 	fast   fastEnc
 	state  *advancedState
 	sync          bool // requesting flush
 	byteAvailable bool // if true, still need to process window[index-1].
 }
 func (d *compressor) fillDeflate(b []byte) int {
@@ -295,10 +295,6 @@ type decompressor struct {
 	r       Reader
 	roffset int64
 	// Input bits, in top of b.
 	b  uint32
 	nb uint
 	// Huffman decoders for literal/length, distance.
 	h1, h2 huffmanDecoder
@@ -309,19 +305,24 @@ type decompressor struct {
 	// Output history, buffer.
 	dict dictDecoder
 	// Temporary buffer (avoids repeated allocation).
 	buf [4]byte
 	// Next step in the decompression,
 	// and decompression state.
 	step      func(*decompressor)
 	stepState int
 	final     bool
 	err       error
 	toRead    []byte
 	hl, hd    *huffmanDecoder
 	copyLen   int
 	copyDist  int
 	// Temporary buffer (avoids repeated allocation).
 	buf [4]byte
 	// Input bits, in top of b.
 	b uint32
 	nb    uint
 	final bool
 }
 func (f *decompressor) nextBlock() {
@@ -6,6 +6,7 @@
 package fse
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
+	if len(in) >= 8 {
-	b.fill()
+		b.fillFastStart()
 	} else {
 		b.fill()
 		b.fill()
 	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@@ -63,8 +68,9 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
+	v := b.in[b.off-4:]
 	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
@@ -77,7 +83,8 @@ func (b *bitReader) fill() {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
 		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -91,9 +98,17 @@ func (b *bitReader) fill() {
 	}
 }
 // fillFastStart() assumes the bitreader is empty and that there are at least
 // 8 bytes to read. It loads the 8 bytes ending at b.off into b.value as a
 // little-endian uint64, resets bitsRead to 0 and moves b.off back by 8.
 func (b *bitReader) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
 	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
 	b.bitsRead = 0
 	b.off -= 8
 }
 // finished returns true if all bits have been read from the bit stream.
 func (b *bitReader) finished() bool {
-	return b.off == 0 && b.bitsRead >= 64
+	return b.bitsRead >= 64 && b.off == 0
 }
 // close the bitstream and returns an error if out-of-buffer reads occurred.
@@ -25,19 +25,10 @@ func (b *byteReader) advance(n uint) {
 	b.off += int(n)
 }
 // Int32 returns a little endian int32 starting at current offset.
 // The receiver is a value and b.off is not modified, so the offset does not
 // advance.
 func (b byteReader) Int32() int32 {
 	// Slice exactly four bytes up front so the indexed reads below are in range.
 	b2 := b.b[b.off : b.off+4 : b.off+4]
 	v3 := int32(b2[3])
 	v2 := int32(b2[2])
 	v1 := int32(b2[1])
 	v0 := int32(b2[0])
 	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
 }
 // Uint32 returns a little endian uint32 starting at current offset.
 func (b byteReader) Uint32() uint32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
@@ -44,18 +44,14 @@ var (
 // Scratch provides temporary storage for compression and decompression.
 type Scratch struct {
 	// Private
-	count          [maxSymbolValue + 1]uint32
+	count    [maxSymbolValue + 1]uint32
-	norm           [maxSymbolValue + 1]int16
+	norm     [maxSymbolValue + 1]int16
-	symbolLen      uint16 // Length of active part of the symbol table.
+	br       byteReader
-	actualTableLog uint8  // Selected tablelog.
+	bits     bitReader
-	br             byteReader
+	bw       bitWriter
-	bits           bitReader
+	ct       cTable      // Compression tables.
-	bw             bitWriter
+	decTable []decSymbol // Decompression table.
-	ct             cTable      // Compression tables.
+	maxCount int         // count of the most probable symbol
 	decTable       []decSymbol // Decompression table.
 	zeroBits       bool        // no bits has prob > 50%.
 	clearCount     bool        // clear count
 	maxCount       int         // count of the most probable symbol
 	// Per block parameters.
 	// These can be used to override compression parameters of the block.
@@ -68,17 +64,22 @@ type Scratch struct {
 	// and allocation will be avoided.
 	Out []byte
 	// MaxSymbolValue will override the maximum symbol value of the next block.
 	MaxSymbolValue uint8
 	// TableLog will attempt to override the tablelog for the next block.
 	TableLog uint8
 	// DecompressLimit limits the maximum decoded size acceptable.
 	// If > 0 decompression will stop when approximately this many bytes
 	// has been decoded.
 	// If 0, maximum size will be 2GB.
 	DecompressLimit int
 	symbolLen      uint16 // Length of active part of the symbol table.
 	actualTableLog uint8  // Selected tablelog.
 	zeroBits       bool   // no bits has prob > 50%.
 	clearCount     bool   // clear count
 	// MaxSymbolValue will override the maximum symbol value of the next block.
 	MaxSymbolValue uint8
 	// TableLog will attempt to override the tablelog for the next block.
 	TableLog uint8
 }
 // Histogram allows to populate the histogram and skip that step in the compression,
@@ -37,13 +37,13 @@ type Writer struct {
 	Header      // written at first call to Write, Flush, or Close
 	w           io.Writer
 	level       int
-	wroteHeader bool
+	err         error
 	compressor  *flate.Writer
 	digest      uint32 // CRC-32, IEEE polynomial (section 8)
 	size        uint32 // Uncompressed size (section 2.3.1)
 	wroteHeader bool
 	closed      bool
 	buf         [10]byte
 	err         error
 }
 // NewWriter returns a new Writer.
@@ -12,8 +12,6 @@ but it can be used as a secondary step to compressors (like Snappy) that does no
 * [Godoc documentation](https://godoc.org/github.com/klauspost/compress/huff0)
 THIS PACKAGE IS NOT CONSIDERED STABLE AND API OR ENCODING MAY CHANGE IN THE FUTURE.
 ## News
 * Mar 2018: First implementation released. Consider this beta software for now.
@@ -75,6 +73,8 @@ which can be given to the decompressor.
 Decompressing is done by calling the [`Decompress1X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress1X) 
 or [`Decompress4X`](https://godoc.org/github.com/klauspost/compress/huff0#Scratch.Decompress4X) function.
 For concurrently decompressing content with a fixed table a stateless [`Decoder`](https://godoc.org/github.com/klauspost/compress/huff0#Decoder) can be requested which will remain correct as long as the scratch is unchanged. The capacity of the provided slice indicates the expected output size.
 You must provide the output from the compression stage, at exactly the size you got back. If you receive an error back
 your input was likely corrupted. 
@@ -6,6 +6,7 @@
 package huff0
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 )
@@ -34,29 +35,16 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
+	if len(in) >= 8 {
-	b.fill()
+		b.fillFastStart()
 	} else {
 		b.fill()
 		b.fill()
 	}
 	b.bitsRead += 8 - uint8(highBit32(uint32(v)))
 	return nil
 }
 // getBits will return n bits. n can be 0.
 // It returns 0 without touching the reader when n is 0 or when bitsRead has
 // already reached 64; otherwise it defers to getBitsFast.
 func (b *bitReader) getBits(n uint8) uint16 {
 	if n == 0 || b.bitsRead >= 64 {
 		return 0
 	}
 	return b.getBitsFast(n)
 }
 // getBitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
 // The returned bits are counted as consumed by adding n to bitsRead.
 func (b *bitReader) getBitsFast(n uint8) uint16 {
 	const regMask = 64 - 1
 	// Shift left to drop the bits already consumed, then shift right so only
 	// the top n bits remain. Both shift counts are masked to the range 0..63.
 	v := uint16((b.value << (b.bitsRead & regMask)) >> ((regMask + 1 - n) & regMask))
 	b.bitsRead += n
 	return v
 }
 // peekBitsFast requires that at least one bit is requested every time.
 // There are no checks if the buffer is filled.
 func (b *bitReader) peekBitsFast(n uint8) uint16 {
@@ -71,21 +59,36 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+
 	// 2 bounds checks.
 	v := b.in[b.off-4 : b.off]
 	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // advance marks n bits as consumed by adding n to bitsRead. b.value is left
 // unchanged.
 func (b *bitReader) advance(n uint8) {
 	b.bitsRead += n
 }
 // fillFastStart() assumes the bitreader is empty and that there are at least
 // 8 bytes to read. It loads the 8 bytes ending at b.off into b.value as a
 // little-endian uint64, resets bitsRead to 0 and moves b.off back by 8.
 func (b *bitReader) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
 	b.value = binary.LittleEndian.Uint64(b.in[b.off-8:])
 	b.bitsRead = 0
 	b.off -= 8
 }
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off > 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
 		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -113,3 +116,214 @@ func (b *bitReader) close() error {
 	}
 	return nil
 }
 // bitReaderBytes reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
 type bitReaderBytes struct {
 	// in is the input; it is consumed from its end towards its start.
 	in       []byte
 	off      uint // next byte to read is at in[off - 1]
 	// value is the bit container; unread bits sit at the most significant end.
 	value    uint64
 	// bitsRead counts the bits of value already consumed; init starts it at 64
 	// (nothing loaded) and the fill methods lower it as bytes are loaded.
 	bitsRead uint8
 }
 // init resets the reader so that it decodes in, starting from its last byte.
 // It returns an error when in is empty or when the last byte is zero, since
 // that byte must carry the end-of-stream marker bit.
 // Note that in and off are already updated when the marker check fails.
 func (b *bitReaderBytes) init(in []byte) error {
 	size := len(in)
 	if size == 0 {
 		return errors.New("corrupt stream: too short")
 	}
 	b.in, b.off = in, uint(size)
 	// The highest bit of the last byte indicates where to start
 	marker := in[size-1]
 	if marker == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
 	// Start from an empty container and load up to 8 bytes.
 	b.value, b.bitsRead = 0, 64
 	if size < 8 {
 		b.fill()
 		b.fill()
 	} else {
 		b.fillFastStart()
 	}
 	// Skip 8 - highBit32(marker) bits; assuming highBit32 yields the index of
 	// the highest set bit, that is the zero padding plus the marker bit itself.
 	b.advance(8 - uint8(highBit32(uint32(marker))))
 	return nil
 }
 // peekByteFast returns the top 8 bits of the container, i.e. the next byte
 // of the stream, without consuming it.
 // There are no checks if the buffer is filled.
 func (b *bitReaderBytes) peekByteFast() uint8 {
 	return uint8(b.value >> 56)
 }
 // advance consumes n bits: it adds n to the consumed-bit count and shifts
 // the container left so the next unread bit is at the top.
 // The shift amount is masked to the range 0-63.
 func (b *bitReaderBytes) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
 }
 // fillFast loads 32 more bits into the container once at least 32 bits
 // have been consumed; otherwise it does nothing.
 // There must be at least 4 bytes available; only the slice bounds check
 // guards against violating this.
 func (b *bitReaderBytes) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
 	// The 4 bytes just below off form one little-endian word.
 	word := binary.LittleEndian.Uint32(b.in[b.off-4 : b.off])
 	// Place the new bits directly beneath the bits that are still unread.
 	b.value |= uint64(word) << (b.bitsRead - 32)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // fillFastStart loads the 8 bytes just below off as one little-endian word,
 // replacing the container contents and resetting the consumed-bit count.
 // It assumes the bitReaderBytes is empty and that at least 8 bytes are left to read.
 func (b *bitReaderBytes) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
 	tail := b.in[b.off-8:]
 	b.value = binary.LittleEndian.Uint64(tail)
 	b.off -= 8
 	b.bitsRead = 0
 }
 // fill tops up the container once at least 32 bits have been consumed.
 // With more than 4 input bytes left it loads 4 bytes at once; otherwise it
 // loads whatever input remains, one byte at a time.
 func (b *bitReaderBytes) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off <= 4 {
 		// Tail of the input: take the remaining bytes one by one.
 		for ; b.off > 0; b.off-- {
 			b.value |= uint64(b.in[b.off-1]) << (b.bitsRead - 8)
 			b.bitsRead -= 8
 		}
 		return
 	}
 	// Bulk path: the 4 bytes just below off form one little-endian word.
 	src := b.in[b.off-4:]
 	word := binary.LittleEndian.Uint32(src[:4])
 	b.value |= uint64(word) << (b.bitsRead - 32)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // finished reports whether the stream is exhausted: no input bytes are left
 // to load and every bit of the container has been consumed.
 func (b *bitReaderBytes) finished() bool {
 	if b.off != 0 {
 		return false
 	}
 	return b.bitsRead >= 64
 }
 // close releases the input and reports io.ErrUnexpectedEOF when more bits
 // were consumed than the container held, i.e. an out-of-buffer read occurred.
 func (b *bitReaderBytes) close() error {
 	overRead := b.bitsRead > 64
 	// Release reference.
 	b.in = nil
 	if overRead {
 		return io.ErrUnexpectedEOF
 	}
 	return nil
 }
 // bitReaderShifted reads a bitstream in reverse.
 // The last set bit indicates the start of the stream and is used
 // for aligning the input.
 type bitReaderShifted struct {
 	// in is the input; it is consumed from its end towards its start.
 	in       []byte
 	off      uint // next byte to read is at in[off - 1]
 	// value is the bit container; it is shifted left as bits are consumed,
 	// so unread bits sit at the most significant end.
 	value    uint64
 	// bitsRead counts the bits of value already consumed; init starts it at 64
 	// (nothing loaded) and the fill methods lower it as bytes are loaded.
 	bitsRead uint8
 }
 // init resets the reader so that it decodes in, starting from its last byte.
 // It returns an error when in is empty or when the last byte is zero, since
 // that byte must carry the end-of-stream marker bit.
 // Note that in and off are already updated when the marker check fails.
 func (b *bitReaderShifted) init(in []byte) error {
 	size := len(in)
 	if size == 0 {
 		return errors.New("corrupt stream: too short")
 	}
 	b.in, b.off = in, uint(size)
 	// The highest bit of the last byte indicates where to start
 	marker := in[size-1]
 	if marker == 0 {
 		return errors.New("corrupt stream, did not find end of stream")
 	}
 	// Start from an empty container and load up to 8 bytes.
 	b.value, b.bitsRead = 0, 64
 	if size < 8 {
 		b.fill()
 		b.fill()
 	} else {
 		b.fillFastStart()
 	}
 	// Skip 8 - highBit32(marker) bits; assuming highBit32 yields the index of
 	// the highest set bit, that is the zero padding plus the marker bit itself.
 	b.advance(8 - uint8(highBit32(uint32(marker))))
 	return nil
 }
 // peekBitsFast returns the top n bits of the container without consuming them.
 // n must be at least 1: for n == 0 the shift amount wraps to 0 and the
 // result is not meaningful.
 // There are no checks if the buffer is filled.
 func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
 	shift := (64 - n) & 63
 	return uint16(b.value >> shift)
 }
 // advance consumes n bits: it adds n to the consumed-bit count and shifts
 // the container left so the next unread bit is at the top.
 // The shift amount is masked to the range 0-63.
 func (b *bitReaderShifted) advance(n uint8) {
 	b.bitsRead += n
 	b.value <<= n & 63
 }
 // fillFast loads 32 more bits into the container once at least 32 bits
 // have been consumed; otherwise it does nothing.
 // There must be at least 4 bytes available; only the slice bounds check
 // guards against violating this.
 func (b *bitReaderShifted) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
 	// The 4 bytes just below off form one little-endian word.
 	word := binary.LittleEndian.Uint32(b.in[b.off-4 : b.off])
 	// Place the new bits directly beneath the bits that are still unread.
 	shift := (b.bitsRead - 32) & 63
 	b.value |= uint64(word) << shift
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // fillFastStart loads the 8 bytes just below off as one little-endian word,
 // replacing the container contents and resetting the consumed-bit count.
 // It assumes the bitReaderShifted is empty and that at least 8 bytes are left to read.
 func (b *bitReaderShifted) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
 	tail := b.in[b.off-8:]
 	b.value = binary.LittleEndian.Uint64(tail)
 	b.off -= 8
 	b.bitsRead = 0
 }
 // fill tops up the container once at least 32 bits have been consumed.
 // With more than 4 input bytes left it loads 4 bytes at once; otherwise it
 // loads whatever input remains, one byte at a time.
 func (b *bitReaderShifted) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off <= 4 {
 		// Tail of the input: take the remaining bytes one by one.
 		for ; b.off > 0; b.off-- {
 			b.value |= uint64(b.in[b.off-1]) << ((b.bitsRead - 8) & 63)
 			b.bitsRead -= 8
 		}
 		return
 	}
 	// Bulk path: the 4 bytes just below off form one little-endian word.
 	src := b.in[b.off-4:]
 	word := binary.LittleEndian.Uint32(src[:4])
 	b.value |= uint64(word) << ((b.bitsRead - 32) & 63)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // finished reports whether the stream is exhausted: every bit of the
 // container has been consumed and no input bytes are left to load.
 func (b *bitReaderShifted) finished() bool {
 	containerEmpty := b.bitsRead >= 64
 	return containerEmpty && b.off == 0
 }
 // close releases the input and reports io.ErrUnexpectedEOF when more bits
 // were consumed than the container held, i.e. an out-of-buffer read occurred.
 func (b *bitReaderShifted) close() error {
 	overRead := b.bitsRead > 64
 	// Release reference.
 	b.in = nil
 	if overRead {
 		return io.ErrUnexpectedEOF
 	}
 	return nil
 }
@@ -43,6 +43,11 @@ func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
 // encSymbol looks up the code for symbol in ct, ORs it into the bit
 // container at the current bit position and adds the code length to the
 // bit count. Nothing is flushed here.
 func (b *bitWriter) encSymbol(ct cTable, symbol byte) {
 	code := ct[symbol]
 	shift := b.nBits & 63
 	b.bitContainer |= uint64(code.val) << shift
 	// Disabled sanity check; the constant false keeps it from ever running.
 	if false && code.nBits == 0 {
 		panic("nbits 0")
 	}
 	b.nBits += code.nBits
 }
@@ -54,6 +59,14 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
 	sh := b.nBits & 63
 	combined := uint64(encA.val) | (uint64(encB.val) << (encA.nBits & 63))
 	b.bitContainer |= combined << sh
 	if false {
 		if encA.nBits == 0 {
 			panic("nbitsA 0")
 		}
 		if encB.nBits == 0 {
 			panic("nbitsB 0")
 		}
 	}
 	b.nBits += encA.nBits + encB.nBits
 }
@@ -77,8 +77,11 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 		// Each symbol present maximum once or too well distributed.
 		return nil, false, ErrIncompressible
 	}
-
+	if s.Reuse == ReusePolicyMust && !canReuse {
-	if s.Reuse == ReusePolicyPrefer && canReuse {
+		// We must reuse, but we can't.
 		return nil, false, ErrIncompressible
 	}
 	if (s.Reuse == ReusePolicyPrefer || s.Reuse == ReusePolicyMust) && canReuse {
 		keepTable := s.cTable
 		keepTL := s.actualTableLog
 		s.cTable = s.prevTable
@@ -90,6 +93,9 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
 			s.OutData = s.Out
 			return s.Out, true, nil
 		}
 		if s.Reuse == ReusePolicyMust {
 			return nil, false, ErrIncompressible
 		}
 		// Do not attempt to re-use later.
 		s.prevTable = s.prevTable[:0]
 	}
@@ -55,6 +55,9 @@ const (
 	// ReusePolicyNone will disable re-use of tables.
 	// This is slightly faster than ReusePolicyAllow but may produce larger output.
 	ReusePolicyNone
 	// ReusePolicyMust must allow reuse and produce smaller output.
 	ReusePolicyMust
 )
 type Scratch struct {
@@ -79,6 +82,13 @@ type Scratch struct {
 	// Slice of the returned data.
 	OutData []byte
 	// MaxDecodedSize will set the maximum allowed output size.
 	// This value will automatically be set to BlockSizeMax if not set.
 	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
 	MaxDecodedSize int
 	br byteReader
 	// MaxSymbolValue will override the maximum symbol value of the next block.
 	MaxSymbolValue uint8
@@ -95,12 +105,6 @@ type Scratch struct {
 	// If WantLogLess == 0 any improvement will do.
 	WantLogLess uint8
 	// MaxDecodedSize will set the maximum allowed output size.
 	// This value will automatically be set to BlockSizeMax if not set.
 	// Decoders will return ErrMaxDecodedSizeExceeded if this limit is exceeded.
 	MaxDecodedSize int
 	br             byteReader
 	symbolLen      uint16 // Length of active part of the symbol table.
 	maxCount       int    // count of the most probable symbol
 	clearCount     bool   // clear count
@@ -5,11 +5,9 @@ It offers a very wide range of compression / speed trade-off, while being backed
 A high performance compression algorithm is implemented. For now focused on speed. 
 This package provides [compression](#Compressor) to and [decompression](#Decompressor) of Zstandard content. 
-Note that custom dictionaries are not supported yet, so if your code relies on that, 
+Note that custom dictionaries are only supported for decompression.
 you cannot use the package as-is.
 This package is pure Go and without use of "unsafe". 
 If a significant speedup can be achieved using "unsafe", it may be added as an option later.
 The `zstd` package is provided as open source software using a Go standard license.
@@ -142,80 +140,96 @@ Using the Encoder for both a stream and individual blocks concurrently is safe.
 I have collected some speed examples to compare speed and compression against other compressors.
 * `file` is the input file.
-* `out` is the compressor used. `zskp` is this package. `gzstd` is gzip standard library. `zstd` is the Datadog cgo library.
+* `out` is the compressor used. `zskp` is this package. `zstd` is the Datadog cgo library. `gzstd/gzkp` is gzip standard and this library.
 * `level` is the compression level used. For `zskp` level 1 is "fastest", level 2 is "default".
 * `insize`/`outsize` is the input/output size.
 * `millis` is the number of milliseconds used for compression.
 * `mb/s` is megabytes (2^20 bytes) per second.
 ```
-The test data for the Large Text Compression Benchmark is the first
+Silesia Corpus:
-10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
+http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 http://mattmahoney.net/dc/textdata.html
-file    out     level   insize  outsize     millis  mb/s
+This package:
-enwik9  zskp    1   1000000000  343833033   5840    163.30
+file    out     level   insize      outsize     millis  mb/s
-enwik9  zskp    2   1000000000  317822183   8449    112.87
+silesia.tar zskp    1   211947520   73101992    643     313.87
-enwik9  gzstd   1   1000000000  382578136   13627   69.98
+silesia.tar zskp    2   211947520   67504318    969     208.38
-enwik9  gzstd   3   1000000000  349139651   22344   42.68
+silesia.tar zskp    3   211947520   65177448    1899    106.44
-enwik9  zstd    1   1000000000  357416379   4838    197.12
+
-enwik9  zstd    3   1000000000  313734522   7556    126.21
+cgo zstd:
 silesia.tar zstd    1   211947520   73605392    543     371.56
 silesia.tar zstd    3   211947520   66793289    864     233.68
 silesia.tar zstd    6   211947520   62916450    1913    105.66
 gzip, stdlib/this package:
 silesia.tar gzstd   1   211947520   80007735    1654    122.21
 silesia.tar gzkp    1   211947520   80369488    1168    173.06
 GOB stream of binary data. Highly compressible.
 https://files.klauspost.com/compress/gob-stream.7z
-file        out level   insize      outsize     millis  mb/s
+file        out     level   insize  outsize     millis  mb/s
-gob-stream  zskp    1   1911399616  234981983   5100    357.42
+gob-stream  zskp    1   1911399616  235022249   3088    590.30
-gob-stream  zskp    2   1911399616  208674003   6698    272.15
+gob-stream  zskp    2   1911399616  205669791   3786    481.34
-gob-stream  gzstd   1   1911399616  357382641   14727   123.78
+gob-stream  zskp    3   1911399616  185792019   9324    195.48
-gob-stream  gzstd   3   1911399616  327835097   17005   107.19
+gob-stream  zstd    1   1911399616  249810424   2637    691.26
-gob-stream  zstd    1   1911399616  250787165   4075    447.22
+gob-stream  zstd    3   1911399616  208192146   3490    522.31
-gob-stream  zstd    3   1911399616  208191888   5511    330.77
+gob-stream  zstd    6   1911399616  193632038   6687    272.56
 gob-stream  gzstd   1   1911399616  357382641   10251   177.82
 gob-stream  gzkp    1   1911399616  362156523   5695    320.08
-Highly compressible JSON file. Similar to logs in a lot of ways.
+The test data for the Large Text Compression Benchmark is the first
-https://files.klauspost.com/compress/adresser.001.gz
+10^9 bytes of the English Wikipedia dump on Mar. 3, 2006.
 http://mattmahoney.net/dc/textdata.html
-file            out level   insize      outsize     millis  mb/s
+file    out level   insize      outsize     millis  mb/s
-adresser.001    zskp    1   1073741824  18510122    1477    692.83
+enwik9  zskp    1   1000000000  343848582   3609    264.18
-adresser.001    zskp    2   1073741824  19831697    1705    600.59
+enwik9  zskp    2   1000000000  317276632   5746    165.97
-adresser.001    gzstd   1   1073741824  47755503    3079    332.47
+enwik9  zskp    3   1000000000  294540704   11725   81.34
-adresser.001    gzstd   3   1073741824  40052381    3051    335.63
+enwik9  zstd    1   1000000000  358072021   3110    306.65
-adresser.001    zstd    1   1073741824  16135896    994     1030.18
+enwik9  zstd    3   1000000000  313734672   4784    199.35
-adresser.001    zstd    3   1073741824  17794465    905     1131.49
+enwik9  zstd    6   1000000000  295138875   10290   92.68
 enwik9  gzstd   1   1000000000  382578136   9604    99.30
 enwik9  gzkp    1   1000000000  383825945   6544    145.73
 Highly compressible JSON file.
 https://files.klauspost.com/compress/github-june-2days-2019.json.zst
 file                        out level   insize      outsize     millis  mb/s
 github-june-2days-2019.json zskp    1   6273951764  699045015   10620   563.40
 github-june-2days-2019.json zskp    2   6273951764  617881763   11687   511.96
 github-june-2days-2019.json zskp    3   6273951764  537511906   29252   204.54
 github-june-2days-2019.json zstd    1   6273951764  766284037   8450    708.00
 github-june-2days-2019.json zstd    3   6273951764  661889476   10927   547.57
 github-june-2days-2019.json zstd    6   6273951764  642756859   22996   260.18
 github-june-2days-2019.json gzstd   1   6273951764  1164400847  29948   199.79
 github-june-2days-2019.json gzkp    1   6273951764  1128755542  19236   311.03
 VM Image, Linux mint with a few installed applications:
 https://files.klauspost.com/compress/rawstudio-mint14.7z
-file    out level   insize  outsize millis  mb/s
+file                    out level   insize      outsize     millis  mb/s
-rawstudio-mint14.tar    zskp    1   8558382592  3648168838  33398   244.38
+rawstudio-mint14.tar    zskp    1   8558382592  3667489370  20210   403.84
-rawstudio-mint14.tar    zskp    2   8558382592  3376721436  50962   160.16
+rawstudio-mint14.tar    zskp    2   8558382592  3364592300  31873   256.07
-rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  84712   96.35
+rawstudio-mint14.tar    zskp    3   8558382592  3224594213  71751   113.75
-rawstudio-mint14.tar    gzstd   3   8558382592  3740711978  176344  46.28
+rawstudio-mint14.tar    zstd    1   8558382592  3609250104  17136   476.27
-rawstudio-mint14.tar    zstd    1   8558382592  3607859742  27903   292.51
+rawstudio-mint14.tar    zstd    3   8558382592  3341679997  29262   278.92
-rawstudio-mint14.tar    zstd    3   8558382592  3341710879  46700   174.77
+rawstudio-mint14.tar    zstd    6   8558382592  3235846406  77904   104.77
 rawstudio-mint14.tar    gzstd   1   8558382592  3926257486  57722   141.40
 rawstudio-mint14.tar    gzkp    1   8558382592  3970463184  41749   195.49
 CSV data:
 https://files.klauspost.com/compress/nyc-taxi-data-10M.csv.zst
-The test data is designed to test archivers in realistic backup scenarios.
+file                    out level   insize      outsize     millis  mb/s
-http://mattmahoney.net/dc/10gb.html
+nyc-taxi-data-10M.csv   zskp    1   3325605752  641339945   8925    355.35
-
+nyc-taxi-data-10M.csv   zskp    2   3325605752  591748091   11268   281.44
-file    out level   insize  outsize millis  mb/s
+nyc-taxi-data-10M.csv   zskp    3   3325605752  538490114   19880   159.53
-10gb.tar    zskp    1   10065157632 4883149814  45715   209.97
+nyc-taxi-data-10M.csv   zstd    1   3325605752  687399637   8233    385.18
-10gb.tar    zskp    2   10065157632 4638110010  60970   157.44
+nyc-taxi-data-10M.csv   zstd    3   3325605752  598514411   10065   315.07
-10gb.tar    gzstd   1   10065157632 5198296126  97769   98.18
+nyc-taxi-data-10M.csv   zstd    6   3325605752  570522953   20038   158.27
-10gb.tar    gzstd   3   10065157632 4932665487  313427  30.63
+nyc-taxi-data-10M.csv   gzstd   1   3325605752  928656485   23876   132.83
-10gb.tar    zstd    1   10065157632 4940796535  40391   237.65
+nyc-taxi-data-10M.csv   gzkp    1   3325605752  924718719   16388   193.53
 10gb.tar    zstd    3   10065157632 4638618579  52911   181.42
 Silesia Corpus:
 http://sun.aei.polsl.pl/~sdeor/corpus/silesia.zip
 file    out level   insize  outsize millis  mb/s
 silesia.tar zskp    1   211947520   73025800    1108    182.26
 silesia.tar zskp    2   211947520   67674684    1599    126.41
 silesia.tar gzstd   1   211947520   80007735    2515    80.37
 silesia.tar gzstd   3   211947520   73133380    4259    47.45
 silesia.tar zstd    1   211947520   73513991    933     216.64
 silesia.tar zstd    3   211947520   66793301    1377    146.79
 ```
 ### Converters
@@ -309,6 +323,20 @@ The decoder can be used for *concurrent* decompression of multiple buffers.
 It will only allow a certain number of concurrent operations to run. 
 To tweak that yourself use the `WithDecoderConcurrency(n)` option when creating the decoder.   
 ### Dictionaries
 Data compressed with [dictionaries](https://github.com/facebook/zstd#the-case-for-small-data-compression) can be decompressed.
 Dictionaries are added individually to Decoders.
 Dictionaries are generated by the `zstd --train` command and contain an initial state for the decoder.
 To add a dictionary use the `WithDecoderDicts(dicts ...[]byte)` option with the dictionary data.
 Several dictionaries can be added at once.
 A dictionary will be used automatically for the data that specifies it.
 A re-used Decoder will still contain the dictionaries registered.
 When registering multiple dictionaries with the same ID, the last one will be used.
 ### Allocation-less operation
 The decoder has been designed to operate without allocations after a warmup. 
@@ -350,36 +378,42 @@ These are some examples of performance compared to [datadog cgo library](https:/
 The first two are streaming decodes and the last are smaller inputs. 
 ```
-BenchmarkDecoderSilesia-8             20       642550210 ns/op   329.85 MB/s      3101 B/op        8 allocs/op
+BenchmarkDecoderSilesia-8                          3     385000067 ns/op     550.51 MB/s        5498 B/op          8 allocs/op
-BenchmarkDecoderSilesiaCgo-8         100       384930000 ns/op   550.61 MB/s    451878 B/op     9713 allocs/op
+BenchmarkDecoderSilesiaCgo-8                       6     197666567 ns/op    1072.25 MB/s      270672 B/op          8 allocs/op
-BenchmarkDecoderEnwik9-2              10        3146000080 ns/op         317.86 MB/s        2649 B/op          9 allocs/op
+BenchmarkDecoderEnwik9-8                           1    2027001600 ns/op     493.34 MB/s       10496 B/op         18 allocs/op
-BenchmarkDecoderEnwik9Cgo-2           20        1905900000 ns/op         524.69 MB/s     1125120 B/op      45785 allocs/op
+BenchmarkDecoderEnwik9Cgo-8                        2     979499200 ns/op    1020.93 MB/s      270672 B/op          8 allocs/op
-BenchmarkDecoder_DecodeAll/z000000.zst-8               200     7049994 ns/op   138.26 MB/s        40 B/op        2 allocs/op
+Concurrent performance:
 BenchmarkDecoder_DecodeAll/z000001.zst-8            100000       19560 ns/op    97.49 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000002.zst-8              5000      297599 ns/op   236.99 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000003.zst-8              2000      725502 ns/op   141.17 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000004.zst-8            200000        9314 ns/op    54.54 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000005.zst-8             10000      137500 ns/op   104.72 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000006.zst-8               500     2316009 ns/op   206.06 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000007.zst-8             20000       64499 ns/op   344.90 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000008.zst-8             50000       24900 ns/op   219.56 MB/s        40 B/op        2 allocs/op
 BenchmarkDecoder_DecodeAll/z000009.zst-8              1000     2348999 ns/op   154.01 MB/s        40 B/op        2 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000000.zst-8            500     4268005 ns/op   228.38 MB/s   1228849 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-16                28915         42469 ns/op    4340.07 MB/s         114 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000001.zst-8         100000       15250 ns/op   125.05 MB/s      2096 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-16           116505          9965 ns/op    11900.16 MB/s         16 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000002.zst-8          10000      147399 ns/op   478.49 MB/s     73776 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-16              8952        134272 ns/op    3588.70 MB/s         915 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000003.zst-8           5000      320798 ns/op   319.27 MB/s    139312 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-16               11820        102538 ns/op    4161.90 MB/s         594 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000004.zst-8         200000       10004 ns/op    50.77 MB/s       560 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-16             34782         34184 ns/op    3661.88 MB/s          60 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000005.zst-8          20000       73599 ns/op   195.64 MB/s     19120 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-16              27712         43447 ns/op    3500.58 MB/s          99 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000006.zst-8           1000     1119003 ns/op   426.48 MB/s    557104 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-16                 62826         18750 ns/op    21845.10 MB/s        104 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000007.zst-8          20000      103450 ns/op   215.04 MB/s     71296 B/op        9 allocs/op
+BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-16          631545          1794 ns/op    57078.74 MB/s          2 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000008.zst-8         100000       20130 ns/op   271.58 MB/s      6192 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-16         1690140           712 ns/op    172938.13 MB/s         1 B/op          0 allocs/op
-BenchmarkDecoder_DecodeAllCgo/z000009.zst-8           2000     1123500 ns/op   322.00 MB/s    368688 B/op        3 allocs/op
+BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-16                 10432        113593 ns/op    6180.73 MB/s        1143 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallel/html.zst-16                    113206         10671 ns/op    9596.27 MB/s          15 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-16          1530615           779 ns/op    5229.49 MB/s           0 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/kppkn.gtb.zst-16             65217         16192 ns/op    11383.34 MB/s         46 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/geo.protodata.zst-16        292671          4039 ns/op    29363.19 MB/s          6 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/plrabn12.txt.zst-16          26314         46021 ns/op    10470.43 MB/s        293 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/lcet10.txt.zst-16            33897         34900 ns/op    12227.96 MB/s        205 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/asyoulik.txt.zst-16         104348         11433 ns/op    10949.01 MB/s         20 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/alice29.txt.zst-16           75949         15510 ns/op    9805.60 MB/s          32 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/html_x_4.zst-16             173910          6756 ns/op    60624.29 MB/s         37 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/paper-100k.pdf.zst-16       923076          1339 ns/op    76474.87 MB/s          1 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/fireworks.jpeg.zst-16       922920          1351 ns/op    91102.57 MB/s          2 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/urls.10K.zst-16              27649         43618 ns/op    16096.19 MB/s        407 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/html.zst-16                 279073          4160 ns/op    24614.18 MB/s          6 B/op          0 allocs/op
 BenchmarkDecoder_DecodeAllParallelCgo/comp-data.bin.zst-16        749938          1579 ns/op    2581.71 MB/s           0 B/op          0 allocs/op
 ```
-This reflects the performance around May 2019, but this may be out of date.
+This reflects the performance around May 2020, but this may be out of date.
 # Contributions
@@ -5,6 +5,7 @@
 package zstd
 import (
 	"encoding/binary"
 	"errors"
 	"io"
 	"math/bits"
@@ -34,8 +35,12 @@ func (b *bitReader) init(in []byte) error {
 	}
 	b.bitsRead = 64
 	b.value = 0
-	b.fill()
+	if len(in) >= 8 {
-	b.fill()
+		b.fillFastStart()
 	} else {
 		b.fill()
 		b.fill()
 	}
 	b.bitsRead += 8 - uint8(highBits(uint32(v)))
 	return nil
 }
@@ -63,21 +68,31 @@ func (b *bitReader) fillFast() {
 	if b.bitsRead < 32 {
 		return
 	}
-	// Do single re-slice to avoid bounds checks.
+	// 2 bounds checks.
-	v := b.in[b.off-4 : b.off]
+	v := b.in[b.off-4:]
 	v = v[:4]
 	low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 	b.value = (b.value << 32) | uint64(low)
 	b.bitsRead -= 32
 	b.off -= 4
 }
 // fillFastStart loads the 8 bytes just below off as one little-endian word,
 // replacing the container contents and resetting the consumed-bit count.
 // It assumes the bitreader is empty and that at least 8 bytes are left to read.
 func (b *bitReader) fillFastStart() {
 	// Do single re-slice to avoid bounds checks.
 	tail := b.in[b.off-8:]
 	b.value = binary.LittleEndian.Uint64(tail)
 	b.off -= 8
 	b.bitsRead = 0
 }
 // fill() will make sure at least 32 bits are available.
 func (b *bitReader) fill() {
 	if b.bitsRead < 32 {
 		return
 	}
 	if b.off >= 4 {
-		v := b.in[b.off-4 : b.off]
+		v := b.in[b.off-4:]
 		v = v[:4]
 		low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
 		b.value = (b.value << 32) | uint64(low)
 		b.bitsRead -= 32
@@ -75,21 +75,29 @@ type blockDec struct {
 	// Window size of the block.
 	WindowSize uint64
-	Type       blockType
+
-	RLESize    uint32
+	history     chan *history
 	input       chan struct{}
 	result      chan decodeOutput
 	sequenceBuf []seq
 	err         error
 	decWG       sync.WaitGroup
 	// Frame to use for singlethreaded decoding.
 	// Should not be used by the decoder itself since parent may be another frame.
 	localFrame *frameDec
 	// Block is RLE, this is the size.
 	RLESize uint32
 	tmp     [4]byte
 	Type blockType
 	// Is this the last block of a frame?
 	Last bool
 	// Use less memory
-	lowMem      bool
+	lowMem bool
 	history     chan *history
 	input       chan struct{}
 	result      chan decodeOutput
 	sequenceBuf []seq
 	tmp         [4]byte
 	err         error
 	decWG       sync.WaitGroup
 }
 func (b *blockDec) String() string {
@@ -127,25 +135,37 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 	b.Type = blockType((bh >> 1) & 3)
 	// find size.
 	cSize := int(bh >> 3)
 	maxSize := maxBlockSize
 	switch b.Type {
 	case blockTypeReserved:
 		return ErrReservedBlockType
 	case blockTypeRLE:
 		b.RLESize = uint32(cSize)
 		if b.lowMem {
 			maxSize = cSize
 		}
 		cSize = 1
 	case blockTypeCompressed:
 		if debug {
 			println("Data size on stream:", cSize)
 		}
 		b.RLESize = 0
 		maxSize = maxCompressedBlockSize
 		if windowSize < maxCompressedBlockSize && b.lowMem {
 			maxSize = int(windowSize)
 		}
 		if cSize > maxCompressedBlockSize || uint64(cSize) > b.WindowSize {
 			if debug {
 				printf("compressed block too big: csize:%d block: %+v\n", uint64(cSize), b)
 			}
 			return ErrCompressedSizeTooBig
 		}
-	default:
+	case blockTypeRaw:
 		b.RLESize = 0
 		// We do not need a destination for raw blocks.
 		maxSize = -1
 	default:
 		panic("Invalid block type")
 	}
 	// Read block data.
@@ -156,8 +176,8 @@ func (b *blockDec) reset(br byteBuffer, windowSize uint64) error {
 			b.dataStorage = make([]byte, 0, maxBlockSize)
 		}
 	}
-	if cap(b.dst) <= maxBlockSize {
+	if cap(b.dst) <= maxSize {
-		b.dst = make([]byte, 0, maxBlockSize+1)
+		b.dst = make([]byte, 0, maxSize+1)
 	}
 	var err error
 	b.data, err = br.readBig(cSize, b.dataStorage)
@@ -445,26 +465,22 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		if huff == nil {
 			huff = &huff0.Scratch{}
 		}
 		huff.Out = b.literalBuf[:0]
 		huff, literals, err = huff0.ReadTable(literals, huff)
 		if err != nil {
 			println("reading huffman table:", err)
 			return err
 		}
 		// Use our out buffer.
 		huff.Out = b.literalBuf[:0]
 		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		}
 		if err != nil {
 			println("decoding compressed literals:", err)
 			return err
 		}
 		// Make sure we don't leak our literals buffer
 		huff.Out = nil
 		if len(literals) != litRegenSize {
 			return fmt.Errorf("literal output size mismatch want %d, got %d", litRegenSize, len(literals))
 		}
@@ -615,15 +631,12 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		var err error
 		// Use our out buffer.
 		huff = hist.huffTree
 		huff.Out = b.literalBuf[:0]
 		huff.MaxDecodedSize = litRegenSize
 		if fourStreams {
-			literals, err = huff.Decompress4X(literals, litRegenSize)
+			literals, err = huff.Decoder().Decompress4X(b.literalBuf[:0:litRegenSize], literals)
 		} else {
-			literals, err = huff.Decompress1X(literals)
+			literals, err = huff.Decoder().Decompress1X(b.literalBuf[:0:litRegenSize], literals)
 		}
 		// Make sure we don't leak our literals buffer
 		huff.Out = nil
 		if err != nil {
 			println("decompressing literals:", err)
 			return err
@@ -633,12 +646,13 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 		}
 	} else {
 		if hist.huffTree != nil && huff != nil {
-			huffDecoderPool.Put(hist.huffTree)
+			if hist.dict == nil || hist.dict.litDec != hist.huffTree {
 				huffDecoderPool.Put(hist.huffTree)
 			}
 			hist.huffTree = nil
 		}
 	}
 	if huff != nil {
 		huff.Out = nil
 		hist.huffTree = huff
 	}
 	if debug {
@@ -671,12 +685,21 @@ func (b *blockDec) decodeCompressed(hist *history) error {
 	//   If only recent offsets were not transferred, this would be an obvious win.
 	// 	 Also, if first 3 sequences don't reference recent offsets, all sequences can be decoded.
 	hbytes := hist.b
 	if len(hbytes) > hist.windowSize {
 		hbytes = hbytes[len(hbytes)-hist.windowSize:]
 		// We do not need history any more.
 		if hist.dict != nil {
 			hist.dict.content = nil
 		}
 	}
 	if err := seqs.initialize(br, hist, literals, b.dst); err != nil {
 		println("initializing sequences:", err)
 		return err
 	}
-	err = seqs.decode(nSeqs, br, hist.b)
+	err = seqs.decode(nSeqs, br, hbytes)
 	if err != nil {
 		return err
 	}
@@ -295,7 +295,7 @@ func (b *blockEnc) encodeRaw(a []byte) {
 	b.output = bh.appendTo(b.output[:0])
 	b.output = append(b.output, a...)
 	if debug {
-		println("Adding RAW block, length", len(a))
+		println("Adding RAW block, length", len(a), "last:", b.last)
 	}
 }
@@ -308,7 +308,7 @@ func (b *blockEnc) encodeRawTo(dst, src []byte) []byte {
 	dst = bh.appendTo(dst)
 	dst = append(dst, src...)
 	if debug {
-		println("Adding RAW block, length", len(src))
+		println("Adding RAW block, length", len(src), "last:", b.last)
 	}
 	return dst
 }
@@ -322,7 +322,7 @@ func (b *blockEnc) encodeLits(raw bool) error {
 	// Don't compress extremely small blocks
 	if len(b.literals) < 32 || raw {
 		if debug {
-			println("Adding RAW block, length", len(b.literals))
+			println("Adding RAW block, length", len(b.literals), "last:", b.last)
 		}
 		bh.setType(blockTypeRaw)
 		b.output = bh.appendTo(b.output)
@@ -349,7 +349,7 @@ func (b *blockEnc) encodeLits(raw bool) error {
 	switch err {
 	case huff0.ErrIncompressible:
 		if debug {
-			println("Adding RAW block, length", len(b.literals))
+			println("Adding RAW block, length", len(b.literals), "last:", b.last)
 		}
 		bh.setType(blockTypeRaw)
 		b.output = bh.appendTo(b.output)
@@ -444,9 +444,9 @@ func fuzzFseEncoder(data []byte) int {
 }
 // encode will encode the block and append the output in b.output.
-func (b *blockEnc) encode(raw bool) error {
+func (b *blockEnc) encode(raw, rawAllLits bool) error {
 	if len(b.sequences) == 0 {
-		return b.encodeLits(raw)
+		return b.encodeLits(rawAllLits)
 	}
 	// We want some difference
 	if len(b.literals) > (b.size - (b.size >> 5)) {
@@ -31,7 +31,8 @@ func (b *byteReader) overread() bool {
 // Int32 returns a little endian int32 starting at current offset.
 func (b byteReader) Int32() int32 {
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := int32(b2[3])
 	v2 := int32(b2[2])
 	v1 := int32(b2[1])
@@ -55,7 +56,20 @@ func (b byteReader) Uint32() uint32 {
 		}
 		return v
 	}
-	b2 := b.b[b.off : b.off+4 : b.off+4]
+	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
 	v0 := uint32(b2[0])
 	return v0 | (v1 << 8) | (v2 << 16) | (v3 << 24)
 }
 // Uint32NC returns a little endian uint32 starting at current offset.
 // The caller must be sure that there are at least 4 bytes left.
 func (b byteReader) Uint32NC() uint32 {
 	b2 := b.b[b.off:]
 	b2 = b2[:4]
 	v3 := uint32(b2[3])
 	v2 := uint32(b2[2])
 	v1 := uint32(b2[1])
@@ -23,17 +23,15 @@ type Decoder struct {
 	// Unreferenced decoders, ready for use.
 	decoders chan *blockDec
 	// Unreferenced decoders, ready for use.
 	frames chan *frameDec
 	// Streams ready to be decoded.
 	stream chan decodeStream
 	// Current read position used for Reader functionality.
 	current decoderState
-	// Custom dictionaries
+	// Custom dictionaries.
-	dicts map[uint32]struct{}
+	// Always uses copies.
 	dicts map[uint32]dict
 	// streamWg is the waitgroup for all streams
 	streamWg sync.WaitGroup
@@ -66,7 +64,7 @@ var (
 // A Decoder can be used in two modes:
 //
 // 1) As a stream, or
-// 2) For stateless decoding using DecodeAll or DecodeBuffer.
+// 2) For stateless decoding using DecodeAll.
 //
 // Only a single stream can be decoded concurrently, but the same decoder
 // can run multiple concurrent stateless decodes. It is even possible to
@@ -87,12 +85,19 @@ func NewReader(r io.Reader, opts ...DOption) (*Decoder, error) {
 	d.current.output = make(chan decodeOutput, d.o.concurrent)
 	d.current.flushed = true
 	// Transfer option dicts.
 	d.dicts = make(map[uint32]dict, len(d.o.dicts))
 	for _, dc := range d.o.dicts {
 		d.dicts[dc.id] = dc
 	}
 	d.o.dicts = nil
 	// Create decoders
 	d.decoders = make(chan *blockDec, d.o.concurrent)
 	d.frames = make(chan *frameDec, d.o.concurrent)
 	for i := 0; i < d.o.concurrent; i++ {
-		d.frames <- newFrameDec(d.o)
+		dec := newBlockDec(d.o.lowMem)
-		d.decoders <- newBlockDec(d.o.lowMem)
+		dec.localFrame = newFrameDec(d.o)
 		d.decoders <- dec
 	}
 	if r == nil {
@@ -169,7 +174,12 @@ func (d *Decoder) Reset(r io.Reader) error {
 			println("*bytes.Buffer detected, doing sync decode, len:", bb.Len())
 		}
 		b := bb.Bytes()
-		dst, err := d.DecodeAll(b, nil)
+		var dst []byte
 		if cap(d.current.b) > 0 {
 			dst = d.current.b
 		}
 		dst, err := d.DecodeAll(b, dst[:0])
 		if err == nil {
 			err = io.EOF
 		}
@@ -277,23 +287,31 @@ func (d *Decoder) DecodeAll(input, dst []byte) ([]byte, error) {
 	}
 	// Grab a block decoder and frame decoder.
-	block, frame := <-d.decoders, <-d.frames
+	block := <-d.decoders
 	frame := block.localFrame
 	defer func() {
 		if debug {
 			printf("re-adding decoder: %p", block)
 		}
 		d.decoders <- block
 		frame.rawInput = nil
 		frame.bBuf = nil
-		d.frames <- frame
+		d.decoders <- block
 	}()
 	frame.bBuf = input
 	for {
 		frame.history.reset()
 		err := frame.reset(&frame.bBuf)
 		if err == io.EOF {
 			return dst, nil
 		}
 		if frame.DictionaryID != nil {
 			dict, ok := d.dicts[*frame.DictionaryID]
 			if !ok {
 				return nil, ErrUnknownDictionary
 			}
 			frame.history.setDict(&dict)
 		}
 		if err != nil {
 			return dst, err
 		}
@@ -456,10 +474,19 @@ func (d *Decoder) startStreamDecoder(inStream chan decodeStream) {
 		br := readerWrapper{r: stream.r}
 	decodeStream:
 		for {
 			frame.history.reset()
 			err := frame.reset(&br)
 			if debug && err != nil {
 				println("Frame decoder returned", err)
 			}
 			if err == nil && frame.DictionaryID != nil {
 				dict, ok := d.dicts[*frame.DictionaryID]
 				if !ok {
 					err = ErrUnknownDictionary
 				} else {
 					frame.history.setDict(&dict)
 				}
 			}
 			if err != nil {
 				stream.output <- decodeOutput{
 					err: err,
@@ -18,6 +18,7 @@ type decoderOptions struct {
 	lowMem         bool
 	concurrent     int
 	maxDecodedSize uint64
 	dicts          []dict
 }
 func (o *decoderOptions) setDefault() {
@@ -66,3 +67,18 @@ func WithDecoderMaxMemory(n uint64) DOption {
 		return nil
 	}
 }
 // WithDecoderDicts registers one or more serialized dictionaries with the decoder.
 // Every entry is parsed with loadDict; the first parse failure is returned
 // and stops processing of the remaining entries.
 // If several dictionaries with the same ID are provided the last one will be used.
 func WithDecoderDicts(dicts ...[]byte) DOption {
 	return func(o *decoderOptions) error {
 		for i := range dicts {
 			parsed, err := loadDict(dicts[i])
 			if err != nil {
 				return err
 			}
 			// Store a copy of the parsed dictionary value.
 			o.dicts = append(o.dicts, *parsed)
 		}
 		return nil
 	}
 }
@@ -0,0 +1,104 @@
 package zstd
 import (
 	"bytes"
 	"encoding/binary"
 	"errors"
 	"fmt"
 	"io"
 	"github.com/klauspost/compress/huff0"
 )
 // dict holds a parsed decoder dictionary as produced by loadDict.
 type dict struct {
 	// id is the dictionary ID read from the header; loadDict rejects 0.
 	id uint32
 	// litDec is the literal table read from the dictionary by huff0.ReadTable.
 	litDec              *huff0.Scratch
 	// llDec, ofDec and mlDec carry the FSE tables loadDict reads for
 	// literal lengths, offsets and match lengths respectively.
 	llDec, ofDec, mlDec sequenceDec
 	// offsets are the three initial offsets stored in the dictionary.
 	// loadDict requires each to be > 0 and no larger than len(content).
 	offsets             [3]int
 	// content is a private copy of the bytes that follow the offsets.
 	content             []byte
 }
 // dictMagic is the 4-byte magic number loadDict expects at the start of a dictionary.
 var dictMagic = [4]byte{0x37, 0xa4, 0x30, 0xec}
 // loadDict parses a serialized dictionary laid out as described in
 // https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format
 //
 // The input is read in order: magic, dictionary ID, literal (huff0) table,
 // the offset / match-length / literal-length FSE tables, three initial
 // offsets and finally the raw content, which is copied.
 // An error is returned for truncated input, a wrong magic, a zero ID,
 // an unreadable table or an offset outside (0, len(content)].
 func loadDict(b []byte) (*dict, error) {
 	// Size of the fixed fields; anything this short cannot hold tables or content.
 	const staticSize = 8 + (3 * 4)
 	if len(b) <= staticSize {
 		return nil, io.ErrUnexpectedEOF
 	}
 	if !bytes.Equal(b[:4], dictMagic[:]) {
 		return nil, ErrMagicMismatch
 	}
 	id := binary.LittleEndian.Uint32(b[4:8])
 	if id == 0 {
 		return nil, errors.New("dictionaries cannot have ID 0")
 	}
 	// Read literal table
 	litDec, rest, err := huff0.ReadTable(b[8:], nil)
 	if err != nil {
 		return nil, err
 	}
 	d := dict{
 		id:     id,
 		litDec: litDec,
 		llDec:  sequenceDec{fse: &fseDecoder{}},
 		ofDec:  sequenceDec{fse: &fseDecoder{}},
 		mlDec:  sequenceDec{fse: &fseDecoder{}},
 	}
 	br := byteReader{b: rest}
 	// The tables are stored in this order: offsets, match lengths, literal lengths.
 	tables := [...]struct {
 		index tableIndex
 		fse   *fseDecoder
 	}{
 		{tableOffsets, d.ofDec.fse},
 		{tableMatchLengths, d.mlDec.fse},
 		{tableLiteralLengths, d.llDec.fse},
 	}
 	for _, tbl := range tables {
 		if err := tbl.fse.readNCount(&br, uint16(maxTableSymbol[tbl.index])); err != nil {
 			return nil, err
 		}
 		if br.overread() {
 			return nil, io.ErrUnexpectedEOF
 		}
 		if err := tbl.fse.transform(symbolTableX[tbl.index]); err != nil {
 			println("Transform table error:", err)
 			return nil, err
 		}
 		if debug {
 			println("Read table ok", "symbolLen:", tbl.fse.symbolLen)
 		}
 		// Set decoders as predefined so they aren't reused.
 		tbl.fse.preDefined = true
 	}
 	if br.remain() < 12 {
 		return nil, io.ErrUnexpectedEOF
 	}
 	for i := range d.offsets {
 		d.offsets[i] = int(br.Uint32())
 		br.advance(4)
 	}
 	for _, off := range d.offsets {
 		if off <= 0 {
 			return nil, errors.New("invalid offset in dictionary")
 		}
 	}
 	// Everything left is dictionary content; keep our own copy.
 	d.content = make([]byte, br.remain())
 	copy(d.content, br.unread())
 	for _, off := range d.offsets {
 		if off > len(d.content) {
 			return nil, fmt.Errorf("initial offset bigger than dictionary content size %d, offsets: %v", len(d.content), d.offsets)
 		}
 	}
 	return &d, nil
 }
@@ -0,0 +1,518 @@
 // Copyright 2019+ Klaus Post. All rights reserved.
 // License information can be found in the LICENSE file.
 // Based on work by Yann Collet, released under BSD License.
 package zstd
 import "fmt"
 // Sizing of the two hash tables held by betterFastEncoder.
 const (
 	betterLongTableBits = 19                       // Bits used in the long match table
 	betterLongTableSize = 1 << betterLongTableBits // Size of the table
 	// Note: Increasing the short table bits or making the hash shorter
 	// can actually lead to compression degradation since it will 'steal' more from the
 	// long match table and match offsets are quite big.
 	// This greatly depends on the type of input.
 	betterShortTableBits = 13                        // Bits used in the short match table
 	betterShortTableSize = 1 << betterShortTableBits // Size of the table
 )
 // prevEntry is one slot of betterFastEncoder's long match table.
 // It remembers the newest position stored for a hash together with the
 // position that occupied the slot before it.
 type prevEntry struct {
 	// offset is the most recently stored position (Encode stores it with e.cur added).
 	offset int32
 	// prev is the offset value the slot held before the latest store.
 	prev   int32
 }
 // betterFastEncoder uses 2 tables, one for short matches (5 bytes) and one for long matches.
 // The long match table contains the previous entry with the same hash,
 // effectively making it a "chain" of length 2.
 // When we find a long match we choose between the two values and select the longest.
 // When we find a short match, after checking the long, we check if we can find a long at n+1
 // and that it is longer (lazy matching).
 type betterFastEncoder struct {
 	// fastBase supplies the shared encoder state (cur, hist, maxMatchOff, ...).
 	fastBase
 	// table is the short match table, indexed by hash5 of the input.
 	table     [betterShortTableSize]tableEntry
 	// longTable is the long match table, indexed by hash8 of the input.
 	longTable [betterLongTableSize]prevEntry
 }
 // Encode searches src for matches and stores the resulting literals and
 // sequences in blk.
 //
 // src is first handed to addBlock; matching then runs over e.hist starting
 // at the position addBlock returned. Inputs shorter than
 // minNonLiteralBlockSize are stored entirely as literals.
 // Candidates come from the long table (current and previous entry), the
 // short table and, once blk holds more than two sequences, the recent offsets.
 // On return blk.recentOffsets holds the two most recent offsets and
 // blk.extraLits the number of trailing bytes not covered by any sequence.
 func (e *betterFastEncoder) Encode(blk *blockEnc, src []byte) {
 	const (
 		// Input margin is the number of bytes we read (8)
 		// and the maximum we will read ahead (2)
 		inputMargin            = 8 + 2
 		minNonLiteralBlockSize = 16
 	)
 	// Protect against e.cur wraparound.
 	// Every path through the loop body ends in break, so it runs at most once.
 	for e.cur >= bufferReset {
 		if len(e.hist) == 0 {
 			// No history to preserve: clear both tables outright.
 			for i := range e.table[:] {
 				e.table[i] = tableEntry{}
 			}
 			for i := range e.longTable[:] {
 				e.longTable[i] = prevEntry{}
 			}
 			e.cur = e.maxMatchOff
 			break
 		}
 		// Shift down everything in the table that isn't already too far away.
 		minOff := e.cur + int32(len(e.hist)) - e.maxMatchOff
 		for i := range e.table[:] {
 			v := e.table[i].offset
 			if v < minOff {
 				v = 0
 			} else {
 				v = v - e.cur + e.maxMatchOff
 			}
 			e.table[i].offset = v
 		}
 		for i := range e.longTable[:] {
 			v := e.longTable[i].offset
 			v2 := e.longTable[i].prev
 			if v < minOff {
 				// The newest entry is out of range, so both entries are dropped.
 				v = 0
 				v2 = 0
 			} else {
 				v = v - e.cur + e.maxMatchOff
 				if v2 < minOff {
 					v2 = 0
 				} else {
 					v2 = v2 - e.cur + e.maxMatchOff
 				}
 			}
 			e.longTable[i] = prevEntry{
 				offset: v,
 				prev:   v2,
 			}
 		}
 		e.cur = e.maxMatchOff
 		break
 	}
 	s := e.addBlock(src)
 	blk.size = len(src)
 	if len(src) < minNonLiteralBlockSize {
 		// Too small to search: the whole input becomes literals.
 		blk.extraLits = len(src)
 		blk.literals = blk.literals[:len(src)]
 		copy(blk.literals, src)
 		return
 	}
 	// Override src
 	src = e.hist
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
 	const stepSize = 1
 	const kSearchStrength = 9
 	// nextEmit is where in src the next emitLiteral should start from.
 	nextEmit := s
 	cv := load6432(src, s)
 	// Relative offsets
 	offset1 := int32(blk.recentOffsets[0])
 	offset2 := int32(blk.recentOffsets[1])
 	// addLiterals appends src[nextEmit:until] to blk.literals and records its
 	// length in s.litLen; it does nothing when there are no pending literals.
 	addLiterals := func(s *seq, until int32) {
 		if until == nextEmit {
 			return
 		}
 		blk.literals = append(blk.literals, src[nextEmit:until]...)
 		s.litLen = uint32(until - nextEmit)
 	}
 	if debug {
 		println("recent offsets:", blk.recentOffsets)
 	}
 	// The inner search loop runs until a table match is found (emitting
 	// repeat-offset sequences on the way); the code after it emits the
 	// sequence for that match.
 encodeLoop:
 	for {
 		var t int32
 		// We allow the encoder to optionally turn off repeat offsets across blocks
 		canRepeat := len(blk.sequences) > 2
 		var matched int32
 		for {
 			if debugAsserts && canRepeat && offset1 == 0 {
 				panic("offset0 was 0")
 			}
 			nextHashS := hash5(cv, betterShortTableBits)
 			nextHashL := hash8(cv, betterLongTableBits)
 			candidateL := e.longTable[nextHashL]
 			candidateS := e.table[nextHashS]
 			const repOff = 1
 			repIndex := s - offset1 + repOff
 			off := s + e.cur
 			// Record the current position in both tables before testing candidates.
 			e.longTable[nextHashL] = prevEntry{offset: off, prev: candidateL.offset}
 			e.table[nextHashS] = tableEntry{offset: off, val: uint32(cv)}
 			if canRepeat {
 				if repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff*8)) {
 					// Consider history as well.
 					var seq seq
 					lenght := 4 + e.matchlen(s+4+repOff, repIndex+4, src)
 					seq.matchLen = uint32(lenght - zstdMinMatch)
 					// We might be able to match backwards.
 					// Extend as long as we can.
 					start := s + repOff
 					// We end the search early, so we don't risk 0 literals
 					// and have to do special offset treatment.
 					startLimit := nextEmit + 1
 					tMin := s - e.maxMatchOff
 					if tMin < 0 {
 						tMin = 0
 					}
 					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 						repIndex--
 						start--
 						seq.matchLen++
 					}
 					addLiterals(&seq, start)
 					// rep 0
 					seq.offset = 1
 					if debugSequences {
 						println("repeat sequence", seq, "next s:", s)
 					}
 					blk.sequences = append(blk.sequences, seq)
 					// Index match start+1 (long) -> s - 1
 					index0 := s + repOff
 					s += lenght + repOff
 					nextEmit = s
 					if s >= sLimit {
 						if debug {
 							println("repeat ended", s, lenght)
 						}
 						break encodeLoop
 					}
 					// Index skipped...
 					// Every second position gets a long-table entry; the position after it gets a short-table entry.
 					for index0 < s-1 {
 						cv0 := load6432(src, index0)
 						cv1 := cv0 >> 8
 						h0 := hash8(cv0, betterLongTableBits)
 						off := index0 + e.cur
 						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 						index0 += 2
 					}
 					cv = load6432(src, s)
 					continue
 				}
 				const repOff2 = 1
 				// We deviate from the reference encoder and also check offset 2.
 				// Still slower and not much better, so disabled.
 				// repIndex = s - offset2 + repOff2
 				// The leading `false` makes the branch below unreachable.
 				if false && repIndex >= 0 && load6432(src, repIndex) == load6432(src, s+repOff) {
 					// Consider history as well.
 					var seq seq
 					lenght := 8 + e.matchlen(s+8+repOff2, repIndex+8, src)
 					seq.matchLen = uint32(lenght - zstdMinMatch)
 					// We might be able to match backwards.
 					// Extend as long as we can.
 					start := s + repOff2
 					// We end the search early, so we don't risk 0 literals
 					// and have to do special offset treatment.
 					startLimit := nextEmit + 1
 					tMin := s - e.maxMatchOff
 					if tMin < 0 {
 						tMin = 0
 					}
 					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 						repIndex--
 						start--
 						seq.matchLen++
 					}
 					addLiterals(&seq, start)
 					// rep 2
 					seq.offset = 2
 					if debugSequences {
 						println("repeat sequence 2", seq, "next s:", s)
 					}
 					blk.sequences = append(blk.sequences, seq)
 					index0 := s + repOff2
 					s += lenght + repOff2
 					nextEmit = s
 					if s >= sLimit {
 						if debug {
 							println("repeat ended", s, lenght)
 						}
 						break encodeLoop
 					}
 					// Index skipped...
 					for index0 < s-1 {
 						cv0 := load6432(src, index0)
 						cv1 := cv0 >> 8
 						h0 := hash8(cv0, betterLongTableBits)
 						off := index0 + e.cur
 						e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 						e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 						index0 += 2
 					}
 					cv = load6432(src, s)
 					// Swap offsets
 					offset1, offset2 = offset2, offset1
 					continue
 				}
 			}
 			// Find the offsets of our two matches.
 			coffsetL := candidateL.offset - e.cur
 			coffsetLP := candidateL.prev - e.cur
 			// Check if we have a long match.
 			if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 				// Found a long match, at least 8 bytes.
 				matched = e.matchlen(s+8, coffsetL+8, src) + 8
 				t = coffsetL
 				if debugAsserts && s <= t {
 					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
 				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				if debugMatches {
 					println("long match")
 				}
 				// Also try the previous entry of the same slot and keep the longer match.
 				if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
 					// Found a long match, at least 8 bytes.
 					prevMatch := e.matchlen(s+8, coffsetLP+8, src) + 8
 					if prevMatch > matched {
 						matched = prevMatch
 						t = coffsetLP
 					}
 					if debugAsserts && s <= t {
 						panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 					}
 					if debugAsserts && s-t > e.maxMatchOff {
 						panic("s - t >e.maxMatchOff")
 					}
 					if debugMatches {
 						println("long match")
 					}
 				}
 				break
 			}
 			// Check if we have a long match on prev.
 			if s-coffsetLP < e.maxMatchOff && cv == load6432(src, coffsetLP) {
 				// Found a long match, at least 8 bytes.
 				matched = e.matchlen(s+8, coffsetLP+8, src) + 8
 				t = coffsetLP
 				if debugAsserts && s <= t {
 					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
 				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				if debugMatches {
 					println("long match")
 				}
 				break
 			}
 			coffsetS := candidateS.offset - e.cur
 			// Check if we have a short match.
 			if s-coffsetS < e.maxMatchOff && uint32(cv) == candidateS.val {
 				// found a regular match
 				matched = e.matchlen(s+4, coffsetS+4, src) + 4
 				// See if we can find a long match at s+1
 				const checkAt = 1
 				// Note: this cv shadows the outer cv for the rest of this branch.
 				cv := load6432(src, s+checkAt)
 				nextHashL = hash8(cv, betterLongTableBits)
 				candidateL = e.longTable[nextHashL]
 				coffsetL = candidateL.offset - e.cur
 				// We can store it, since we have at least a 4 byte match.
 				e.longTable[nextHashL] = prevEntry{offset: s + checkAt + e.cur, prev: candidateL.offset}
 				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 					// Found a long match, at least 8 bytes.
 					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
 					if matchedNext > matched {
 						t = coffsetL
 						s += checkAt
 						matched = matchedNext
 						if debugMatches {
 							println("long match (after short)")
 						}
 						break
 					}
 				}
 				// Check prev long...
 				coffsetL = candidateL.prev - e.cur
 				if s-coffsetL < e.maxMatchOff && cv == load6432(src, coffsetL) {
 					// Found a long match, at least 8 bytes.
 					matchedNext := e.matchlen(s+8+checkAt, coffsetL+8, src) + 8
 					if matchedNext > matched {
 						t = coffsetL
 						s += checkAt
 						matched = matchedNext
 						if debugMatches {
 							println("prev long match (after short)")
 						}
 						break
 					}
 				}
 				// Neither long candidate at s+1 was longer; keep the short match.
 				t = coffsetS
 				if debugAsserts && s <= t {
 					panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 				}
 				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				if debugAsserts && t < 0 {
 					panic("t<0")
 				}
 				if debugMatches {
 					println("short match")
 				}
 				break
 			}
 			// No match found, move forward in input.
 			// The step grows with the distance since the last emitted position.
 			s += stepSize + ((s - nextEmit) >> (kSearchStrength - 1))
 			if s >= sLimit {
 				break encodeLoop
 			}
 			cv = load6432(src, s)
 		}
 		// A 4-byte match has been found. Update recent offsets.
 		// We'll later see if more than 4 bytes.
 		offset2 = offset1
 		offset1 = s - t
 		if debugAsserts && s <= t {
 			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 		}
 		if debugAsserts && canRepeat && int(offset1) > len(src) {
 			panic("invalid offset")
 		}
 		// Extend the n-byte match as long as possible.
 		l := matched
 		// Extend backwards
 		tMin := s - e.maxMatchOff
 		if tMin < 0 {
 			tMin = 0
 		}
 		for t > tMin && s > nextEmit && src[t-1] == src[s-1] && l < maxMatchLength {
 			s--
 			t--
 			l++
 		}
 		// Write our sequence
 		var seq seq
 		seq.litLen = uint32(s - nextEmit)
 		seq.matchLen = uint32(l - zstdMinMatch)
 		if seq.litLen > 0 {
 			blk.literals = append(blk.literals, src[nextEmit:s]...)
 		}
 		// Offset values 1-3 are used for repeat offsets (see seq.offset = 1 below),
 		// so a new offset is stored with 3 added.
 		seq.offset = uint32(s-t) + 3
 		s += l
 		if debugSequences {
 			println("sequence", seq, "next s:", s)
 		}
 		blk.sequences = append(blk.sequences, seq)
 		nextEmit = s
 		if s >= sLimit {
 			break encodeLoop
 		}
 		// Index match start+1 (long) -> s - 1
 		index0 := s - l + 1
 		for index0 < s-1 {
 			cv0 := load6432(src, index0)
 			cv1 := cv0 >> 8
 			h0 := hash8(cv0, betterLongTableBits)
 			off := index0 + e.cur
 			e.longTable[h0] = prevEntry{offset: off, prev: e.longTable[h0].offset}
 			e.table[hash5(cv1, betterShortTableBits)] = tableEntry{offset: off + 1, val: uint32(cv1)}
 			index0 += 2
 		}
 		cv = load6432(src, s)
 		if !canRepeat {
 			continue
 		}
 		// Check offset 2
 		// Repeats for as long as the data at s matches at the second recent offset.
 		for {
 			o2 := s - offset2
 			if load3232(src, o2) != uint32(cv) {
 				// Do regular search
 				break
 			}
 			// Store this, since we have it.
 			nextHashS := hash5(cv, betterShortTableBits)
 			nextHashL := hash8(cv, betterLongTableBits)
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
 			l := 4 + e.matchlen(s+4, o2+4, src)
 			e.longTable[nextHashL] = prevEntry{offset: s + e.cur, prev: e.longTable[nextHashL].offset}
 			e.table[nextHashS] = tableEntry{offset: s + e.cur, val: uint32(cv)}
 			seq.matchLen = uint32(l) - zstdMinMatch
 			seq.litLen = 0
 			// Since litlen is always 0, this is offset 1.
 			seq.offset = 1
 			s += l
 			nextEmit = s
 			if debugSequences {
 				println("sequence", seq, "next s:", s)
 			}
 			blk.sequences = append(blk.sequences, seq)
 			// Swap offset 1 and 2.
 			offset1, offset2 = offset2, offset1
 			if s >= sLimit {
 				// Finished
 				break encodeLoop
 			}
 			cv = load6432(src, s)
 		}
 	}
 	// Anything after the last sequence is stored as trailing literals.
 	if int(nextEmit) < len(src) {
 		blk.literals = append(blk.literals, src[nextEmit:]...)
 		blk.extraLits = len(src) - int(nextEmit)
 	}
 	blk.recentOffsets[0] = uint32(offset1)
 	blk.recentOffsets[1] = uint32(offset2)
 	if debug {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 }
 // EncodeNoHist will encode a block with no history and no following blocks.
 // NOTE(review): this implementation has no dedicated no-history path; it
 // simply delegates to Encode, so src goes through the same addBlock call
 // and the same match-length checks as a regular block.
 func (e *betterFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	e.Encode(blk, src)
 }
@@ -80,10 +80,7 @@ func (e *doubleFastEncoder) Encode(blk *blockEnc, src []byte) {
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
-	stepSize := int32(e.o.targetLength)
+	const stepSize = 1
 	if stepSize == 0 {
 		stepSize++
 	}
 	const kSearchStrength = 8
@@ -172,55 +169,6 @@ encodeLoop:
 					cv = load6432(src, s)
 					continue
 				}
 				const repOff2 = 1
 				// We deviate from the reference encoder and also check offset 2.
 				// Slower and not consistently better, so disabled.
 				// repIndex = s - offset2 + repOff2
 				if false && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>(repOff2*8)) {
 					// Consider history as well.
 					var seq seq
 					lenght := 4 + e.matchlen(s+4+repOff2, repIndex+4, src)
 					seq.matchLen = uint32(lenght - zstdMinMatch)
 					// We might be able to match backwards.
 					// Extend as long as we can.
 					start := s + repOff2
 					// We end the search early, so we don't risk 0 literals
 					// and have to do special offset treatment.
 					startLimit := nextEmit + 1
 					tMin := s - e.maxMatchOff
 					if tMin < 0 {
 						tMin = 0
 					}
 					for repIndex > tMin && start > startLimit && src[repIndex-1] == src[start-1] && seq.matchLen < maxMatchLength-zstdMinMatch-1 {
 						repIndex--
 						start--
 						seq.matchLen++
 					}
 					addLiterals(&seq, start)
 					// rep 2
 					seq.offset = 2
 					if debugSequences {
 						println("repeat sequence 2", seq, "next s:", s)
 					}
 					blk.sequences = append(blk.sequences, seq)
 					s += lenght + repOff2
 					nextEmit = s
 					if s >= sLimit {
 						if debug {
 							println("repeat ended", s, lenght)
 						}
 						break encodeLoop
 					}
 					cv = load6432(src, s)
 					// Swap offsets
 					offset1, offset2 = offset2, offset1
 					continue
 				}
 			}
 			// Find the offsets of our two matches.
 			coffsetL := s - (candidateL.offset - e.cur)
@@ -372,7 +320,7 @@ encodeLoop:
 			}
 			// Store this, since we have it.
-			nextHashS := hash5(cv1>>8, dFastShortTableBits)
+			nextHashS := hash5(cv, dFastShortTableBits)
 			nextHashL := hash8(cv, dFastLongTableBits)
 			// We have at least 4 byte match.
@@ -450,10 +398,7 @@ func (e *doubleFastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 1.
-	stepSize := int32(e.o.targetLength)
+	const stepSize = 1
 	if stepSize == 0 {
 		stepSize++
 	}
 	const kSearchStrength = 8
@@ -726,4 +671,8 @@ encodeLoop:
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 	// We do not store history, so we must offset e.cur to avoid false matches for next user.
 	if e.cur < bufferReset {
 		e.cur += int32(len(src))
 	}
 }
@@ -6,6 +6,7 @@ package zstd
 import (
 	"fmt"
 	"math"
 	"math/bits"
 	"github.com/klauspost/compress/zstd/internal/xxhash"
@@ -23,26 +24,29 @@ type tableEntry struct {
 	offset int32
 }
-type fastEncoder struct {
+type fastBase struct {
 	o encParams
 	// cur is the offset at the start of hist
 	cur int32
 	// maximum offset. Should be at least 2x block size.
 	maxMatchOff int32
 	hist        []byte
 	crc         *xxhash.Digest
 	table       [tableSize]tableEntry
 	tmp         [8]byte
 	blk         *blockEnc
 }
 type fastEncoder struct {
 	fastBase
 	table [tableSize]tableEntry
 }
 // CRC returns the underlying CRC writer.
-func (e *fastEncoder) CRC() *xxhash.Digest {
+func (e *fastBase) CRC() *xxhash.Digest {
 	return e.crc
 }
 // AppendCRC will append the CRC to the destination slice and return it.
-func (e *fastEncoder) AppendCRC(dst []byte) []byte {
+func (e *fastBase) AppendCRC(dst []byte) []byte {
 	crc := e.crc.Sum(e.tmp[:0])
 	dst = append(dst, crc[7], crc[6], crc[5], crc[4])
 	return dst
@@ -50,7 +54,7 @@ func (e *fastEncoder) AppendCRC(dst []byte) []byte {
 // WindowSize returns the window size of the encoder,
 // or a window size small enough to contain the input size, if > 0.
-func (e *fastEncoder) WindowSize(size int) int32 {
+func (e *fastBase) WindowSize(size int) int32 {
 	if size > 0 && size < int(e.maxMatchOff) {
 		b := int32(1) << uint(bits.Len(uint(size)))
 		// Keep minimum window.
@@ -63,7 +67,7 @@ func (e *fastEncoder) WindowSize(size int) int32 {
 }
 // Block returns the current block.
-func (e *fastEncoder) Block() *blockEnc {
+func (e *fastBase) Block() *blockEnc {
 	return e.blk
 }
@@ -112,11 +116,7 @@ func (e *fastEncoder) Encode(blk *blockEnc, src []byte) {
 	sLimit := int32(len(src)) - inputMargin
 	// stepSize is the number of bytes to skip on every main loop iteration.
 	// It should be >= 2.
-	stepSize := int32(e.o.targetLength)
+	const stepSize = 2
 	if stepSize == 0 {
 		stepSize++
 	}
 	stepSize++
 	// TEMPLATE
 	const hashLog = tableBits
@@ -169,9 +169,22 @@ encodeLoop:
 			if canRepeat && repIndex >= 0 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				lenght := 4 + e.matchlen(s+6, repIndex+4, src)
+				var length int32
 				// length = 4 + e.matchlen(s+6, repIndex+4, src)
 				{
 					a := src[s+6:]
 					b := src[repIndex+4:]
 					endI := len(a) & (math.MaxInt32 - 7)
 					length = int32(endI) + 4
 					for i := 0; i < endI; i += 8 {
 						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
 							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
 							break
 						}
 					}
 				}
-				seq.matchLen = uint32(lenght - zstdMinMatch)
+				seq.matchLen = uint32(length - zstdMinMatch)
 				// We might be able to match backwards.
 				// Extend as long as we can.
@@ -197,11 +210,11 @@ encodeLoop:
 					println("repeat sequence", seq, "next s:", s)
 				}
 				blk.sequences = append(blk.sequences, seq)
-				s += lenght + 2
+				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
 					if debug {
-						println("repeat ended", s, lenght)
+						println("repeat ended", s, length)
 					}
 					break encodeLoop
@@ -257,7 +270,20 @@ encodeLoop:
 		}
 		// Extend the 4-byte match as long as possible.
-		l := e.matchlen(s+4, t+4, src) + 4
+		//l := e.matchlen(s+4, t+4, src) + 4
 		var l int32
 		{
 			a := src[s+4:]
 			b := src[t+4:]
 			endI := len(a) & (math.MaxInt32 - 7)
 			l = int32(endI) + 4
 			for i := 0; i < endI; i += 8 {
 				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
 					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
 					break
 				}
 			}
 		}
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -294,7 +320,20 @@ encodeLoop:
 		if o2 := s - offset2; canRepeat && load3232(src, o2) == uint32(cv) {
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
-			l := 4 + e.matchlen(s+4, o2+4, src)
+			//l := 4 + e.matchlen(s+4, o2+4, src)
 			var l int32
 			{
 				a := src[s+4:]
 				b := src[o2+4:]
 				endI := len(a) & (math.MaxInt32 - 7)
 				l = int32(endI) + 4
 				for i := 0; i < endI; i += 8 {
 					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
 						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
 						break
 					}
 				}
 			}
 			// Store this, since we have it.
 			nextHash := hash6(cv, hashLog)
@@ -344,6 +383,7 @@ func (e *fastEncoder) EncodeNoHist(blk *blockEnc, src []byte) {
 			panic("src too big")
 		}
 	}
 	// Protect against e.cur wraparound.
 	if e.cur >= bufferReset {
 		for i := range e.table[:] {
@@ -412,10 +452,23 @@ encodeLoop:
 			if len(blk.sequences) > 2 && load3232(src, repIndex) == uint32(cv>>16) {
 				// Consider history as well.
 				var seq seq
-				// lenght := 4 + e.matchlen(s+6, repIndex+4, src)
+				// length := 4 + e.matchlen(s+6, repIndex+4, src)
-				lenght := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
+				// length := 4 + int32(matchLen(src[s+6:], src[repIndex+4:]))
 				var length int32
 				{
 					a := src[s+6:]
 					b := src[repIndex+4:]
 					endI := len(a) & (math.MaxInt32 - 7)
 					length = int32(endI) + 4
 					for i := 0; i < endI; i += 8 {
 						if diff := load64(a, i) ^ load64(b, i); diff != 0 {
 							length = int32(i+bits.TrailingZeros64(diff)>>3) + 4
 							break
 						}
 					}
 				}
-				seq.matchLen = uint32(lenght - zstdMinMatch)
+				seq.matchLen = uint32(length - zstdMinMatch)
 				// We might be able to match backwards.
 				// Extend as long as we can.
@@ -441,11 +494,11 @@ encodeLoop:
 					println("repeat sequence", seq, "next s:", s)
 				}
 				blk.sequences = append(blk.sequences, seq)
-				s += lenght + 2
+				s += length + 2
 				nextEmit = s
 				if s >= sLimit {
 					if debug {
-						println("repeat ended", s, lenght)
+						println("repeat ended", s, length)
 					}
 					break encodeLoop
@@ -464,6 +517,9 @@ encodeLoop:
 				if debugAsserts && s-t > e.maxMatchOff {
 					panic("s - t >e.maxMatchOff")
 				}
 				if debugAsserts && t < 0 {
 					panic(fmt.Sprintf("t (%d) < 0, candidate.offset: %d, e.cur: %d, coffset0: %d, e.maxMatchOff: %d", t, candidate.offset, e.cur, coffset0, e.maxMatchOff))
 				}
 				break
 			}
@@ -496,9 +552,25 @@ encodeLoop:
 			panic(fmt.Sprintf("s (%d) <= t (%d)", s, t))
 		}
 		if debugAsserts && t < 0 {
 			panic(fmt.Sprintf("t (%d) < 0 ", t))
 		}
 		// Extend the 4-byte match as long as possible.
 		//l := e.matchlenNoHist(s+4, t+4, src) + 4
-		l := int32(matchLen(src[s+4:], src[t+4:])) + 4
+		// l := int32(matchLen(src[s+4:], src[t+4:])) + 4
 		var l int32
 		{
 			a := src[s+4:]
 			b := src[t+4:]
 			endI := len(a) & (math.MaxInt32 - 7)
 			l = int32(endI) + 4
 			for i := 0; i < endI; i += 8 {
 				if diff := load64(a, i) ^ load64(b, i); diff != 0 {
 					l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
 					break
 				}
 			}
 		}
 		// Extend backwards
 		tMin := s - e.maxMatchOff
@@ -536,7 +608,20 @@ encodeLoop:
 			// We have at least 4 byte match.
 			// No need to check backwards. We come straight from a match
 			//l := 4 + e.matchlenNoHist(s+4, o2+4, src)
-			l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
+			// l := 4 + int32(matchLen(src[s+4:], src[o2+4:]))
 			var l int32
 			{
 				a := src[s+4:]
 				b := src[o2+4:]
 				endI := len(a) & (math.MaxInt32 - 7)
 				l = int32(endI) + 4
 				for i := 0; i < endI; i += 8 {
 					if diff := load64(a, i) ^ load64(b, i); diff != 0 {
 						l = int32(i+bits.TrailingZeros64(diff)>>3) + 4
 						break
 					}
 				}
 			}
 			// Store this, since we have it.
 			nextHash := hash6(cv, hashLog)
@@ -569,9 +654,13 @@ encodeLoop:
 	if debug {
 		println("returning, recent offsets:", blk.recentOffsets, "extra literals:", blk.extraLits)
 	}
 	// We do not store history, so we must offset e.cur to avoid false matches for next user.
 	if e.cur < bufferReset {
 		e.cur += int32(len(src))
 	}
 }
-func (e *fastEncoder) addBlock(src []byte) int32 {
+func (e *fastBase) addBlock(src []byte) int32 {
 	if debugAsserts && e.cur > bufferReset {
 		panic(fmt.Sprintf("ecur (%d) > buffer reset (%d)", e.cur, bufferReset))
 	}
@@ -602,17 +691,17 @@ func (e *fastEncoder) addBlock(src []byte) int32 {
 // UseBlock will replace the block with the provided one,
 // but transfer recent offsets from the previous.
-func (e *fastEncoder) UseBlock(enc *blockEnc) {
+func (e *fastBase) UseBlock(enc *blockEnc) {
 	enc.reset(e.blk)
 	e.blk = enc
 }
-func (e *fastEncoder) matchlenNoHist(s, t int32, src []byte) int32 {
+func (e *fastBase) matchlenNoHist(s, t int32, src []byte) int32 {
 	// Extend the match to be as long as possible.
 	return int32(matchLen(src[s:], src[t:]))
 }
-func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 {
+func (e *fastBase) matchlen(s, t int32, src []byte) int32 {
 	if debugAsserts {
 		if s < 0 {
 			err := fmt.Sprintf("s (%d) < 0", s)
@@ -626,18 +715,17 @@ func (e *fastEncoder) matchlen(s, t int32, src []byte) int32 {
 			err := fmt.Sprintf("s (%d) - t (%d) > maxMatchOff (%d)", s, t, e.maxMatchOff)
 			panic(err)
 		}
-	}
+		if len(src)-int(s) > maxCompressedBlockSize {
-	s1 := int(s) + maxMatchLength - 4
+			panic(fmt.Sprintf("len(src)-s (%d) > maxCompressedBlockSize (%d)", len(src)-int(s), maxCompressedBlockSize))
-	if s1 > len(src) {
+		}
 		s1 = len(src)
 	}
 	// Extend the match to be as long as possible.
-	return int32(matchLen(src[s:s1], src[t:]))
+	return int32(matchLen(src[s:], src[t:]))
 }
 // Reset the encoding table.
-func (e *fastEncoder) Reset() {
+func (e *fastBase) Reset(singleBlock bool) {
 	if e.blk == nil {
 		e.blk = &blockEnc{}
 		e.blk.init()
@@ -650,7 +738,7 @@ func (e *fastEncoder) Reset() {
 	} else {
 		e.crc.Reset()
 	}
-	if cap(e.hist) < int(e.maxMatchOff*2) {
+	if !singleBlock && cap(e.hist) < int(e.maxMatchOff*2) {
 		l := e.maxMatchOff * 2
 		// Make it at least 1MB.
 		if l < 1<<20 {
@@ -4,6 +4,8 @@
 package zstd
 /*
 // encParams are not really used, just here for reference.
 type encParams struct {
 	// largest match distance : larger == more compression, more memory needed during decompression
 	windowLog uint8
@@ -152,3 +154,4 @@ var defEncParams = [4][]encParams{
 		{14, 15, 15, 10, 3, 999, strategyBtultra2}, // level 22.
 	},
 }
 */
@@ -35,21 +35,22 @@ type encoder interface {
 	AppendCRC([]byte) []byte
 	WindowSize(size int) int32
 	UseBlock(*blockEnc)
-	Reset()
+	Reset(singleBlock bool)
 }
 type encoderState struct {
-	w             io.Writer
+	w                io.Writer
-	filling       []byte
+	filling          []byte
-	current       []byte
+	current          []byte
-	previous      []byte
+	previous         []byte
-	encoder       encoder
+	encoder          encoder
-	writing       *blockEnc
+	writing          *blockEnc
-	err           error
+	err              error
-	writeErr      error
+	writeErr         error
-	nWritten      int64
+	nWritten         int64
-	headerWritten bool
+	headerWritten    bool
-	eofWritten    bool
+	eofWritten       bool
 	fullFrameWritten bool
 	// This waitgroup indicates an encode is running.
 	wg sync.WaitGroup
@@ -71,27 +72,26 @@ func NewWriter(w io.Writer, opts ...EOption) (*Encoder, error) {
 	}
 	if w != nil {
 		e.Reset(w)
 	} else {
 		e.init.Do(func() {
 			e.initialize()
 		})
 	}
 	return &e, nil
 }
 func (e *Encoder) initialize() {
 	if e.o.concurrent == 0 {
 		e.o.setDefault()
 	}
 	e.encoders = make(chan encoder, e.o.concurrent)
 	for i := 0; i < e.o.concurrent; i++ {
-		e.encoders <- e.o.encoder()
+		enc := e.o.encoder()
 		// If not single block, history will be allocated on first use.
 		enc.Reset(true)
 		e.encoders <- enc
 	}
 }
 // Reset will re-initialize the writer and new writes will encode to the supplied writer
 // as a new, independent stream.
 func (e *Encoder) Reset(w io.Writer) {
 	e.init.Do(func() {
 		e.initialize()
 	})
 	s := &e.state
 	s.wg.Wait()
 	s.wWg.Wait()
@@ -115,9 +115,10 @@ func (e *Encoder) Reset(w io.Writer) {
 	s.filling = s.filling[:0]
 	s.current = s.current[:0]
 	s.previous = s.previous[:0]
-	s.encoder.Reset()
+	s.encoder.Reset(false)
 	s.headerWritten = false
 	s.eofWritten = false
 	s.fullFrameWritten = false
 	s.w = w
 	s.err = nil
 	s.nWritten = 0
@@ -176,6 +177,23 @@ func (e *Encoder) nextBlock(final bool) error {
 		return fmt.Errorf("block > maxStoreBlockSize")
 	}
 	if !s.headerWritten {
 		// If we have a single block encode, do a sync compression.
 		if final && len(s.filling) > 0 {
 			s.current = e.EncodeAll(s.filling, s.current[:0])
 			var n2 int
 			n2, s.err = s.w.Write(s.current)
 			if s.err != nil {
 				return s.err
 			}
 			s.nWritten += int64(n2)
 			s.current = s.current[:0]
 			s.filling = s.filling[:0]
 			s.headerWritten = true
 			s.fullFrameWritten = true
 			s.eofWritten = true
 			return nil
 		}
 		var tmp [maxHeaderSize]byte
 		fh := frameHeader{
 			ContentSize:   0,
@@ -263,7 +281,7 @@ func (e *Encoder) nextBlock(final bool) error {
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(src) != len(blk.literals) || len(src) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy)
+				err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
 			}
 			switch err {
 			case errIncompressible:
@@ -298,7 +316,9 @@ func (e *Encoder) ReadFrom(r io.Reader) (n int64, err error) {
 	src := e.state.filling
 	for {
 		n2, err := r.Read(src)
-		_, _ = e.state.encoder.CRC().Write(src[:n2])
+		if e.o.crc {
 			_, _ = e.state.encoder.CRC().Write(src[:n2])
 		}
 		// src is now the unfilled part...
 		src = src[n2:]
 		n += int64(n2)
@@ -363,6 +383,9 @@ func (e *Encoder) Close() error {
 	if err != nil {
 		return err
 	}
 	if e.state.fullFrameWritten {
 		return s.err
 	}
 	s.wg.Wait()
 	s.wWg.Wait()
@@ -422,18 +445,14 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		}
 		return dst
 	}
-	e.init.Do(func() {
+	e.init.Do(e.initialize)
 		e.o.setDefault()
 		e.initialize()
 	})
 	enc := <-e.encoders
 	defer func() {
 		// Release encoder reference to last block.
-		enc.Reset()
+		// If a non-single block is needed the encoder will reset again.
 		enc.Reset(true)
 		e.encoders <- enc
 	}()
 	enc.Reset()
 	blk := enc.Block()
 	// Use single segments when above minimum window and below 1MB.
 	single := len(src) < 1<<20 && len(src) > MinWindowSize
 	if e.o.single != nil {
@@ -456,12 +475,13 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		panic(err)
 	}
-	if len(src) <= e.o.blockSize && len(src) <= maxBlockSize {
+	// If we can do everything in one block, prefer that.
 	if len(src) <= maxCompressedBlockSize {
 		// Slightly faster with no history and everything in one block.
 		if e.o.crc {
 			_, _ = enc.CRC().Write(src)
 		}
-		blk.reset(nil)
+		blk := enc.Block()
 		blk.last = true
 		enc.EncodeNoHist(blk, src)
@@ -472,7 +492,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		if len(blk.literals) != len(src) || len(src) != e.o.blockSize {
 			// Output directly to dst
 			blk.output = dst
-			err = blk.encode(e.o.noEntropy)
+			err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
 		}
 		switch err {
@@ -488,6 +508,8 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 		}
 		blk.output = oldout
 	} else {
 		enc.Reset(false)
 		blk := enc.Block()
 		for len(src) > 0 {
 			todo := src
 			if len(todo) > e.o.blockSize {
@@ -507,7 +529,7 @@ func (e *Encoder) EncodeAll(src, dst []byte) []byte {
 			// If we got the exact same number of literals as input,
 			// assume the literals cannot be compressed.
 			if len(blk.literals) != len(todo) || len(todo) != e.o.blockSize {
-				err = blk.encode(e.o.noEntropy)
+				err = blk.encode(e.o.noEntropy, !e.o.allLitEntropy)
 			}
 			switch err {
@@ -12,15 +12,18 @@ type EOption func(*encoderOptions) error
 // options retains accumulated state of multiple options.
 type encoderOptions struct {
-	concurrent int
+	concurrent      int
-	crc        bool
+	level           EncoderLevel
-	single     *bool
+	single          *bool
-	pad        int
+	pad             int
-	blockSize  int
+	blockSize       int
-	windowSize int
+	windowSize      int
-	level      EncoderLevel
+	crc             bool
-	fullZero   bool
+	fullZero        bool
-	noEntropy  bool
+	noEntropy       bool
 	allLitEntropy   bool
 	customWindow    bool
 	customALEntropy bool
 }
 func (o *encoderOptions) setDefault() {
@@ -30,7 +33,7 @@ func (o *encoderOptions) setDefault() {
 		crc:        true,
 		single:     nil,
 		blockSize:  1 << 16,
-		windowSize: 1 << 22,
+		windowSize: 8 << 20,
 		level:      SpeedDefault,
 	}
 }
@@ -39,9 +42,11 @@ func (o *encoderOptions) setDefault() {
 func (o encoderOptions) encoder() encoder {
 	switch o.level {
 	case SpeedDefault:
-		return &doubleFastEncoder{fastEncoder: fastEncoder{maxMatchOff: int32(o.windowSize)}}
+		return &doubleFastEncoder{fastEncoder: fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}}
 	case SpeedBetterCompression:
 		return &betterFastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
 	case SpeedFastest:
-		return &fastEncoder{maxMatchOff: int32(o.windowSize)}
+		return &fastEncoder{fastBase: fastBase{maxMatchOff: int32(o.windowSize)}}
 	}
 	panic("unknown compression level")
 }
@@ -67,7 +72,7 @@ func WithEncoderConcurrency(n int) EOption {
 }
 // WithWindowSize will set the maximum allowed back-reference distance.
-// The value must be a power of two between WindowSizeMin and WindowSizeMax.
+// The value must be a power of two between MinWindowSize and MaxWindowSize.
 // A larger value will enable better compression but allocate more memory and,
 // for above-default values, take considerably longer.
 // The default value is determined by the compression level.
@@ -83,6 +88,7 @@ func WithWindowSize(n int) EOption {
 		}
 		o.windowSize = n
 		o.customWindow = true
 		if o.blockSize > o.windowSize {
 			o.blockSize = o.windowSize
 		}
@@ -130,18 +136,18 @@ const (
 	// This is roughly equivalent to the default Zstandard mode (level 3).
 	SpeedDefault
 	// SpeedBetterCompression will yield better compression than the default.
 	// Currently it is about zstd level 7-8 with ~ 2x-3x the default CPU usage.
 	// By using this, notice that CPU usage may go up in the future.
 	SpeedBetterCompression
 	// speedLast should be kept as the last actual compression option.
 	// This is not for external usage, but is used to keep track of the valid options.
 	speedLast
 	// SpeedBetterCompression will (in the future) yield better compression than the default,
 	// but at approximately 4x the CPU usage of the default.
 	// For now this is not implemented.
 	SpeedBetterCompression = SpeedDefault
 	// SpeedBestCompression will choose the best available compression option.
 	// For now this is not implemented.
-	SpeedBestCompression = SpeedDefault
+	SpeedBestCompression = SpeedBetterCompression
 )
 // EncoderLevelFromString will convert a string representation of an encoding level back
@@ -163,8 +169,10 @@ func EncoderLevelFromZstd(level int) EncoderLevel {
 	switch {
 	case level < 3:
 		return SpeedFastest
-	case level >= 3:
+	case level >= 3 && level < 6:
 		return SpeedDefault
 	case level > 5:
 		return SpeedBetterCompression
 	}
 	return SpeedDefault
 }
@@ -176,6 +184,8 @@ func (e EncoderLevel) String() string {
 		return "fastest"
 	case SpeedDefault:
 		return "default"
 	case SpeedBetterCompression:
 		return "better"
 	default:
 		return "invalid"
 	}
@@ -189,6 +199,20 @@ func WithEncoderLevel(l EncoderLevel) EOption {
 			return fmt.Errorf("unknown encoder level")
 		}
 		o.level = l
 		if !o.customWindow {
 			switch o.level {
 			case SpeedFastest:
 				o.windowSize = 4 << 20
 			case SpeedDefault:
 				o.windowSize = 8 << 20
 			case SpeedBetterCompression:
 				o.windowSize = 16 << 20
 			}
 		}
 		if !o.customALEntropy {
 			o.allLitEntropy = l > SpeedFastest
 		}
 		return nil
 	}
 }
@@ -203,6 +227,18 @@ func WithZeroFrames(b bool) EOption {
 	}
 }
 // WithAllLitEntropyCompression controls whether entropy compression is applied
 // when no matches are found.
 // Turning it off skips incompressible data faster, but compression is lost for
 // input that has no matches yet a skewed character distribution.
 // When this option is not supplied, the default depends on the selected
 // compression level.
 func WithAllLitEntropyCompression(b bool) EOption {
 	setter := func(o *encoderOptions) error {
 		o.allLitEntropy = b
 		// Record that the value was chosen explicitly, so that WithEncoderLevel
 		// does not replace it with the level's default.
 		o.customALEntropy = true
 		return nil
 	}
 	return setter
 }
 // WithNoEntropyCompression will always skip entropy compression of literals.
 // This can be useful if content has matches, but unlikely to benefit from entropy
 // compression. Usually the slight speed improvement is not worth enabling this.
@@ -16,16 +16,11 @@ import (
 )
 type frameDec struct {
-	o         decoderOptions
+	o      decoderOptions
-	crc       hash.Hash64
+	crc    hash.Hash64
-	frameDone sync.WaitGroup
+	offset int64
 	offset    int64
-	WindowSize       uint64
+	WindowSize uint64
 	DictionaryID     uint32
 	FrameContentSize uint64
 	HasCheckSum      bool
 	SingleSegment    bool
 	// maxWindowSize is the maximum windows size to support.
 	// should never be bigger than max-int.
@@ -42,9 +37,16 @@ type frameDec struct {
 	// Byte buffer that can be reused for small input blocks.
 	bBuf byteBuf
 	FrameContentSize uint64
 	frameDone        sync.WaitGroup
 	DictionaryID  *uint32
 	HasCheckSum   bool
 	SingleSegment bool
 	// asyncRunning indicates whether the async routine processes input on 'decoding'.
 	asyncRunning   bool
 	asyncRunningMu sync.Mutex
 	asyncRunning   bool
 }
 const (
@@ -140,7 +142,7 @@ func (d *frameDec) reset(br byteBuffer) error {
 	// Read Dictionary_ID
 	// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary_id
-	d.DictionaryID = 0
+	d.DictionaryID = nil
 	if size := fhd & 3; size != 0 {
 		if size == 3 {
 			size = 4
@@ -152,19 +154,22 @@ func (d *frameDec) reset(br byteBuffer) error {
 			}
 			return io.ErrUnexpectedEOF
 		}
 		var id uint32
 		switch size {
 		case 1:
-			d.DictionaryID = uint32(b[0])
+			id = uint32(b[0])
 		case 2:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8)
+			id = uint32(b[0]) | (uint32(b[1]) << 8)
 		case 4:
-			d.DictionaryID = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
+			id = uint32(b[0]) | (uint32(b[1]) << 8) | (uint32(b[2]) << 16) | (uint32(b[3]) << 24)
 		}
 		if debug {
-			println("Dict size", size, "ID:", d.DictionaryID)
+			println("Dict size", size, "ID:", id)
 		}
-		if d.DictionaryID != 0 {
+		if id > 0 {
-			return ErrUnknownDictionary
+			// ID 0 means "sorry, no dictionary anyway".
 			// https://github.com/facebook/zstd/blob/dev/doc/zstd_compression_format.md#dictionary-format
 			d.DictionaryID = &id
 		}
 	}
@@ -231,7 +236,11 @@ func (d *frameDec) reset(br byteBuffer) error {
 		return ErrWindowSizeTooSmall
 	}
 	d.history.windowSize = int(d.WindowSize)
-	d.history.maxSize = d.history.windowSize + maxBlockSize
+	if d.o.lowMem && d.history.windowSize < maxBlockSize {
 		d.history.maxSize = d.history.windowSize * 2
 	} else {
 		d.history.maxSize = d.history.windowSize + maxBlockSize
 	}
 	// history contains input - maybe we do something
 	d.rawInput = br
 	return nil
@@ -318,8 +327,8 @@ func (d *frameDec) checkCRC() error {
 func (d *frameDec) initAsync() {
 	if !d.o.lowMem && !d.SingleSegment {
-		// set max extra size history to 20MB.
+		// set max extra size history to 10MB.
-		d.history.maxSize = d.history.windowSize + maxBlockSize*10
+		d.history.maxSize = d.history.windowSize + maxBlockSize*5
 	}
 	// re-alloc if more than one extra block size.
 	if d.o.lowMem && cap(d.history.b) > d.history.maxSize+maxBlockSize {
@@ -345,8 +354,6 @@ func (d *frameDec) initAsync() {
 // When the frame has finished decoding the *bufio.Reader
 // containing the remaining input will be sent on frameDec.frameDone.
 func (d *frameDec) startDecoder(output chan decodeOutput) {
 	// TODO: Init to dictionary
 	d.history.reset()
 	written := int64(0)
 	defer func() {
@@ -439,8 +446,6 @@ func (d *frameDec) startDecoder(output chan decodeOutput) {
 // runDecoder will create a sync decoder that will decode a block of data.
 func (d *frameDec) runDecoder(dst []byte, dec *blockDec) ([]byte, error) {
 	// TODO: Init to dictionary
 	d.history.reset()
 	saved := d.history.b
 	// We use the history for output to avoid copying it.
@@ -19,7 +19,7 @@ const (
 	 *  Increasing memory usage improves compression ratio
 	 *  Reduced memory usage can improve speed, due to cache effect
 	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
-	maxMemoryUsage = 11
+	maxMemoryUsage = tablelogAbsoluteMax + 2
 	maxTableLog    = maxMemoryUsage - 2
 	maxTablesize   = 1 << maxTableLog
@@ -55,7 +55,7 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 	if b.remain() < 4 {
 		return errors.New("input too small")
 	}
-	bitStream := b.Uint32()
+	bitStream := b.Uint32NC()
 	nbBits := uint((bitStream & 0xF) + minTablelog) // extract tableLog
 	if nbBits > tablelogAbsoluteMax {
 		println("Invalid tablelog:", nbBits)
@@ -79,7 +79,8 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				n0 += 24
 				if r := b.remain(); r > 5 {
 					b.advance(2)
-					bitStream = b.Uint32() >> bitCount
+					// The check above should make sure we can read 32 bits
 					bitStream = b.Uint32NC() >> bitCount
 				} else {
 					// end of bit stream
 					bitStream >>= 16
@@ -104,10 +105,11 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 				charnum++
 			}
-			if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
+			if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 				b.advance(bitCount >> 3)
 				bitCount &= 7
-				bitStream = b.Uint32() >> bitCount
+				// The check above should make sure we can read 32 bits
 				bitStream = b.Uint32NC() >> bitCount
 			} else {
 				bitStream >>= 2
 			}
@@ -148,17 +150,16 @@ func (s *fseDecoder) readNCount(b *byteReader, maxSymbol uint16) error {
 			threshold >>= 1
 		}
-		//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "remain:", b.remain())
+		if r := b.remain(); r >= 7 || r-int(bitCount>>3) >= 4 {
 		if r := b.remain(); r >= 7 || r+int(bitCount>>3) >= 4 {
 			b.advance(bitCount >> 3)
 			bitCount &= 7
 			// The check above should make sure we can read 32 bits
 			bitStream = b.Uint32NC() >> (bitCount & 31)
 		} else {
 			bitCount -= (uint)(8 * (len(b.b) - 4 - b.off))
 			b.off = len(b.b) - 4
-			//println("b.off:", b.off, "len:", len(b.b), "bc:", bitCount, "iend", iend)
+			bitStream = b.Uint32() >> (bitCount & 31)
 		}
 		bitStream = b.Uint32() >> (bitCount & 31)
 		//printf("bitstream is now: 0b%b", bitStream)
 	}
 	s.symbolLen = charnum
 	if s.symbolLen <= 1 {
@@ -17,6 +17,7 @@ type history struct {
 	windowSize    int
 	maxSize       int
 	error         bool
 	dict          *dict
 }
 // reset will reset the history to initial state of a frame.
@@ -36,12 +37,27 @@ func (h *history) reset() {
 	}
 	h.decoders = sequenceDecs{}
 	if h.huffTree != nil {
-		huffDecoderPool.Put(h.huffTree)
+		if h.dict == nil || h.dict.litDec != h.huffTree {
 			huffDecoderPool.Put(h.huffTree)
 		}
 	}
 	h.huffTree = nil
 	h.dict = nil
 	//printf("history created: %+v (l: %d, c: %d)", *h, len(h.b), cap(h.b))
 }
 // setDict makes dict the starting state of the history: h keeps a reference to
 // it and takes over its sequence decoders, its offsets and its literal decoder.
 // A nil dict leaves h unchanged.
 func (h *history) setDict(dict *dict) {
 	if dict != nil {
 		h.dict = dict
 		h.huffTree = dict.litDec
 		h.recentOffsets = dict.offsets
 		// Install the three sequence decoders carried by the dictionary.
 		dec := &h.decoders
 		dec.litLengths, dec.offsets, dec.matchLengths = dict.llDec, dict.ofDec, dict.mlDec
 	}
 }
 // append bytes to history.
 // This function will make sure there is space for it,
 // if the buffer has been allocated with enough extra space.
@@ -62,8 +62,10 @@ type sequenceDecs struct {
 	matchLengths sequenceDec
 	prevOffset   [3]int
 	hist         []byte
 	dict         []byte
 	literals     []byte
 	out          []byte
 	windowSize   int
 	maxBits      uint8
 }
@@ -82,7 +84,12 @@ func (s *sequenceDecs) initialize(br *bitReader, hist *history, literals, out []
 	s.hist = hist.b
 	s.prevOffset = hist.recentOffsets
 	s.maxBits = s.litLengths.fse.maxBits + s.offsets.fse.maxBits + s.matchLengths.fse.maxBits
 	s.windowSize = hist.windowSize
 	s.out = out
 	s.dict = nil
 	if hist.dict != nil {
 		s.dict = hist.dict.content
 	}
 	return nil
 }
@@ -98,23 +105,78 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			printf("reading sequence %d, exceeded available data\n", seqs-i)
 			return io.ErrUnexpectedEOF
 		}
-		var litLen, matchOff, matchLen int
+		var ll, mo, ml int
 		if br.off > 4+((maxOffsetBits+16+16)>>3) {
-			litLen, matchOff, matchLen = s.nextFast(br, llState, mlState, ofState)
+			// inlined function:
 			// ll, mo, ml = s.nextFast(br, llState, mlState, ofState)
 			// Final will not read from stream.
 			var llB, mlB, moB uint8
 			ll, llB = llState.final()
 			ml, mlB = mlState.final()
 			mo, moB = ofState.final()
 			// extra bits are stored in reverse order.
 			br.fillFast()
 			mo += br.getBits(moB)
 			if s.maxBits > 32 {
 				br.fillFast()
 			}
 			ml += br.getBits(mlB)
 			ll += br.getBits(llB)
 			if moB > 1 {
 				s.prevOffset[2] = s.prevOffset[1]
 				s.prevOffset[1] = s.prevOffset[0]
 				s.prevOffset[0] = mo
 			} else {
 				// mo = s.adjustOffset(mo, ll, moB)
 				// Inlined for rather big speedup
 				if ll == 0 {
 					// There is an exception though, when current sequence's literals_length = 0.
 					// In this case, repeated offsets are shifted by one, so an offset_value of 1 means Repeated_Offset2,
 					// an offset_value of 2 means Repeated_Offset3, and an offset_value of 3 means Repeated_Offset1 - 1_byte.
 					mo++
 				}
 				if mo == 0 {
 					mo = s.prevOffset[0]
 				} else {
 					var temp int
 					if mo == 3 {
 						temp = s.prevOffset[0] - 1
 					} else {
 						temp = s.prevOffset[mo]
 					}
 					if temp == 0 {
 						// 0 is not valid; input is corrupted; force offset to 1
 						println("temp was 0")
 						temp = 1
 					}
 					if mo != 1 {
 						s.prevOffset[2] = s.prevOffset[1]
 					}
 					s.prevOffset[1] = s.prevOffset[0]
 					s.prevOffset[0] = temp
 					mo = temp
 				}
 			}
 			br.fillFast()
 		} else {
-			litLen, matchOff, matchLen = s.next(br, llState, mlState, ofState)
+			ll, mo, ml = s.next(br, llState, mlState, ofState)
 			br.fill()
 		}
 		if debugSequences {
-			println("Seq", seqs-i-1, "Litlen:", litLen, "matchOff:", matchOff, "(abs) matchLen:", matchLen)
+			println("Seq", seqs-i-1, "Litlen:", ll, "mo:", mo, "(abs) ml:", ml)
 		}
-		if litLen > len(s.literals) {
+		if ll > len(s.literals) {
-			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", litLen, len(s.literals))
+			return fmt.Errorf("unexpected literal count, want %d bytes, but only %d is available", ll, len(s.literals))
 		}
-		size := litLen + matchLen + len(s.out)
+		size := ll + ml + len(s.out)
 		if size-startSize > maxBlockSize {
 			return fmt.Errorf("output (%d) bigger than max block size", size)
 		}
@@ -125,49 +187,70 @@ func (s *sequenceDecs) decode(seqs int, br *bitReader, hist []byte) error {
 			s.out = append(s.out, make([]byte, maxBlockSize)...)
 			s.out = s.out[:len(s.out)-maxBlockSize]
 		}
-		if matchLen > maxMatchLen {
+		if ml > maxMatchLen {
-			return fmt.Errorf("match len (%d) bigger than max allowed length", matchLen)
+			return fmt.Errorf("match len (%d) bigger than max allowed length", ml)
 		}
 		if matchOff > len(s.out)+len(hist)+litLen {
 			return fmt.Errorf("match offset (%d) bigger than current history (%d)", matchOff, len(s.out)+len(hist)+litLen)
 		}
 		if matchOff == 0 && matchLen > 0 {
 			return fmt.Errorf("zero matchoff and matchlen > 0")
 		}
-		s.out = append(s.out, s.literals[:litLen]...)
+		// Add literals
-		s.literals = s.literals[litLen:]
+		s.out = append(s.out, s.literals[:ll]...)
 		s.literals = s.literals[ll:]
 		out := s.out
 		if mo > len(s.out)+len(hist) || mo > s.windowSize {
 			if len(s.dict) == 0 {
 				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
 			}
 			// we may be in dictionary.
 			dictO := len(s.dict) - (mo - (len(s.out) + len(hist)))
 			if dictO < 0 || dictO >= len(s.dict) {
 				return fmt.Errorf("match offset (%d) bigger than current history (%d)", mo, len(s.out)+len(hist))
 			}
 			end := dictO + ml
 			if end > len(s.dict) {
 				out = append(out, s.dict[dictO:]...)
 				mo -= len(s.dict) - dictO
 				ml -= len(s.dict) - dictO
 			} else {
 				out = append(out, s.dict[dictO:end]...)
 				mo = 0
 				ml = 0
 			}
 		}
 		if mo == 0 && ml > 0 {
 			return fmt.Errorf("zero matchoff and matchlen (%d) > 0", ml)
 		}
 		// Copy from history.
 		// TODO: Blocks without history could be made to ignore this completely.
-		if v := matchOff - len(s.out); v > 0 {
+		if v := mo - len(s.out); v > 0 {
 			// v is the start position in history from end.
 			start := len(s.hist) - v
-			if matchLen > v {
+			if ml > v {
 				// Some goes into current block.
 				// Copy remainder of history
 				out = append(out, s.hist[start:]...)
-				matchOff -= v
+				mo -= v
-				matchLen -= v
+				ml -= v
 			} else {
-				out = append(out, s.hist[start:start+matchLen]...)
+				out = append(out, s.hist[start:start+ml]...)
-				matchLen = 0
+				ml = 0
 			}
 		}
 		// We must be in current buffer now
-		if matchLen > 0 {
+		if ml > 0 {
-			start := len(s.out) - matchOff
+			start := len(s.out) - mo
-			if matchLen <= len(s.out)-start {
+			if ml <= len(s.out)-start {
 				// No overlap
-				out = append(out, s.out[start:start+matchLen]...)
+				out = append(out, s.out[start:start+ml]...)
 			} else {
 				// Overlapping copy
 				// Extend destination slice and copy one byte at the time.
-				out = out[:len(out)+matchLen]
+				out = out[:len(out)+ml]
-				src := out[start : start+matchLen]
+				src := out[start : start+ml]
 				// Destination is the space we just added.
-				dst := out[len(out)-matchLen:]
+				dst := out[len(out)-ml:]
 				dst = dst[:len(src)]
 				for i := range src {
 					dst[i] = src[i]
@@ -178,7 +178,7 @@ func (r *SnappyConverter) Convert(in io.Reader, w io.Writer) (int64, error) {
 				r.err = ErrSnappyCorrupt
 				return written, r.err
 			}
-			err = r.block.encode(false)
+			err = r.block.encode(false, false)
 			switch err {
 			case errIncompressible:
 				r.block.popOffsets()
@@ -87,6 +87,17 @@ func printf(format string, a ...interface{}) {
 	}
 }
 // matchLenFast counts the matching leading bytes of a and b, comparing eight
 // bytes per step. Only whole 8-byte groups of a are examined, so up to the last
 // 7 bytes of a are never compared and the result can fall short by that much.
 // NOTE(review): b is read at the same offsets as a, so it presumably has to be
 // at least as long as the examined part of a; confirm against load64.
 func matchLenFast(a, b []byte) int {
 	// Round len(a) down to a multiple of 8 (the mask also drops bits above MaxInt32).
 	limit := len(a) & (math.MaxInt32 - 7)
 	pos := 0
 	for pos < limit {
 		x := load64(a, pos) ^ load64(b, pos)
 		if x != 0 {
 			// Trailing zero bits of the XOR, divided by 8, give the number of
 			// equal bytes in this group. This relies on load64 placing the
 			// first byte in the low bits; TODO confirm.
 			return pos + (bits.TrailingZeros64(x) >> 3)
 		}
 		pos += 8
 	}
 	return limit
 }
 // matchLen returns the maximum length.
 // a must be the shortest of the two.
 // The function also returns whether all bytes matched.
@@ -97,33 +108,18 @@ func matchLen(a, b []byte) int {
 			return i + (bits.TrailingZeros64(diff) >> 3)
 		}
 	}
 	checked := (len(a) >> 3) << 3
 	a = a[checked:]
 	b = b[checked:]
 	// TODO: We could do a 4 check.
 	for i := range a {
 		if a[i] != b[i] {
-			return int(i) + checked
+			return i + checked
 		}
 	}
 	return len(a) + checked
 }
 // matchLenIn returns the number of leading bytes of src[s:] that are equal to
 // the bytes at the same positions of src[t:].
 // The candidate src[t:] is resliced to the length of src[s:], so it must have
 // room for that many bytes.
 func matchLenIn(src []byte, s, t int32) int32 {
 	cand := src[t:]
 	cur := src[s:len(src)]
 	cand = cand[:len(cur)]
 	// Advance while the bytes agree; n ends up as the match length.
 	n := 0
 	for n < len(cur) && cur[n] == cand[n] {
 		n++
 	}
 	return int32(n)
 }
 func load3232(b []byte, i int32) uint32 {
 	// Help the compiler eliminate bounds checks on the read so it can be done in a single read.
 	b = b[i:]
@@ -55,6 +55,9 @@ func newFile(dataSources []dataSource, opts LoadOptions) *File {
 	if len(opts.KeyValueDelimiterOnWrite) == 0 {
 		opts.KeyValueDelimiterOnWrite = "="
 	}
 	if len(opts.ChildSectionDelimiter) == 0 {
 		opts.ChildSectionDelimiter = "."
 	}
 	return &File{
 		BlockMode:   true,
@@ -82,7 +85,7 @@ func (f *File) NewSection(name string) (*Section, error) {
 		return nil, errors.New("empty section name")
 	}
-	if f.options.Insensitive && name != DefaultSection {
+	if (f.options.Insensitive || f.options.InsensitiveSections) && name != DefaultSection {
 		name = strings.ToLower(name)
 	}
@@ -144,7 +147,7 @@ func (f *File) SectionsByName(name string) ([]*Section, error) {
 	if len(name) == 0 {
 		name = DefaultSection
 	}
-	if f.options.Insensitive {
+	if f.options.Insensitive || f.options.InsensitiveSections {
 		name = strings.ToLower(name)
 	}
@@ -236,7 +239,7 @@ func (f *File) DeleteSectionWithIndex(name string, index int) error {
 	if len(name) == 0 {
 		name = DefaultSection
 	}
-	if f.options.Insensitive {
+	if f.options.Insensitive || f.options.InsensitiveSections {
 		name = strings.ToLower(name)
 	}
@@ -347,7 +350,7 @@ func (f *File) writeToBuffer(indent string) (*bytes.Buffer, error) {
 			}
 		}
-		if i > 0 || DefaultHeader {
+		if i > 0 || DefaultHeader || (i == 0 && strings.ToUpper(sec.name) != DefaultSection) {
 			if _, err := buf.WriteString("[" + sname + "]" + LineBreak); err != nil {
 				return nil, err
 			}
@@ -451,6 +454,8 @@ func (f *File) writeToBuffer(indent string) (*bytes.Buffer, error) {
 					val = `"""` + val + `"""`
 				} else if !f.options.IgnoreInlineComment && strings.ContainsAny(val, "#;") {
 					val = "`" + val + "`"
 				} else if len(strings.TrimSpace(val)) != len(val) {
 					val = `"` + val + `"`
 				}
 				if _, err := buf.WriteString(equalSign + val + LineBreak); err != nil {
 					return nil, err
@@ -71,6 +71,10 @@ type LoadOptions struct {
 	Loose bool
 	// Insensitive indicates whether the parser forces all section and key names to lowercase.
 	Insensitive bool
 	// InsensitiveSections indicates whether the parser forces all section names to lowercase.
 	InsensitiveSections bool
 	// InsensitiveKeys indicates whether the parser forces all key names to lowercase.
 	InsensitiveKeys bool
 	// IgnoreContinuation indicates whether to ignore continuation lines while parsing.
 	IgnoreContinuation bool
 	// IgnoreInlineComment indicates whether to ignore comments at the end of value and treat it as part of value.
@@ -109,6 +113,8 @@ type LoadOptions struct {
 	KeyValueDelimiters string
 	// KeyValueDelimiterOnWrite is the delimiter that is used to separate key and value in output. By default, it is "=".
 	KeyValueDelimiterOnWrite string
 	// ChildSectionDelimiter is the delimiter that is used to separate child sections. By default, it is ".".
 	ChildSectionDelimiter string
 	// PreserveSurroundedQuote indicates whether to preserve surrounded quote (single and double quotes).
 	PreserveSurroundedQuote bool
 	// DebugFunc is called to collect debug information (currently only useful to debug parsing Python-style multiline values).
@@ -377,7 +377,7 @@ func (f *File) parse(reader io.Reader) (err error) {
 	// Ignore error because default section name is never empty string.
 	name := DefaultSection
-	if f.options.Insensitive {
+	if f.options.Insensitive || f.options.InsensitiveSections {
 		name = strings.ToLower(DefaultSection)
 	}
 	section, _ := f.NewSection(name)
@@ -469,7 +469,7 @@ func (f *File) parse(reader io.Reader) (err error) {
 			inUnparseableSection = false
 			for i := range f.options.UnparseableSections {
 				if f.options.UnparseableSections[i] == name ||
-					(f.options.Insensitive && strings.EqualFold(f.options.UnparseableSections[i], name)) {
+					((f.options.Insensitive || f.options.InsensitiveSections) && strings.EqualFold(f.options.UnparseableSections[i], name)) {
 					inUnparseableSection = true
 					continue
 				}
@@ -66,7 +66,7 @@ func (s *Section) SetBody(body string) {
 func (s *Section) NewKey(name, val string) (*Key, error) {
 	if len(name) == 0 {
 		return nil, errors.New("error creating new key: empty key name")
-	} else if s.f.options.Insensitive {
+	} else if s.f.options.Insensitive || s.f.options.InsensitiveKeys {
 		name = strings.ToLower(name)
 	}
@@ -109,7 +109,7 @@ func (s *Section) GetKey(name string) (*Key, error) {
 	if s.f.BlockMode {
 		s.f.lock.RLock()
 	}
-	if s.f.options.Insensitive {
+	if s.f.options.Insensitive || s.f.options.InsensitiveKeys {
 		name = strings.ToLower(name)
 	}
 	key := s.keys[name]
@@ -121,7 +121,7 @@ func (s *Section) GetKey(name string) (*Key, error) {
 		// Check if it is a child-section.
 		sname := s.name
 		for {
-			if i := strings.LastIndex(sname, "."); i > -1 {
+			if i := strings.LastIndex(sname, s.f.options.ChildSectionDelimiter); i > -1 {
 				sname = sname[:i]
 				sec, err := s.f.GetSection(sname)
 				if err != nil {
@@ -188,7 +188,7 @@ func (s *Section) ParentKeys() []*Key {
 	var parentKeys []*Key
 	sname := s.name
 	for {
-		if i := strings.LastIndex(sname, "."); i > -1 {
+		if i := strings.LastIndex(sname, s.f.options.ChildSectionDelimiter); i > -1 {
 			sname = sname[:i]
 			sec, err := s.f.GetSection(sname)
 			if err != nil {
@@ -245,7 +245,7 @@ func (s *Section) DeleteKey(name string) {
 // For example, "[parent.child1]" and "[parent.child12]" are child sections
 // of section "[parent]".
 func (s *Section) ChildSections() []*Section {
-	prefix := s.name + "."
+	prefix := s.name + s.f.options.ChildSectionDelimiter
 	children := make([]*Section, 0, 3)
 	for _, name := range s.f.sectionList {
 		if strings.HasPrefix(name, prefix) {
@@ -479,7 +479,7 @@ func reflectSliceWithProperType(key *Key, field reflect.Value, delim string, all
 				_ = keyWithShadows.AddShadow(val)
 			}
 		}
-		key = keyWithShadows
+		*key = *keyWithShadows
 		return nil
 	}
@@ -595,7 +595,7 @@ func (s *Section) reflectFrom(val reflect.Value) error {
 			continue
 		}
-		if (tpField.Type.Kind() == reflect.Ptr && tpField.Anonymous) ||
+		if (tpField.Type.Kind() == reflect.Ptr && tpField.Type.Elem().Kind() == reflect.Struct) ||
 			(tpField.Type.Kind() == reflect.Struct && tpField.Type.Name() != "Time") {
 			// Note: The only error here is section doesn't exist.
 			sec, err := s.f.GetSection(fieldName)
@@ -25,7 +25,7 @@ gitea.com/macaron/cors
 # gitea.com/macaron/csrf v0.0.0-20190822024205-3dc5a4474439
 ## explicit
 gitea.com/macaron/csrf
-# gitea.com/macaron/gzip v0.0.0-20191118041502-506895b47aae
+# gitea.com/macaron/gzip v0.0.0-20200827120000-efa5e8477cf5
 ## explicit
 gitea.com/macaron/gzip
 # gitea.com/macaron/i18n v0.0.0-20190822004228-474e714e2223
@@ -34,7 +34,7 @@ gitea.com/macaron/i18n
 # gitea.com/macaron/inject v0.0.0-20190805023432-d4c86e31027a
 ## explicit
 gitea.com/macaron/inject
-# gitea.com/macaron/macaron v1.4.0
+# gitea.com/macaron/macaron v1.5.0
 ## explicit
 gitea.com/macaron/macaron
 # gitea.com/macaron/session v0.0.0-20191207215012-613cebf0674d
@@ -479,7 +479,7 @@ github.com/keybase/go-crypto/openpgp/errors
 github.com/keybase/go-crypto/openpgp/packet
 github.com/keybase/go-crypto/openpgp/s2k
 github.com/keybase/go-crypto/rsa
-# github.com/klauspost/compress v1.10.2
+# github.com/klauspost/compress v1.10.11
 ## explicit
 github.com/klauspost/compress/flate
 github.com/klauspost/compress/fse
@@ -768,7 +768,7 @@ go.mongodb.org/mongo-driver/bson/bsonrw
 go.mongodb.org/mongo-driver/bson/bsontype
 go.mongodb.org/mongo-driver/bson/primitive
 go.mongodb.org/mongo-driver/x/bsonx/bsoncore
-# golang.org/x/crypto v0.0.0-20200728195943-123391ffb6de
+# golang.org/x/crypto v0.0.0-20200820211705-5c72a883971a
 ## explicit
 golang.org/x/crypto/acme
 golang.org/x/crypto/acme/autocert
@@ -933,7 +933,7 @@ gopkg.in/asn1-ber.v1
 # gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
 ## explicit
 gopkg.in/gomail.v2
-# gopkg.in/ini.v1 v1.57.0
+# gopkg.in/ini.v1 v1.60.2
 ## explicit
 gopkg.in/ini.v1
 # gopkg.in/ldap.v3 v3.0.2