diff --git a/modules/booklist/booklist.go b/modules/booklist/booklist.go
index c4c49dc4..2b83560f 100644
--- a/modules/booklist/booklist.go
+++ b/modules/booklist/booklist.go
@@ -2,18 +2,20 @@ package booklist
 
 import (
 	"fmt"
+	"image"
 	"image/jpeg"
 	"log"
+	"math"
 	"os"
 	"path/filepath"
 	"runtime/debug"
 	"sort"
 	"strings"
 
+	"github.com/bamiaux/rez"
 	"github.com/geek1011/BookBrowser/formats"
 	"github.com/geek1011/BookBrowser/models"
 	zglob "github.com/mattn/go-zglob"
-	"github.com/nfnt/resize"
 )
 
 // BookList represents a list of Books
@@ -92,8 +94,43 @@ func NewBookListFromDir(dir, coverOutDir string, verbose, nocovers bool) (*BookL
 					continue
 				}
 
-				// Better quality: thumb := resize.Resize(200, 0, img, resize.Lanczos2)
-				thumb := resize.Resize(200, 0, cover, resize.Bicubic)
+				coverBounds := cover.Bounds()
+				coverWidth := coverBounds.Dx()
+				coverHeight := coverBounds.Dy()
+
+				if coverWidth <= 200 {
+					continue
+				}
+
+				// Scale to fit in 200x900
+				scale := math.Min(float64(200.0/float64(coverWidth)), float64(900.0/float64(coverHeight)))
+
+				// Scale and round down
+				coverWidth = int(float64(coverWidth) * scale)
+				coverHeight = int(float64(coverHeight) * scale)
+
+				r := image.Rect(0, 0, coverWidth, coverHeight)
+				var thumb image.Image
+				switch t := cover.(type) {
+				case *image.YCbCr:
+					thumb = image.NewYCbCr(r, t.SubsampleRatio)
+				case *image.RGBA:
+					thumb = image.NewRGBA(r)
+				case *image.NRGBA:
+					thumb = image.NewNRGBA(r)
+				case *image.Gray:
+					thumb = image.NewGray(r)
+				default:
+					continue
+				}
+
+				// rez.NewLanczosFilter(2) gives better quality, but is slower
+				err = rez.Convert(thumb, cover, rez.NewBicubicFilter())
+				if err != nil {
+					fmt.Println(coverWidth, coverHeight, scale, err)
+					continue
+				}
+
 				thumbFile, err := os.Create(thumbPath)
 				if err != nil {
 					continue
diff --git a/vendor/github.com/bamiaux/rez/LICENSE b/vendor/github.com/bamiaux/rez/LICENSE
new file mode 100644
index 00000000..f346f200
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/LICENSE
@@ -0,0 +1,20 @@
+The MIT License (MIT)
+
+Copyright (c) 2014 Benoît Amiaux
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/vendor/github.com/bamiaux/rez/README.md b/vendor/github.com/bamiaux/rez/README.md
new file mode 100644
index 00000000..534cf1d3
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/README.md
@@ -0,0 +1,49 @@
+# rez [![GoDoc](https://godoc.org/github.com/bamiaux/rez/web?status.png)](https://godoc.org/github.com/bamiaux/rez) [![Build Status](https://travis-ci.org/bamiaux/rez.png)](https://travis-ci.org/bamiaux/rez)
+Package rez provides image resizing in pure Go and SIMD.
+
+Download:
+```shell
+go get github.com/bamiaux/rez
+```
+
+
+Full documentation at http://godoc.org/github.com/bamiaux/rez
+
+* * *
+Package rez provides image resizing in pure Go and SIMD.
+
+Featuring:
+
+```
+- YCbCr, RGBA, NRGBA & Gray resizes
+- YCbCr Chroma subsample ratio conversions
+- Optional interlaced-aware resizes
+- Parallel resizes
+- SIMD optimisations on AMD64
+```
+
+The easiest way to use it is:
+
+```
+err := Convert(output, input, NewBicubicFilter())
+```
+
+However, if you plan to convert video, where resize parameters are the same for
+multiple images, the best way is:
+
+```
+cfg, err := PrepareConversion(output, input)
+converter, err := NewConverter(cfg, NewBicubicFilter())
+for i := 0; i < N; i++ {
+    err := converter.Convert(output[i], input[i])
+}
+```
+
+Note that by default, images are resized in parallel with GOMAXPROCS slices.
+Best performance is obtained when GOMAXPROCS is at least equal to your CPU
+count.
+
+
+
+* * *
+Automatically generated by [autoreadme](https://github.com/jimmyfrasche/autoreadme) on 2014.11.25
diff --git a/vendor/github.com/bamiaux/rez/README.md.template b/vendor/github.com/bamiaux/rez/README.md.template
new file mode 100644
index 00000000..1821e658
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/README.md.template
@@ -0,0 +1,19 @@
+#{{.Name}} [![GoDoc](https://godoc.org/{{.Import}}/web?status.png)](https://godoc.org/{{.Import}}) [![Build Status](https://travis-ci.org/bamiaux/rez.png)](https://travis-ci.org/bamiaux/rez)
+{{.Synopsis}}
+
+Download:
+```shell
+go get {{.Import}}
+```
+{{if .Library}}
+
+Full documentation at http://godoc.org/{{.Import}}
+{{end}}
+* * *
+{{.Doc}}
+{{if .Bugs}}
+#Bugs
+{{range .Bugs}}* {{.}}{{end}}
+{{end}}
+* * *
+Automatically generated by [autoreadme](https://github.com/jimmyfrasche/autoreadme) on {{.Today}}
diff --git a/vendor/github.com/bamiaux/rez/filters.go b/vendor/github.com/bamiaux/rez/filters.go
new file mode 100644
index 00000000..fa724600
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/filters.go
@@ -0,0 +1,103 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+import (
+	"math"
+)
+
+// Filter is an interpolation filter interface.
+// It is used to compute weights for every input pixel.
+type Filter interface {
+	Taps() int              // tap count: 1 for bilinear, 2 for bicubic, int(alpha) for lanczos
+	Name() string           // short lowercase identifier, e.g. "bicubic"
+	Get(dx float64) float64 // weight at distance dx; the filters in this file return 0 beyond their support
+}
+
+type bilinear struct{} // stateless Filter whose weight decreases linearly with distance
+
+func (bilinear) Taps() int    { return 1 }
+func (bilinear) Name() string { return "bilinear" }
+
+func (bilinear) Get(x float64) float64 { // weight is 1-x below 1 and 0 from 1 onwards
+	if x < 1 {
+		return 1 - x // no abs() is taken: presumably callers pass x >= 0 — TODO confirm
+	}
+	return 0
+}
+
+// NewBilinearFilter returns the bilinear Filter (1 tap, weight 1-x for x < 1).
+func NewBilinearFilter() Filter {
+	return bilinear{}
+}
+
+type bicubic struct {
+	a, b, c, d, e, f, g float64 // polynomial coefficients, filled in by NewCustomBicubicFilter
+}
+
+func (bicubic) Taps() int {
+	return 2
+}
+
+func (bicubic) Name() string {
+	return "bicubic"
+}
+
+func (f *bicubic) Get(x float64) float64 { // piecewise cubic: one polynomial for x < 1, another for 1 <= x < 2
+	if x < 1 {
+		return f.a + x*x*(f.b+x*f.c) // a + b*x^2 + c*x^3
+	} else if x < 2 {
+		return f.d + x*(f.e+x*(f.f+x*f.g)) // d + e*x + f*x^2 + g*x^3
+	}
+	return 0 // no contribution for x >= 2
+}
+
+// NewCustomBicubicFilter returns a bicubic filter where <b> and <c> can be
+// customized; the seven polynomial coefficients used by Get are derived from them here.
+// For example, the Mitchell-Netravali bicubic filter is b = c = 1/3.
+func NewCustomBicubicFilter(b, c float64) Filter {
+	f := &bicubic{}
+	f.a = 1 - b/3 // a, b, c: polynomial used by Get for x < 1
+	f.b = -3 + 2*b + c
+	f.c = 2 - 3*b/2 - c
+	f.d = 4*b/3 + 4*c // d, e, f, g: polynomial used by Get for 1 <= x < 2
+	f.e = -2*b - 8*c
+	f.f = b + 5*c
+	f.g = -b/6 - c
+	return f
+}
+
+// NewBicubicFilter returns a classic bicubic filter, i.e. NewCustomBicubicFilter with b = 0 and c = 0.5.
+func NewBicubicFilter() Filter {
+	return NewCustomBicubicFilter(0, 0.5)
+}
+
+type lanczos struct {
+	alpha float64 // filter size: Get returns 0 for x > alpha, Taps returns int(alpha)
+}
+
+func (f lanczos) Taps() int {
+	return int(f.alpha)
+}
+
+func (lanczos) Name() string {
+	return "lanczos"
+}
+
+func (f lanczos) Get(x float64) float64 { // sin(pi*x)/(pi*x) * sin(pi*x/alpha)/(pi*x/alpha) for 0 < x <= alpha
+	if x > f.alpha {
+		return 0
+	} else if x == 0 {
+		return 1 // special-cased because the formula below would divide 0 by 0
+	}
+	b := x * math.Pi
+	c := b / f.alpha
+	return math.Sin(b) * math.Sin(c) / (b * c)
+}
+
+// NewLanczosFilter returns a lanczos filter where <alpha> is the filter size (and the value reported by Taps).
+func NewLanczosFilter(alpha int) Filter {
+	return lanczos{alpha: float64(alpha)}
+}
diff --git a/vendor/github.com/bamiaux/rez/fixedscalers.go b/vendor/github.com/bamiaux/rez/fixedscalers.go
new file mode 100644
index 00000000..840e97da
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/fixedscalers.go
@@ -0,0 +1,277 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+// This file is auto-generated - do not modify
+
+func h8scale2Go(dst, src []byte, cof, off []int16, // horizontal pass, 2 taps per output pixel; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof // the same coefficient table is replayed for every row
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] { // xoff: index in s of the first tap of output pixel x
+			pix := int(s[xoff+0])*int(c[0]) +
+				int(s[xoff+1])*int(c[1])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+			c = c[2:]
+		}
+		di += dp // dp: distance between dst rows
+		si += sp // sp: distance between src rows
+	}
+}
+
+func v8scale2Go(dst, src []byte, cof, off []int16, // vertical pass, 2 source rows per output row; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):] // re-slices src itself, so each yoff is relative to the previous output row
+		d := dst[di:]
+		for x := range d[:width] {
+			pix := int(src[sp*0+x])*int(cof[0]) +
+				int(src[sp*1+x])*int(cof[1])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+		}
+		cof = cof[2:] // one group of 2 coefficients per output row
+		di += dp
+	}
+}
+
+func h8scale4Go(dst, src []byte, cof, off []int16, // horizontal pass, 4 taps per output pixel; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof // the same coefficient table is replayed for every row
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] { // xoff: index in s of the first tap of output pixel x
+			pix := int(s[xoff+0])*int(c[0]) +
+				int(s[xoff+1])*int(c[1]) +
+				int(s[xoff+2])*int(c[2]) +
+				int(s[xoff+3])*int(c[3])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+			c = c[4:]
+		}
+		di += dp // dp: distance between dst rows
+		si += sp // sp: distance between src rows
+	}
+}
+
+func v8scale4Go(dst, src []byte, cof, off []int16, // vertical pass, 4 source rows per output row; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):] // re-slices src itself, so each yoff is relative to the previous output row
+		d := dst[di:]
+		for x := range d[:width] {
+			pix := int(src[sp*0+x])*int(cof[0]) +
+				int(src[sp*1+x])*int(cof[1]) +
+				int(src[sp*2+x])*int(cof[2]) +
+				int(src[sp*3+x])*int(cof[3])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+		}
+		cof = cof[4:] // one group of 4 coefficients per output row
+		di += dp
+	}
+}
+
+func h8scale6Go(dst, src []byte, cof, off []int16, // horizontal pass, 6 taps per output pixel; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof // the same coefficient table is replayed for every row
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] { // xoff: index in s of the first tap of output pixel x
+			pix := int(s[xoff+0])*int(c[0]) +
+				int(s[xoff+1])*int(c[1]) +
+				int(s[xoff+2])*int(c[2]) +
+				int(s[xoff+3])*int(c[3]) +
+				int(s[xoff+4])*int(c[4]) +
+				int(s[xoff+5])*int(c[5])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+			c = c[6:]
+		}
+		di += dp // dp: distance between dst rows
+		si += sp // sp: distance between src rows
+	}
+}
+
+func v8scale6Go(dst, src []byte, cof, off []int16, // vertical pass, 6 source rows per output row; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):] // re-slices src itself, so each yoff is relative to the previous output row
+		d := dst[di:]
+		for x := range d[:width] {
+			pix := int(src[sp*0+x])*int(cof[0]) +
+				int(src[sp*1+x])*int(cof[1]) +
+				int(src[sp*2+x])*int(cof[2]) +
+				int(src[sp*3+x])*int(cof[3]) +
+				int(src[sp*4+x])*int(cof[4]) +
+				int(src[sp*5+x])*int(cof[5])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+		}
+		cof = cof[6:] // one group of 6 coefficients per output row
+		di += dp
+	}
+}
+
+func h8scale8Go(dst, src []byte, cof, off []int16, // horizontal pass, 8 taps per output pixel; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof // the same coefficient table is replayed for every row
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] { // xoff: index in s of the first tap of output pixel x
+			pix := int(s[xoff+0])*int(c[0]) +
+				int(s[xoff+1])*int(c[1]) +
+				int(s[xoff+2])*int(c[2]) +
+				int(s[xoff+3])*int(c[3]) +
+				int(s[xoff+4])*int(c[4]) +
+				int(s[xoff+5])*int(c[5]) +
+				int(s[xoff+6])*int(c[6]) +
+				int(s[xoff+7])*int(c[7])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+			c = c[8:]
+		}
+		di += dp // dp: distance between dst rows
+		si += sp // sp: distance between src rows
+	}
+}
+
+func v8scale8Go(dst, src []byte, cof, off []int16, // vertical pass, 8 source rows per output row; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):] // re-slices src itself, so each yoff is relative to the previous output row
+		d := dst[di:]
+		for x := range d[:width] {
+			pix := int(src[sp*0+x])*int(cof[0]) +
+				int(src[sp*1+x])*int(cof[1]) +
+				int(src[sp*2+x])*int(cof[2]) +
+				int(src[sp*3+x])*int(cof[3]) +
+				int(src[sp*4+x])*int(cof[4]) +
+				int(src[sp*5+x])*int(cof[5]) +
+				int(src[sp*6+x])*int(cof[6]) +
+				int(src[sp*7+x])*int(cof[7])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+		}
+		cof = cof[8:] // one group of 8 coefficients per output row
+		di += dp
+	}
+}
+
+func h8scale10Go(dst, src []byte, cof, off []int16, // horizontal pass, 10 taps per output pixel; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof // the same coefficient table is replayed for every row
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] { // xoff: index in s of the first tap of output pixel x
+			pix := int(s[xoff+0])*int(c[0]) +
+				int(s[xoff+1])*int(c[1]) +
+				int(s[xoff+2])*int(c[2]) +
+				int(s[xoff+3])*int(c[3]) +
+				int(s[xoff+4])*int(c[4]) +
+				int(s[xoff+5])*int(c[5]) +
+				int(s[xoff+6])*int(c[6]) +
+				int(s[xoff+7])*int(c[7]) +
+				int(s[xoff+8])*int(c[8]) +
+				int(s[xoff+9])*int(c[9])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+			c = c[10:]
+		}
+		di += dp // dp: distance between dst rows
+		si += sp // sp: distance between src rows
+	}
+}
+
+func v8scale10Go(dst, src []byte, cof, off []int16, // vertical pass, 10 source rows per output row; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):] // re-slices src itself, so each yoff is relative to the previous output row
+		d := dst[di:]
+		for x := range d[:width] {
+			pix := int(src[sp*0+x])*int(cof[0]) +
+				int(src[sp*1+x])*int(cof[1]) +
+				int(src[sp*2+x])*int(cof[2]) +
+				int(src[sp*3+x])*int(cof[3]) +
+				int(src[sp*4+x])*int(cof[4]) +
+				int(src[sp*5+x])*int(cof[5]) +
+				int(src[sp*6+x])*int(cof[6]) +
+				int(src[sp*7+x])*int(cof[7]) +
+				int(src[sp*8+x])*int(cof[8]) +
+				int(src[sp*9+x])*int(cof[9])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+		}
+		cof = cof[10:] // one group of 10 coefficients per output row
+		di += dp
+	}
+}
+
+func h8scale12Go(dst, src []byte, cof, off []int16, // horizontal pass, 12 taps per output pixel; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof // the same coefficient table is replayed for every row
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] { // xoff: index in s of the first tap of output pixel x
+			pix := int(s[xoff+0])*int(c[0]) +
+				int(s[xoff+1])*int(c[1]) +
+				int(s[xoff+2])*int(c[2]) +
+				int(s[xoff+3])*int(c[3]) +
+				int(s[xoff+4])*int(c[4]) +
+				int(s[xoff+5])*int(c[5]) +
+				int(s[xoff+6])*int(c[6]) +
+				int(s[xoff+7])*int(c[7]) +
+				int(s[xoff+8])*int(c[8]) +
+				int(s[xoff+9])*int(c[9]) +
+				int(s[xoff+10])*int(c[10]) +
+				int(s[xoff+11])*int(c[11])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+			c = c[12:]
+		}
+		di += dp // dp: distance between dst rows
+		si += sp // sp: distance between src rows
+	}
+}
+
+func v8scale12Go(dst, src []byte, cof, off []int16, // vertical pass, 12 source rows per output row; taps is not read here
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):] // re-slices src itself, so each yoff is relative to the previous output row
+		d := dst[di:]
+		for x := range d[:width] {
+			pix := int(src[sp*0+x])*int(cof[0]) +
+				int(src[sp*1+x])*int(cof[1]) +
+				int(src[sp*2+x])*int(cof[2]) +
+				int(src[sp*3+x])*int(cof[3]) +
+				int(src[sp*4+x])*int(cof[4]) +
+				int(src[sp*5+x])*int(cof[5]) +
+				int(src[sp*6+x])*int(cof[6]) +
+				int(src[sp*7+x])*int(cof[7]) +
+				int(src[sp*8+x])*int(cof[8]) +
+				int(src[sp*9+x])*int(cof[9]) +
+				int(src[sp*10+x])*int(cof[10]) +
+				int(src[sp*11+x])*int(cof[11])
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits) // round: add half of 1<<Bits, then shift; u8 and Bits are defined elsewhere
+		}
+		cof = cof[12:] // one group of 12 coefficients per output row
+		di += dp
+	}
+}
diff --git a/vendor/github.com/bamiaux/rez/fixedscalers.go.input b/vendor/github.com/bamiaux/rez/fixedscalers.go.input
new file mode 100644
index 00000000..e88ca533
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/fixedscalers.go.input
@@ -0,0 +1,10 @@
+{
+    "taps": [
+        [0,0],
+        [0,0,0,0],
+        [0,0,0,0,0,0],
+        [0,0,0,0,0,0,0,0],
+        [0,0,0,0,0,0,0,0,0,0],
+        [0,0,0,0,0,0,0,0,0,0,0,0]
+    ]
+}
diff --git a/vendor/github.com/bamiaux/rez/fixedscalers.go.template b/vendor/github.com/bamiaux/rez/fixedscalers.go.template
new file mode 100644
index 00000000..4dbcd42f
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/fixedscalers.go.template
@@ -0,0 +1,45 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+// This file is auto-generated - do not modify
+
+{{range $_, $tab := .taps}}
+{{$n := len $tab}}
+func h8scale{{$n}}Go(dst, src []byte, cof, off []int16,
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] {
+			pix :={{range $i, $_ := $tab}}{{if gt $i 0}} +
+			{{end}}int(s[xoff+{{$i}}]) * int(c[{{$i}}]){{end}}
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits)
+			c = c[{{$n}}:]
+		}
+		di += dp
+		si += sp
+	}
+}
+
+func v8scale{{$n}}Go(dst, src []byte, cof, off []int16,
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):]
+		d := dst[di:]
+		for x := range d[:width] {
+			pix:={{range $i, $_ := $tab}}{{if gt $i 0}} +
+			{{end}}int(src[sp*{{$i}}+x]) * int(cof[{{$i}}]){{end}}
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits)
+		}
+		cof = cof[{{$n}}:]
+		di += dp
+	}
+}
+{{end}}
diff --git a/vendor/github.com/bamiaux/rez/hscalers_amd64.s b/vendor/github.com/bamiaux/rez/hscalers_amd64.s
new file mode 100644
index 00000000..0b018eaf
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/hscalers_amd64.s
@@ -0,0 +1,1440 @@
+// Copyright 2014 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+// This file is auto-generated - do not modify
+
+DATA	zero_0<>+0x00(SB)/8, $0x0000000000000000 // zero_0: 16 zero bytes, the CMOVQLT source that clamps negative sums to 0
+DATA	zero_0<>+0x08(SB)/8, $0x0000000000000000
+GLOBL	zero_0<>(SB), 8, $16
+DATA	hbits_1<>+0x00(SB)/8, $0x0000200000002000 // hbits_1: 0x2000 (1<<13) in every 32-bit lane, the rounding term added before the shift by 14
+DATA	hbits_1<>+0x08(SB)/8, $0x0000200000002000
+GLOBL	hbits_1<>(SB), 8, $16
+DATA	u8max_2<>+0x00(SB)/8, $0x00000000000000FF // u8max_2: 255 in every 64-bit lane, the upper clamp of the scalar loops
+DATA	u8max_2<>+0x08(SB)/8, $0x00000000000000FF
+GLOBL	u8max_2<>(SB), 8, $16
+
+TEXT ·h8scale2Amd64(SB),4,$40-136 // horizontal 2-tap scaler; 40-byte frame, 136 bytes of arguments
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX // BX = dp - width: bytes to skip after each dst row
+		SHRQ	$4, CX // CX = width / 16: SIMD iterations per row
+		ANDQ	$15, DX // DX = width % 16: pixels left for the scalar loop
+		MOVQ	BX, dstoff+-32(SP)
+		MOVQ	CX, simdroll+-8(SP)
+		MOVQ	DX, asmroll+-16(SP)
+		MOVQ	src+24(FP), AX
+		MOVQ	AX, srcref+-24(SP) // srcref remembers the start of the current src row
+		MOVQ	taps+96(FP), DX
+		SUBQ	$2, DX // NOTE(review): DX is overwritten before it is read again, so taps-2 looks unused here
+		PXOR	X15, X15 // X15 = 0, used to widen bytes to words
+		MOVO	hbits_1<>(SB), X14 // X14 = rounding term (1<<13 per 32-bit lane)
+		MOVQ	src+24(FP), SI
+		MOVQ	dst+0(FP), DI
+yloop_0: // one pass per row; the row counter is only tested at the bottom, so height >= 1 is assumed — TODO confirm
+		MOVQ	off+72(FP), BX // restart the offset table for this row
+		MOVQ	cof+48(FP), BP // restart the coefficient table for this row
+		MOVQ	simdroll+-8(SP), CX
+		ORQ	CX, CX // sets ZF when there is no full group of 16 pixels
+		JE	nosimdloop_3
+simdloop_1:
+		MOVWQSX	(BX), R8 // sign-extend four int16 offsets
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X0 // each PINSRW loads the 2 tap bytes of one output pixel
+		PINSRW	$1, (SI)(R9*1), X0
+		PINSRW	$2, (SI)(R10*1), X0
+		PINSRW	$3, (SI)(R11*1), X0
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X1
+		PINSRW	$1, (SI)(R9*1), X1
+		PINSRW	$2, (SI)(R10*1), X1
+		PINSRW	$3, (SI)(R11*1), X1
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X2
+		PINSRW	$1, (SI)(R9*1), X2
+		PINSRW	$2, (SI)(R10*1), X2
+		PINSRW	$3, (SI)(R11*1), X2
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X3
+		PINSRW	$1, (SI)(R9*1), X3
+		PINSRW	$2, (SI)(R10*1), X3
+		PINSRW	$3, (SI)(R11*1), X3
+		ADDQ	$32, BX // 16 int16 offsets consumed
+		PUNPCKLBW	X15, X0 // widen the low 8 bytes to 8 words
+		PMADDWL	(BP), X0 // multiply by the coefficients and add pairs: one 32-bit sum per pixel
+		PUNPCKLBW	X15, X1
+		PMADDWL	16(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	32(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	48(BP), X3
+		ADDQ	$64, BP // 16 pixels * 2 coefficients * 2 bytes
+		PADDL	X14, X0 // add the rounding term
+		PADDL	X14, X1
+		PADDL	X14, X2
+		PADDL	X14, X3
+		PSRAL	$14, X0 // arithmetic shift: drop 14 fractional bits
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0 // 32-bit -> 16-bit with signed saturation
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0 // 16-bit -> 8-bit with unsigned saturation (clamps to 0..255)
+		MOVOU	X0, (DI) // unaligned store of 16 output pixels
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	simdloop_1
+nosimdloop_3:
+		MOVQ	asmroll+-16(SP), CX
+		ORQ	CX, CX
+		JE	end_4
+asmloop_2: // scalar loop: one output pixel per iteration
+		MOVWQSX	(BX), DX // DX = offset of the pixel's first tap
+		MOVBQZX	(SI)(DX*1), AX // AX = source byte, zero-extended
+		MOVWQSX	(BP), DX // DX = coefficient, sign-extended
+		IMULQ	DX // AX = AX * DX (one-operand multiply, also writes DX)
+		MOVQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	$4, BP // 2 coefficients consumed
+		ADDQ	sum+-40(SP), AX
+		ADDQ	$8192, AX // rounding term, 1<<13
+		CMOVQLT	zero_0<>(SB), AX // negative sum -> 0
+		SHRQ	$14, AX
+		CMPQ	u8max_2<>(SB), AX
+		CMOVQLT	u8max_2<>(SB), AX // value above 255 -> 255
+		ADDQ	$2, BX
+		MOVB	AL, (DI)
+		ADDQ	$1, DI
+		SUBQ	$1, CX
+		JNE	asmloop_2
+end_4:
+		MOVQ	srcref+-24(SP), SI
+		ADDQ	dstoff+-32(SP), DI // skip the rest of the dst row
+		ADDQ	sp+128(FP), SI // advance to the next src row
+		MOVQ	SI, srcref+-24(SP)
+		SUBQ	$1, height+112(FP) // the height argument doubles as the row counter
+		JNE	yloop_0
+		RET
+
+TEXT ·h8scale4Amd64(SB),4,$40-136 // horizontal 4-tap scaler; 40-byte frame, 136 bytes of arguments
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX // BX = dp - width: bytes to skip after each dst row
+		SHRQ	$4, CX // CX = width / 16: SIMD iterations per row
+		ANDQ	$15, DX // DX = width % 16: pixels left for the scalar loop
+		MOVQ	BX, dstoff+-32(SP)
+		MOVQ	CX, simdroll+-8(SP)
+		MOVQ	DX, asmroll+-16(SP)
+		MOVQ	src+24(FP), AX
+		MOVQ	AX, srcref+-24(SP) // srcref remembers the start of the current src row
+		MOVQ	taps+96(FP), DX
+		SUBQ	$2, DX // NOTE(review): DX is overwritten before it is read again, so taps-2 looks unused here
+		PXOR	X15, X15 // X15 = 0, used to widen bytes to words
+		MOVO	hbits_1<>(SB), X14 // X14 = rounding term (1<<13 per 32-bit lane)
+		MOVQ	src+24(FP), SI
+		MOVQ	dst+0(FP), DI
+yloop_5: // one pass per row; the row counter is only tested at the bottom, so height >= 1 is assumed — TODO confirm
+		MOVQ	off+72(FP), BX // restart the offset table for this row
+		MOVQ	cof+48(FP), BP // restart the coefficient table for this row
+		MOVQ	simdroll+-8(SP), CX
+		ORQ	CX, CX // sets ZF when there is no full group of 16 pixels
+		JE	nosimdloop_8
+simdloop_6:
+		MOVWQSX	(BX), AX
+		MOVWQSX	2(BX), DX
+		MOVL	(SI)(AX*1), X0 // load the 4 tap bytes of one output pixel
+		MOVL	(SI)(DX*1), X8
+		MOVWQSX	4(BX), AX
+		MOVWQSX	6(BX), DX
+		MOVL	(SI)(AX*1), X1
+		MOVL	(SI)(DX*1), X9
+		PUNPCKLLQ	X8, X0 // low 8 bytes of X0 now hold the taps of two adjacent pixels
+		PUNPCKLLQ	X9, X1
+		MOVWQSX	8(BX), AX
+		MOVWQSX	10(BX), DX
+		MOVL	(SI)(AX*1), X2
+		MOVL	(SI)(DX*1), X10
+		MOVWQSX	12(BX), AX
+		MOVWQSX	14(BX), DX
+		MOVL	(SI)(AX*1), X3
+		MOVL	(SI)(DX*1), X11
+		PUNPCKLLQ	X10, X2
+		PUNPCKLLQ	X11, X3
+		MOVWQSX	16(BX), AX
+		MOVWQSX	18(BX), DX
+		MOVL	(SI)(AX*1), X4
+		MOVL	(SI)(DX*1), X12
+		MOVWQSX	20(BX), AX
+		MOVWQSX	22(BX), DX
+		MOVL	(SI)(AX*1), X5
+		MOVL	(SI)(DX*1), X13
+		PUNPCKLLQ	X12, X4
+		PUNPCKLLQ	X13, X5
+		MOVWQSX	24(BX), AX
+		MOVWQSX	26(BX), DX
+		MOVL	(SI)(AX*1), X6
+		MOVL	(SI)(DX*1), X8
+		MOVWQSX	28(BX), AX
+		MOVWQSX	30(BX), DX
+		MOVL	(SI)(AX*1), X7
+		MOVL	(SI)(DX*1), X9
+		PUNPCKLLQ	X8, X6
+		PUNPCKLLQ	X9, X7
+		ADDQ	$32, BX // 16 int16 offsets consumed
+		PUNPCKLBW	X15, X0 // widen the low 8 bytes to 8 words
+		PMADDWL	(BP), X0 // two partial sums (taps 0-1 and taps 2-3) for each of 2 pixels
+		PUNPCKLBW	X15, X1
+		PMADDWL	16(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	32(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	48(BP), X3
+		MOVO	X0, X10
+		MOVO	X2, X11
+		SHUFPS	$221, X1, X10 // gather the odd 32-bit lanes of both registers
+		SHUFPS	$221, X3, X11
+		SHUFPS	$136, X1, X0 // gather the even 32-bit lanes of both registers
+		SHUFPS	$136, X3, X2
+		PADDL	X10, X0 // even + odd partial sums = complete 4-tap sums of 4 pixels
+		PADDL	X11, X2
+		PUNPCKLBW	X15, X4
+		PMADDWL	64(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	80(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	96(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	112(BP), X7
+		MOVO	X4, X12
+		MOVO	X6, X13
+		SHUFPS	$221, X5, X12
+		SHUFPS	$221, X7, X13
+		SHUFPS	$136, X5, X4
+		SHUFPS	$136, X7, X6
+		PADDL	X12, X4
+		PADDL	X13, X6
+		ADDQ	$128, BP // 16 pixels * 4 coefficients * 2 bytes
+		PADDL	X14, X0 // add the rounding term
+		PADDL	X14, X2
+		PADDL	X14, X4
+		PADDL	X14, X6
+		PSRAL	$14, X0 // arithmetic shift: drop 14 fractional bits
+		PSRAL	$14, X2
+		PSRAL	$14, X4
+		PSRAL	$14, X6
+		PACKSSLW	X2, X0 // 32-bit -> 16-bit with signed saturation
+		PACKSSLW	X6, X4
+		PACKUSWB	X4, X0 // 16-bit -> 8-bit with unsigned saturation (clamps to 0..255)
+		MOVOU	X0, (DI) // unaligned store of 16 output pixels
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	simdloop_6
+nosimdloop_8:
+		MOVQ	asmroll+-16(SP), CX
+		ORQ	CX, CX
+		JE	end_9
+asmloop_7: // scalar loop: one output pixel per iteration
+		MOVWQSX	(BX), DX // DX = offset of the pixel's first tap
+		MOVBQZX	(SI)(DX*1), AX // AX = source byte, zero-extended
+		MOVWQSX	(BP), DX // DX = coefficient, sign-extended
+		IMULQ	DX // AX = AX * DX (one-operand multiply, also writes DX)
+		MOVQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP) // accumulate the tap product in the stack slot
+		MOVWQSX	(BX), DX
+		MOVBQZX	2(SI)(DX*1), AX
+		MOVWQSX	4(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	3(SI)(DX*1), AX
+		MOVWQSX	6(BP), DX
+		IMULQ	DX
+		ADDQ	$8, BP // 4 coefficients consumed
+		ADDQ	sum+-40(SP), AX
+		ADDQ	$8192, AX // rounding term, 1<<13
+		CMOVQLT	zero_0<>(SB), AX // negative sum -> 0
+		SHRQ	$14, AX
+		CMPQ	u8max_2<>(SB), AX
+		CMOVQLT	u8max_2<>(SB), AX // value above 255 -> 255
+		ADDQ	$2, BX
+		MOVB	AL, (DI)
+		ADDQ	$1, DI
+		SUBQ	$1, CX
+		JNE	asmloop_7
+end_9:
+		MOVQ	srcref+-24(SP), SI
+		ADDQ	dstoff+-32(SP), DI // skip the rest of the dst row
+		ADDQ	sp+128(FP), SI // advance to the next src row
+		MOVQ	SI, srcref+-24(SP)
+		SUBQ	$1, height+112(FP) // the height argument doubles as the row counter
+		JNE	yloop_5
+		RET
+
+TEXT ·h8scale8Amd64(SB),4,$40-136 // horizontal 8-tap scaler; 40-byte frame, 136 bytes of arguments
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX // BX = dp - width: bytes to skip after each dst row
+		SHRQ	$4, CX // CX = width / 16: SIMD iterations per row
+		ANDQ	$15, DX // DX = width % 16: pixels left for the scalar loop
+		MOVQ	BX, dstoff+-32(SP)
+		MOVQ	CX, simdroll+-8(SP)
+		MOVQ	DX, asmroll+-16(SP)
+		MOVQ	src+24(FP), AX
+		MOVQ	AX, srcref+-24(SP) // srcref remembers the start of the current src row
+		MOVQ	taps+96(FP), DX
+		SUBQ	$2, DX // NOTE(review): DX is overwritten before it is read again, so taps-2 looks unused here
+		PXOR	X15, X15 // X15 = 0, used to widen bytes to words
+		MOVO	hbits_1<>(SB), X14 // X14 = rounding term (1<<13 per 32-bit lane)
+		MOVQ	src+24(FP), SI
+		MOVQ	dst+0(FP), DI
+yloop_10: // one pass per row; the row counter is only tested at the bottom, so height >= 1 is assumed — TODO confirm
+		MOVQ	off+72(FP), BX // restart the offset table for this row
+		MOVQ	cof+48(FP), BP // restart the coefficient table for this row
+		MOVQ	simdroll+-8(SP), CX
+		ORQ	CX, CX // sets ZF when there is no full group of 16 pixels
+		JE	nosimdloop_13
+simdloop_11:
+		MOVWQSX	(BX), AX
+		MOVQ	(SI)(AX*1), X0 // load the 8 tap bytes of one output pixel
+		MOVWQSX	2(BX), DX
+		MOVQ	(SI)(DX*1), X1
+		MOVWQSX	4(BX), AX
+		MOVQ	(SI)(AX*1), X2
+		MOVWQSX	6(BX), DX
+		MOVQ	(SI)(DX*1), X3
+		MOVWQSX	8(BX), AX
+		MOVQ	(SI)(AX*1), X4
+		MOVWQSX	10(BX), DX
+		MOVQ	(SI)(DX*1), X5
+		MOVWQSX	12(BX), AX
+		MOVQ	(SI)(AX*1), X6
+		MOVWQSX	14(BX), DX
+		MOVQ	(SI)(DX*1), X7
+		MOVWQSX	16(BX), AX
+		MOVQ	(SI)(AX*1), X8
+		MOVWQSX	18(BX), DX
+		MOVQ	(SI)(DX*1), X9
+		MOVWQSX	20(BX), AX
+		MOVQ	(SI)(AX*1), X10
+		MOVWQSX	22(BX), DX
+		MOVQ	(SI)(DX*1), X11
+		PUNPCKLBW	X15, X0 // widen the 8 bytes to 8 words
+		PMADDWL	(BP), X0 // 4 partial sums (one per pair of taps) for this pixel
+		PUNPCKLBW	X15, X1
+		PMADDWL	16(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	32(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	48(BP), X3
+		MOVO	X0, X12
+		MOVO	X2, X13
+		PUNPCKLQDQ	X1, X0
+		PUNPCKHQDQ	X1, X12
+		PADDL	X12, X0 // X0 = 2 partial sums for each of pixels 0 and 1
+		PUNPCKLQDQ	X3, X2
+		PUNPCKHQDQ	X3, X13
+		PADDL	X13, X2
+		MOVO	X0, X12
+		SHUFPS	$136, X2, X0
+		SHUFPS	$221, X2, X12
+		PADDL	X12, X0 // X0 = complete 8-tap sums of pixels 0..3
+		PUNPCKLBW	X15, X4
+		PMADDWL	64(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	80(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	96(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	112(BP), X7
+		MOVO	X4, X1
+		MOVO	X6, X2
+		PUNPCKLQDQ	X5, X4
+		PUNPCKHQDQ	X5, X1
+		PADDL	X1, X4
+		PUNPCKLQDQ	X7, X6
+		PUNPCKHQDQ	X7, X2
+		PADDL	X2, X6
+		MOVO	X4, X1
+		SHUFPS	$136, X6, X4
+		SHUFPS	$221, X6, X1
+		PADDL	X1, X4 // X4 = complete sums of pixels 4..7
+		MOVWQSX	24(BX), AX
+		MOVQ	(SI)(AX*1), X1 // pixels 12..15 are loaded late, into registers freed above
+		MOVWQSX	26(BX), DX
+		MOVQ	(SI)(DX*1), X2
+		MOVWQSX	28(BX), AX
+		MOVQ	(SI)(AX*1), X3
+		MOVWQSX	30(BX), DX
+		MOVQ	(SI)(DX*1), X5
+		ADDQ	$32, BX // 16 int16 offsets consumed
+		PUNPCKLBW	X15, X8
+		PMADDWL	128(BP), X8
+		PUNPCKLBW	X15, X9
+		PMADDWL	144(BP), X9
+		PUNPCKLBW	X15, X10
+		PMADDWL	160(BP), X10
+		PUNPCKLBW	X15, X11
+		PMADDWL	176(BP), X11
+		MOVO	X8, X12
+		MOVO	X10, X13
+		PUNPCKLQDQ	X9, X8
+		PUNPCKHQDQ	X9, X12
+		PADDL	X12, X8
+		PUNPCKLQDQ	X11, X10
+		PUNPCKHQDQ	X11, X13
+		PADDL	X13, X10
+		MOVO	X8, X12
+		SHUFPS	$136, X10, X8
+		SHUFPS	$221, X10, X12
+		PADDL	X12, X8 // X8 = complete sums of pixels 8..11
+		PUNPCKLBW	X15, X1
+		PMADDWL	192(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	208(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	224(BP), X3
+		PUNPCKLBW	X15, X5
+		PMADDWL	240(BP), X5
+		MOVO	X1, X10
+		MOVO	X3, X11
+		PUNPCKLQDQ	X2, X1
+		PUNPCKHQDQ	X2, X10
+		PADDL	X10, X1
+		PUNPCKLQDQ	X5, X3
+		PUNPCKHQDQ	X5, X11
+		PADDL	X11, X3
+		MOVO	X1, X10
+		SHUFPS	$136, X3, X1
+		SHUFPS	$221, X3, X10
+		PADDL	X10, X1 // X1 = complete sums of pixels 12..15
+		ADDQ	$256, BP // 16 pixels * 8 coefficients * 2 bytes
+		PADDL	X14, X0 // add the rounding term
+		PADDL	X14, X4
+		PADDL	X14, X8
+		PADDL	X14, X1
+		PSRAL	$14, X0 // arithmetic shift: drop 14 fractional bits
+		PSRAL	$14, X4
+		PSRAL	$14, X8
+		PSRAL	$14, X1
+		PACKSSLW	X4, X0 // 32-bit -> 16-bit with signed saturation
+		PACKSSLW	X1, X8
+		PACKUSWB	X8, X0 // 16-bit -> 8-bit with unsigned saturation (clamps to 0..255)
+		MOVOU	X0, (DI) // unaligned store of 16 output pixels
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	simdloop_11
+nosimdloop_13:
+		MOVQ	asmroll+-16(SP), CX
+		ORQ	CX, CX
+		JE	end_14
+asmloop_12: // scalar loop: one output pixel per iteration
+		MOVWQSX	(BX), DX // DX = offset of the pixel's first tap
+		MOVBQZX	(SI)(DX*1), AX // AX = source byte, zero-extended
+		MOVWQSX	(BP), DX // DX = coefficient, sign-extended
+		IMULQ	DX // AX = AX * DX (one-operand multiply, also writes DX)
+		MOVQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP) // accumulate the tap product in the stack slot
+		MOVWQSX	(BX), DX
+		MOVBQZX	2(SI)(DX*1), AX
+		MOVWQSX	4(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	3(SI)(DX*1), AX
+		MOVWQSX	6(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	4(SI)(DX*1), AX
+		MOVWQSX	8(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	5(SI)(DX*1), AX
+		MOVWQSX	10(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	6(SI)(DX*1), AX
+		MOVWQSX	12(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	7(SI)(DX*1), AX
+		MOVWQSX	14(BP), DX
+		IMULQ	DX
+		ADDQ	$16, BP // 8 coefficients consumed
+		ADDQ	sum+-40(SP), AX
+		ADDQ	$8192, AX // rounding term, 1<<13
+		CMOVQLT	zero_0<>(SB), AX // negative sum -> 0
+		SHRQ	$14, AX
+		CMPQ	u8max_2<>(SB), AX
+		CMOVQLT	u8max_2<>(SB), AX // value above 255 -> 255
+		ADDQ	$2, BX
+		MOVB	AL, (DI)
+		ADDQ	$1, DI
+		SUBQ	$1, CX
+		JNE	asmloop_12
+end_14:
+		MOVQ	srcref+-24(SP), SI
+		ADDQ	dstoff+-32(SP), DI // skip the rest of the dst row
+		ADDQ	sp+128(FP), SI // advance to the next src row
+		MOVQ	SI, srcref+-24(SP)
+		SUBQ	$1, height+112(FP) // the height argument doubles as the row counter
+		JNE	yloop_10
+		RET
+
+TEXT ·h8scale10Amd64(SB),4,$40-136
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX
+		SHRQ	$4, CX
+		ANDQ	$15, DX
+		MOVQ	BX, dstoff+-32(SP)
+		MOVQ	CX, simdroll+-8(SP)
+		MOVQ	DX, asmroll+-16(SP)
+		MOVQ	src+24(FP), AX
+		MOVQ	AX, srcref+-24(SP)
+		MOVQ	taps+96(FP), DX
+		SUBQ	$2, DX
+		PXOR	X15, X15
+		MOVO	hbits_1<>(SB), X14
+		MOVQ	src+24(FP), SI
+		MOVQ	dst+0(FP), DI
+yloop_15:
+		MOVQ	off+72(FP), BX
+		MOVQ	cof+48(FP), BP
+		MOVQ	simdroll+-8(SP), CX
+		ORQ	CX, CX
+		JE	nosimdloop_18
+simdloop_16:
+		MOVWQSX	(BX), R8
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X0
+		PINSRW	$1, (SI)(R9*1), X0
+		PINSRW	$2, (SI)(R10*1), X0
+		PINSRW	$3, (SI)(R11*1), X0
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X1
+		PINSRW	$1, (SI)(R9*1), X1
+		PINSRW	$2, (SI)(R10*1), X1
+		PINSRW	$3, (SI)(R11*1), X1
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X2
+		PINSRW	$1, (SI)(R9*1), X2
+		PINSRW	$2, (SI)(R10*1), X2
+		PINSRW	$3, (SI)(R11*1), X2
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X3
+		PINSRW	$1, (SI)(R9*1), X3
+		PINSRW	$2, (SI)(R10*1), X3
+		PINSRW	$3, (SI)(R11*1), X3
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X0
+		PMADDWL	(BP), X0
+		PUNPCKLBW	X15, X1
+		PMADDWL	16(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	32(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	48(BP), X3
+		ADDQ	$64, BP
+		MOVWQSX	(BX), R8
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVQ	taps+96(FP), AX
+		SUBQ	AX, SI
+		ADDQ	$32, BX
+		PADDL	X14, X0
+		PADDL	X14, X1
+		PADDL	X14, X2
+		PADDL	X14, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	simdloop_16
+nosimdloop_18:
+		MOVQ	asmroll+-16(SP), CX
+		ORQ	CX, CX
+		JE	end_19
+asmloop_17:
+		MOVWQSX	(BX), DX
+		MOVBQZX	(SI)(DX*1), AX
+		MOVWQSX	(BP), DX
+		IMULQ	DX
+		MOVQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	2(SI)(DX*1), AX
+		MOVWQSX	4(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	3(SI)(DX*1), AX
+		MOVWQSX	6(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	4(SI)(DX*1), AX
+		MOVWQSX	8(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	5(SI)(DX*1), AX
+		MOVWQSX	10(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	6(SI)(DX*1), AX
+		MOVWQSX	12(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	7(SI)(DX*1), AX
+		MOVWQSX	14(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	8(SI)(DX*1), AX
+		MOVWQSX	16(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	9(SI)(DX*1), AX
+		MOVWQSX	18(BP), DX
+		IMULQ	DX
+		ADDQ	$20, BP
+		ADDQ	sum+-40(SP), AX
+		ADDQ	$8192, AX
+		CMOVQLT	zero_0<>(SB), AX
+		SHRQ	$14, AX
+		CMPQ	u8max_2<>(SB), AX
+		CMOVQLT	u8max_2<>(SB), AX
+		ADDQ	$2, BX
+		MOVB	AL, (DI)
+		ADDQ	$1, DI
+		SUBQ	$1, CX
+		JNE	asmloop_17
+end_19:
+		MOVQ	srcref+-24(SP), SI
+		ADDQ	dstoff+-32(SP), DI
+		ADDQ	sp+128(FP), SI
+		MOVQ	SI, srcref+-24(SP)
+		SUBQ	$1, height+112(FP)
+		JNE	yloop_15
+		RET
+
+TEXT ·h8scale12Amd64(SB),4,$40-136 // horizontal 8-bit row scaler, unrolled for 12 taps (6 tap pairs) per output pixel
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX // BX = dp - width, added to DI once a row of width bytes has been written
+		SHRQ	$4, CX // CX = width / 16: iterations of the 16-pixel SIMD loop
+		ANDQ	$15, DX // DX = width % 16: pixels left over for the scalar loop
+		MOVQ	BX, dstoff+-32(SP)
+		MOVQ	CX, simdroll+-8(SP)
+		MOVQ	DX, asmroll+-16(SP)
+		MOVQ	src+24(FP), AX
+		MOVQ	AX, srcref+-24(SP) // srcref = start of the current source row
+		MOVQ	taps+96(FP), DX
+		SUBQ	$2, DX // DX = taps - 2; DX is overwritten at asmloop_22 before it is read again
+		PXOR	X15, X15 // X15 = 0, used to zero-extend bytes to words
+		MOVO	hbits_1<>(SB), X14 // rounding term added before the >>14 (constant defined elsewhere in the file)
+		MOVQ	src+24(FP), SI
+		MOVQ	dst+0(FP), DI
+yloop_20:
+		MOVQ	off+72(FP), BX // BX = int16 source offsets, restarted for every row
+		MOVQ	cof+48(FP), BP // BP = int16 coefficients, restarted for every row
+		MOVQ	simdroll+-8(SP), CX
+		ORQ	CX, CX
+		JE	nosimdloop_23 // fewer than 16 pixels wide: scalar loop only
+simdloop_21:
+		MOVWQSX	(BX), R8 // tap pair 1: sign-extend the offsets of output pixels 0-3
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X0 // gather two adjacent source bytes (one tap pair) per output pixel
+		PINSRW	$1, (SI)(R9*1), X0
+		PINSRW	$2, (SI)(R10*1), X0
+		PINSRW	$3, (SI)(R11*1), X0
+		MOVWQSX	8(BX), R8 // output pixels 4-7
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X1
+		PINSRW	$1, (SI)(R9*1), X1
+		PINSRW	$2, (SI)(R10*1), X1
+		PINSRW	$3, (SI)(R11*1), X1
+		MOVWQSX	16(BX), R8 // output pixels 8-11
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X2
+		PINSRW	$1, (SI)(R9*1), X2
+		PINSRW	$2, (SI)(R10*1), X2
+		PINSRW	$3, (SI)(R11*1), X2
+		MOVWQSX	24(BX), R8 // output pixels 12-15
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X3
+		PINSRW	$1, (SI)(R9*1), X3
+		PINSRW	$2, (SI)(R10*1), X3
+		PINSRW	$3, (SI)(R11*1), X3
+		ADDQ	$2, SI // move on to the next tap pair
+		PUNPCKLBW	X15, X0 // widen the 8 gathered bytes to 8 words
+		PMADDWL	(BP), X0 // multiply by 8 coefficients, add adjacent products: one dword sum per pixel
+		PUNPCKLBW	X15, X1
+		PMADDWL	16(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	32(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	48(BP), X3
+		ADDQ	$64, BP // 4 registers x 16 bytes of coefficients consumed
+		MOVWQSX	(BX), R8 // tap pair 2: same gather into X4-X7, accumulated into X0-X3
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0 // accumulate this tap pair into the running sums
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8 // tap pair 3
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8 // tap pair 4
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8 // tap pair 5
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVWQSX	(BX), R8 // tap pair 6 (last)
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVQ	taps+96(FP), AX
+		SUBQ	AX, SI // undo the six ADDQ $2, SI above (assumes the caller passes taps == 12 — confirm)
+		ADDQ	$32, BX // 16 int16 offsets consumed
+		PADDL	X14, X0 // add the rounding term
+		PADDL	X14, X1
+		PADDL	X14, X2
+		PADDL	X14, X3
+		PSRAL	$14, X0 // arithmetic >>14 on every dword sum
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0 // dwords -> words with signed saturation
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0 // words -> bytes with unsigned saturation: 16 output pixels
+		MOVOU	X0, (DI) // unaligned 16-byte store
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	simdloop_21
+nosimdloop_23:
+		MOVQ	asmroll+-16(SP), CX
+		ORQ	CX, CX
+		JE	end_24 // width is a multiple of 16: no scalar tail
+asmloop_22:
+		MOVWQSX	(BX), DX // scalar tail: one output pixel per iteration, DX = source offset
+		MOVBQZX	(SI)(DX*1), AX // tap 0 source byte
+		MOVWQSX	(BP), DX // tap 0 coefficient
+		IMULQ	DX
+		MOVQ	AX, sum+-40(SP) // sum = first product
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	2(SI)(DX*1), AX
+		MOVWQSX	4(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	3(SI)(DX*1), AX
+		MOVWQSX	6(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	4(SI)(DX*1), AX
+		MOVWQSX	8(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	5(SI)(DX*1), AX
+		MOVWQSX	10(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	6(SI)(DX*1), AX
+		MOVWQSX	12(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	7(SI)(DX*1), AX
+		MOVWQSX	14(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	8(SI)(DX*1), AX
+		MOVWQSX	16(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	9(SI)(DX*1), AX
+		MOVWQSX	18(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	10(SI)(DX*1), AX
+		MOVWQSX	20(BP), DX
+		IMULQ	DX
+		ADDQ	AX, sum+-40(SP)
+		MOVWQSX	(BX), DX
+		MOVBQZX	11(SI)(DX*1), AX // tap 11, the last one: its product stays in AX
+		MOVWQSX	22(BP), DX
+		IMULQ	DX
+		ADDQ	$24, BP // 12 int16 coefficients consumed
+		ADDQ	sum+-40(SP), AX // AX = full 12-tap sum
+		ADDQ	$8192, AX // round: half of 1<<14
+		CMOVQLT	zero_0<>(SB), AX // negative result: load zero_0 (presumably 0 — defined elsewhere)
+		SHRQ	$14, AX
+		CMPQ	u8max_2<>(SB), AX
+		CMOVQLT	u8max_2<>(SB), AX // clamp to u8max_2 when it is below AX (presumably 255 — defined elsewhere)
+		ADDQ	$2, BX // next offset
+		MOVB	AL, (DI)
+		ADDQ	$1, DI
+		SUBQ	$1, CX
+		JNE	asmloop_22
+end_24:
+		MOVQ	srcref+-24(SP), SI
+		ADDQ	dstoff+-32(SP), DI // skip to the start of the next destination row
+		ADDQ	sp+128(FP), SI // next source row
+		MOVQ	SI, srcref+-24(SP)
+		SUBQ	$1, height+112(FP) // the height argument slot doubles as the row counter
+		JNE	yloop_20
+		RET
+
+TEXT ·h8scaleNAmd64(SB),4,$64-136 // horizontal 8-bit row scaler for a tap count read at run time (looped tap pairs)
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX // BX = dp - width, added to DI once a row of width bytes has been written
+		SHRQ	$4, CX // CX = width / 16: iterations of the 16-pixel SIMD loop
+		ANDQ	$15, DX // DX = width % 16: pixels left over for the scalar loop
+		MOVQ	BX, dstoff+-32(SP)
+		MOVQ	CX, simdroll+-8(SP)
+		MOVQ	DX, asmroll+-16(SP)
+		MOVQ	src+24(FP), AX
+		MOVQ	AX, srcref+-24(SP) // srcref = start of the current source row
+		MOVQ	taps+96(FP), DX
+		SUBQ	$2, DX
+		MOVQ	DX, inner+-64(SP) // inner = taps - 2: taps left after the first pair (SIMD) or between first and last tap (scalar)
+		PXOR	X15, X15 // X15 = 0, used to zero-extend bytes to words
+		MOVO	hbits_1<>(SB), X14 // rounding term added before the >>14 (constant defined elsewhere in the file)
+		MOVQ	src+24(FP), SI
+		MOVQ	dst+0(FP), DI
+yloop_25:
+		MOVQ	off+72(FP), BX // BX = int16 source offsets, restarted for every row
+		MOVQ	cof+48(FP), BP // BP = int16 coefficients, restarted for every row
+		MOVQ	simdroll+-8(SP), CX
+		ORQ	CX, CX
+		JE	nosimdloop_28 // fewer than 16 pixels wide: scalar loop only
+simdloop_26:
+		MOVWQSX	(BX), R8 // first tap pair: sign-extend the offsets of output pixels 0-3
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X0 // gather two adjacent source bytes (one tap pair) per output pixel
+		PINSRW	$1, (SI)(R9*1), X0
+		PINSRW	$2, (SI)(R10*1), X0
+		PINSRW	$3, (SI)(R11*1), X0
+		MOVWQSX	8(BX), R8 // output pixels 4-7
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X1
+		PINSRW	$1, (SI)(R9*1), X1
+		PINSRW	$2, (SI)(R10*1), X1
+		PINSRW	$3, (SI)(R11*1), X1
+		MOVWQSX	16(BX), R8 // output pixels 8-11
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X2
+		PINSRW	$1, (SI)(R9*1), X2
+		PINSRW	$2, (SI)(R10*1), X2
+		PINSRW	$3, (SI)(R11*1), X2
+		MOVWQSX	24(BX), R8 // output pixels 12-15
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X3
+		PINSRW	$1, (SI)(R9*1), X3
+		PINSRW	$2, (SI)(R10*1), X3
+		PINSRW	$3, (SI)(R11*1), X3
+		ADDQ	$2, SI // move on to the next tap pair
+		PUNPCKLBW	X15, X0 // widen the 8 gathered bytes to 8 words
+		PMADDWL	(BP), X0 // multiply by 8 coefficients, add adjacent products: one dword sum per pixel
+		PUNPCKLBW	X15, X1
+		PMADDWL	16(BP), X1
+		PUNPCKLBW	X15, X2
+		PMADDWL	32(BP), X2
+		PUNPCKLBW	X15, X3
+		PMADDWL	48(BP), X3
+		ADDQ	$64, BP // 4 registers x 16 bytes of coefficients consumed
+		MOVQ	DI, dstref+-48(SP) // DI is borrowed as the tap counter, park the destination pointer
+		MOVQ	inner+-64(SP), DI
+loop_30:
+		MOVWQSX	(BX), R8 // next tap pair: same gather into X4-X7, accumulated into X0-X3
+		MOVWQSX	2(BX), R9
+		MOVWQSX	4(BX), R10
+		MOVWQSX	6(BX), R11
+		PINSRW	$0, (SI)(R8*1), X4
+		PINSRW	$1, (SI)(R9*1), X4
+		PINSRW	$2, (SI)(R10*1), X4
+		PINSRW	$3, (SI)(R11*1), X4
+		MOVWQSX	8(BX), R8
+		MOVWQSX	10(BX), R9
+		MOVWQSX	12(BX), R10
+		MOVWQSX	14(BX), R11
+		PINSRW	$0, (SI)(R8*1), X5
+		PINSRW	$1, (SI)(R9*1), X5
+		PINSRW	$2, (SI)(R10*1), X5
+		PINSRW	$3, (SI)(R11*1), X5
+		MOVWQSX	16(BX), R8
+		MOVWQSX	18(BX), R9
+		MOVWQSX	20(BX), R10
+		MOVWQSX	22(BX), R11
+		PINSRW	$0, (SI)(R8*1), X6
+		PINSRW	$1, (SI)(R9*1), X6
+		PINSRW	$2, (SI)(R10*1), X6
+		PINSRW	$3, (SI)(R11*1), X6
+		MOVWQSX	24(BX), R8
+		MOVWQSX	26(BX), R9
+		MOVWQSX	28(BX), R10
+		MOVWQSX	30(BX), R11
+		PINSRW	$0, (SI)(R8*1), X7
+		PINSRW	$1, (SI)(R9*1), X7
+		PINSRW	$2, (SI)(R10*1), X7
+		PINSRW	$3, (SI)(R11*1), X7
+		ADDQ	$2, SI
+		PUNPCKLBW	X15, X4
+		PMADDWL	(BP), X4
+		PUNPCKLBW	X15, X5
+		PMADDWL	16(BP), X5
+		PUNPCKLBW	X15, X6
+		PMADDWL	32(BP), X6
+		PUNPCKLBW	X15, X7
+		PMADDWL	48(BP), X7
+		ADDQ	$64, BP
+		PADDL	X4, X0 // accumulate this tap pair into the running sums
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		SUBQ	$2, DI // two taps done; exits only on exactly zero, so taps must be even and >= 4 (NOTE(review): presumably guaranteed by the Go caller)
+		JNE	loop_30
+		MOVQ	dstref+-48(SP), DI // restore the destination pointer
+		MOVQ	taps+96(FP), AX
+		SUBQ	AX, SI // undo the per-pair ADDQ $2, SI: taps bytes in total
+		ADDQ	$32, BX // 16 int16 offsets consumed
+		PADDL	X14, X0 // add the rounding term
+		PADDL	X14, X1
+		PADDL	X14, X2
+		PADDL	X14, X3
+		PSRAL	$14, X0 // arithmetic >>14 on every dword sum
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0 // dwords -> words with signed saturation
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0 // words -> bytes with unsigned saturation: 16 output pixels
+		MOVOU	X0, (DI) // unaligned 16-byte store
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	simdloop_26
+nosimdloop_28:
+		MOVQ	asmroll+-16(SP), CX
+		ORQ	CX, CX
+		JE	end_29 // width is a multiple of 16: no scalar tail
+asmloop_27:
+		MOVWQSX	(BX), DX // scalar tail: one output pixel per iteration, DX = source offset
+		MOVBQZX	(SI)(DX*1), AX // first tap source byte
+		MOVWQSX	(BP), DX // first tap coefficient
+		IMULQ	DX
+		MOVQ	AX, sum+-40(SP) // sum = first product
+		MOVQ	inner+-64(SP), AX
+		MOVQ	AX, count+-56(SP) // count = taps - 2 middle taps
+loop_31:
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX // middle tap: SI and BP slide by one tap per iteration
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	$1, SI
+		ADDQ	$2, BP
+		ADDQ	AX, sum+-40(SP)
+		SUBQ	$1, count+-56(SP)
+		JNE	loop_31
+		MOVWQSX	(BX), DX
+		MOVBQZX	1(SI)(DX*1), AX // last tap: its product stays in AX
+		MOVWQSX	2(BP), DX
+		IMULQ	DX
+		ADDQ	$4, BP // skip the first and last coefficient: BP has now advanced by 2*taps bytes
+		SUBQ	inner+-64(SP), SI // undo the ADDQ $1, SI of loop_31
+		ADDQ	sum+-40(SP), AX // AX = full sum over all taps
+		ADDQ	$8192, AX // round: half of 1<<14
+		CMOVQLT	zero_0<>(SB), AX // negative result: load zero_0 (presumably 0 — defined elsewhere)
+		SHRQ	$14, AX
+		CMPQ	u8max_2<>(SB), AX
+		CMOVQLT	u8max_2<>(SB), AX // clamp to u8max_2 when it is below AX (presumably 255 — defined elsewhere)
+		ADDQ	$2, BX // next offset
+		MOVB	AL, (DI)
+		ADDQ	$1, DI
+		SUBQ	$1, CX
+		JNE	asmloop_27
+end_29:
+		MOVQ	srcref+-24(SP), SI
+		ADDQ	dstoff+-32(SP), DI // skip to the start of the next destination row
+		ADDQ	sp+128(FP), SI // next source row
+		MOVQ	SI, srcref+-24(SP)
+		SUBQ	$1, height+112(FP) // the height argument slot doubles as the row counter
+		JNE	yloop_25
+		RET
diff --git a/vendor/github.com/bamiaux/rez/image.go b/vendor/github.com/bamiaux/rez/image.go
new file mode 100644
index 00000000..66cb1312
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/image.go
@@ -0,0 +1,540 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+//go:generate autoreadme -f -template=README.md.template
+
+/*
+Package rez provides image resizing in pure Go and SIMD.
+
+Featuring:
+ - YCbCr, RGBA, NRGBA & Gray resizes
+ - YCbCr Chroma subsample ratio conversions
+ - Optional interlaced-aware resizes
+ - Parallel resizes
+ - SIMD optimisations on AMD64
+
+The easiest way to use it is:
+
+    err := Convert(output, input, NewBicubicFilter())
+
+However, if you plan to convert video, where resize parameters are the same for
+multiple images, the best way is:
+
+    cfg, err := PrepareConversion(output, input)
+    converter, err := NewConverter(cfg, NewBicubicFilter())
+    for i := 0; i < N; i++ {
+        err := converter.Convert(output[i], input[i])
+    }
+
+Note that by default, images are resized in parallel with GOMAXPROCS slices.
+Best performance is obtained when GOMAXPROCS is at least equal to your CPU
+count.
+*/
+package rez
+
+import (
+	"fmt"
+	"image"
+	"runtime"
+	"sync"
+)
+
+// Converter is an interface that implements conversion between images
+// This package handles *image.YCbCr, *image.RGBA, *image.NRGBA and *image.Gray images
+type Converter interface {
+	// Converts one image into another, applying any necessary colorspace
+	// conversion and/or resizing
+	// dst = destination image
+	// src = source image
+	// Result is undefined if src points to the same data as dst
+	// Returns an error if the conversion fails
+	Convert(dst, src image.Image) error
+}
+
+// ChromaRatio is a chroma subsampling ratio, it sizes every plane after plane 0
+type ChromaRatio int
+
+const (
+	// Ratio410 is 4:1:0, chroma planes are 1/4 width and 1/2 height (rounded up)
+	Ratio410 ChromaRatio = iota
+	// Ratio411 is 4:1:1, chroma planes are 1/4 width (rounded up) and full height
+	Ratio411
+	// Ratio420 is 4:2:0, chroma planes are 1/2 width and 1/2 height (rounded up)
+	Ratio420
+	// Ratio422 is 4:2:2, chroma planes are 1/2 width (rounded up) and full height
+	Ratio422
+	// Ratio440 is 4:4:0, chroma planes are full width and 1/2 height (rounded up)
+	Ratio440
+	// Ratio444 is 4:4:4, chroma planes have the same size as plane 0
+	Ratio444
+)
+
+// Descriptor describes the geometry and layout of an image, see Check for the validity rules
+type Descriptor struct {
+	Width      int         // width in pixels of plane 0
+	Height     int         // height in pixels of plane 0
+	Ratio      ChromaRatio // chroma ratio, used by GetWidth/GetHeight for planes other than 0
+	Pack       int         // pixels per pack, Check accepts 1 to 4
+	Interlaced bool        // progressive or interlaced
+	Planes     int         // number of planes
+}
+
+// Check reports whether the descriptor is valid.
+//
+// It returns a non-nil error when Pack is outside [1, 4], when Planes is
+// outside [0, maxPlanes], when a descriptor with more than one plane carries
+// an unknown chroma Ratio, or when an interlaced plane has an odd height that
+// differs from the height of plane 0. Unlike GetHeight, Check never panics on
+// a malformed descriptor.
+func (d *Descriptor) Check() error {
+	if d.Pack < 1 || d.Pack > 4 {
+		return fmt.Errorf("invalid pack value %v", d.Pack)
+	}
+	// GetHeight panics for any plane index >= maxPlanes, so an oversized plane
+	// count has to be rejected before the loop below walks the planes.
+	if d.Planes < 0 || d.Planes > maxPlanes {
+		return fmt.Errorf("invalid plane count %v", d.Planes)
+	}
+	// Only planes after the first consult Ratio; an unknown value would make
+	// GetHeight panic instead of letting the caller handle the problem.
+	if d.Planes > 1 && (d.Ratio < Ratio410 || d.Ratio > Ratio444) {
+		return fmt.Errorf("invalid ratio %v", d.Ratio)
+	}
+	for i := 0; i < d.Planes; i++ {
+		h := d.GetHeight(i)
+		if d.Interlaced && h%2 != 0 && h != d.Height {
+			return fmt.Errorf("invalid interlaced input height %v", d.Height)
+		}
+	}
+	return nil
+}
+
+// GetWidth returns the width in pixels for the input plane
+func (d *Descriptor) GetWidth(plane int) int {
+	if plane < 0 || plane+1 > maxPlanes {
+		panic(fmt.Errorf("invalid plane %v", plane))
+	}
+	if plane == 0 {
+		return d.Width
+	}
+	switch d.Ratio {
+	case Ratio410, Ratio411:
+		return (d.Width + 3) >> 2
+	case Ratio420, Ratio422:
+		return (d.Width + 1) >> 1
+	case Ratio440, Ratio444:
+		return d.Width
+	}
+	panic(fmt.Errorf("invalid ratio %v", d.Ratio))
+}
+
+// GetHeight reports the height in pixels of the requested plane.
+// Plane 0 is always full height. For vertically subsampled ratios the other
+// planes are half height (rounding up), bumped to the next even value when
+// the descriptor is interlaced.
+// It panics when plane is out of range or when the ratio is unknown.
+func (d *Descriptor) GetHeight(plane int) int {
+	if plane+1 > maxPlanes || plane < 0 {
+		panic(fmt.Errorf("invalid plane %v", plane))
+	}
+	if plane == 0 {
+		return d.Height
+	}
+	switch d.Ratio {
+	case Ratio410, Ratio420, Ratio440:
+		half := (d.Height + 1) >> 1
+		if d.Interlaced {
+			// an odd half height gains one line, an even one is left alone
+			half += half & 1
+		}
+		return half
+	case Ratio411, Ratio422, Ratio444:
+		return d.Height
+	default:
+		panic(fmt.Errorf("invalid ratio %v", d.Ratio))
+	}
+}
+
+// ConverterConfig is a configuration used with NewConverter
+type ConverterConfig struct {
+	Input      Descriptor // input description
+	Output     Descriptor // output description
+	Threads    int        // number of allowed "threads", NewConverter replaces 0 with runtime.GOMAXPROCS(0)
+	DisableAsm bool       // disable asm optimisations
+}
+
+const (
+	maxPlanes = 3 // upper bound on plane indices, sizes the per-plane arrays of converterContext
+)
+
+// Plane describes a single image plane
+type Plane struct {
+	Data   []byte // plane buffer, setPlane sizes it to Pitch*(Height-1) + Width*Pack bytes
+	Width  int    // width in pixels
+	Height int    // height in pixels
+	Pitch  int    // pitch in bytes
+	Pack   int    // pixels per pack
+}
+
+type converterContext struct { // converterContext is the Converter built by NewConverter
+	ConverterConfig
+	wrez   [maxPlanes]Resizer // per-plane horizontal resizers, nil when input and output widths match
+	hrez   [maxPlanes]Resizer // per-plane vertical resizers, nil when input and output heights match
+	buffer [maxPlanes]*Plane  // per-plane intermediate planes, set only when both resizers are needed
+}
+
+// toInterlacedString names the scan type for use in error messages.
+func toInterlacedString(interlaced bool) string {
+	name := "progressive"
+	if interlaced {
+		name = "interlaced"
+	}
+	return name
+}
+
+// toPackedString renders a pack count for use in error messages.
+func toPackedString(pack int) string {
+	return fmt.Sprint(pack) + "-packed"
+}
+
+// align rounds value up to the next multiple of align.
+// The result is only meaningful when align is a power of two.
+func align(value, align int) int {
+	mask := align - 1
+	return (value + mask) &^ mask
+}
+
+// checkConversion validates both descriptors and then verifies that src can be
+// converted into dst: both must agree on interlacing, packing and plane count.
+// The first failing rule is the one reported.
+func checkConversion(dst, src *Descriptor) error {
+	err := src.Check()
+	if err != nil {
+		return fmt.Errorf("invalid input format: %v", err)
+	}
+	err = dst.Check()
+	if err != nil {
+		return fmt.Errorf("invalid output format: %v", err)
+	}
+	switch {
+	case src.Interlaced != dst.Interlaced:
+		from := toInterlacedString(src.Interlaced)
+		to := toInterlacedString(dst.Interlaced)
+		return fmt.Errorf("unable to convert %v input to %v output", from, to)
+	case src.Pack != dst.Pack:
+		from := toPackedString(src.Pack)
+		to := toPackedString(dst.Pack)
+		return fmt.Errorf("unable to convert %v input to %v output", from, to)
+	case src.Planes != dst.Planes:
+		return fmt.Errorf("unable to convert %v planes to %v planes",
+			src.Planes, dst.Planes)
+	}
+	return nil
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// NewConverter returns a Converter interface built for the geometry described by cfg
+// cfg = converter configuration, a zero cfg.Threads is replaced in place by runtime.GOMAXPROCS(0)
+// filter = filter used for resizing
+// Returns an error if the conversion is invalid or if a plane is smaller than 2x2
+func NewConverter(cfg *ConverterConfig, filter Filter) (Converter, error) {
+	err := checkConversion(&cfg.Output, &cfg.Input)
+	if err != nil {
+		return nil, err
+	}
+	if cfg.Threads == 0 {
+		cfg.Threads = runtime.GOMAXPROCS(0) // written back to the caller's config before it is copied below
+	}
+	ctx := &converterContext{
+		ConverterConfig: *cfg,
+	}
+	size := 0 // total bytes needed by the intermediate planes
+	group := sync.WaitGroup{}
+	for i := 0; i < cfg.Output.Planes; i++ {
+		win := cfg.Input.GetWidth(i)
+		hin := cfg.Input.GetHeight(i)
+		wout := cfg.Output.GetWidth(i)
+		hout := cfg.Output.GetHeight(i)
+		if win < 2 || hin < 2 {
+			return nil, fmt.Errorf("input size too small %vx%v", win, hin)
+		}
+		if wout < 2 || hout < 2 {
+			return nil, fmt.Errorf("output size too small %vx%v", wout, hout)
+		}
+		idx := i // private copy of the plane index for the closures below
+		if win != wout {
+			dispatch(&group, cfg.Threads, func() { // horizontal resizer for this plane, built through dispatch
+				threads := min(cfg.Threads, hout)
+				ctx.wrez[idx] = NewResize(&ResizerConfig{
+					Depth:      8,
+					Input:      win,
+					Output:     wout,
+					Vertical:   false,
+					Interlaced: false,
+					Pack:       cfg.Input.Pack,
+					Threads:    threads,
+					DisableAsm: cfg.DisableAsm || wout < 16, // asm is skipped for outputs narrower than 16 pixels
+				}, filter)
+			})
+		}
+		if hin != hout {
+			dispatch(&group, cfg.Threads, func() { // vertical resizer for this plane, built through dispatch
+				threads := min(cfg.Threads, hout)
+				if cfg.Output.Interlaced {
+					threads = min(cfg.Threads, hout>>1) // interlaced output: cap threads at half the output height
+				}
+				ctx.hrez[idx] = NewResize(&ResizerConfig{
+					Depth:      8,
+					Input:      hin,
+					Output:     hout,
+					Vertical:   true,
+					Interlaced: cfg.Output.Interlaced,
+					Pack:       cfg.Output.Pack,
+					Threads:    threads,
+					DisableAsm: cfg.DisableAsm || wout < 16 || win < 16,
+				}, filter)
+			})
+		}
+		if win != wout && hin != hout {
+			p := &Plane{ // intermediate plane: input width, output height, matching resizePlane which resizes vertically first
+				Width:  win,
+				Height: hout,
+				Pitch:  align(win*cfg.Input.Pack, 16),
+				Pack:   cfg.Input.Pack,
+			}
+			size += p.Pitch * p.Height
+			ctx.buffer[i] = p
+		}
+	}
+	if size != 0 {
+		buffer := make([]byte, size) // one allocation shared by every intermediate plane
+		idx := 0
+		for i := 0; i < cfg.Output.Planes; i++ {
+			if p := ctx.buffer[i]; p != nil {
+				size := p.Pitch*(p.Height-1) + p.Width*p.Pack // last row stops at its final pixel, not at the pitch
+				p.Data = buffer[idx : idx+size]
+				idx += p.Pitch * p.Height
+			}
+		}
+	}
+	group.Wait() // every dispatched resizer construction has finished once Wait returns
+	return ctx, nil
+}
+
+// GetRatio maps the standard library's subsample ratio onto this package's
+// ChromaRatio. Values it does not recognise are treated as 4:4:4.
+func GetRatio(value image.YCbCrSubsampleRatio) ChromaRatio {
+	ratio := Ratio444
+	switch value {
+	case image.YCbCrSubsampleRatio440:
+		ratio = Ratio440
+	case image.YCbCrSubsampleRatio422:
+		ratio = Ratio422
+	case image.YCbCrSubsampleRatio420:
+		ratio = Ratio420
+	case image.YCbCrSubsampleRatio411:
+		ratio = Ratio411
+	case image.YCbCrSubsampleRatio410:
+		ratio = Ratio410
+	}
+	return ratio
+}
+
+// inspect returns the descriptor and the planes of a supported image
+// (*image.YCbCr, *image.RGBA, *image.NRGBA or *image.Gray).
+// Any other image type yields an error.
+func inspect(data image.Image, interlaced bool) (*Descriptor, []Plane, error) {
+	var (
+		desc   *Descriptor
+		planes []Plane
+	)
+	switch img := data.(type) {
+	case *image.YCbCr:
+		desc, planes = inspectYuv(img, interlaced)
+	case *image.RGBA:
+		desc, planes = inspectRgba(img, interlaced)
+	case *image.NRGBA:
+		desc, planes = inspectNrgba(img, interlaced)
+	case *image.Gray:
+		desc, planes = inspectGray(img, interlaced)
+	default:
+		return nil, nil, fmt.Errorf("unknown image format")
+	}
+	return desc, planes, nil
+}
+
+func getYuvDescriptor(img *image.YCbCr, interlaced bool) Descriptor {
+	return Descriptor{
+		Width:      img.Rect.Dx(),
+		Height:     img.Rect.Dy(),
+		Ratio:      GetRatio(img.SubsampleRatio),
+		Interlaced: interlaced,
+		Pack:       1,
+		Planes:     3,
+	}
+}
+
+func getRgbDescriptor(rect image.Rectangle, interlaced bool) Descriptor {
+	return Descriptor{
+		Width:      rect.Dx(),
+		Height:     rect.Dy(),
+		Ratio:      Ratio444,
+		Interlaced: interlaced,
+		Pack:       4,
+		Planes:     1,
+	}
+}
+
+func getGrayDescriptor(img *image.Gray, interlaced bool) Descriptor {
+	return Descriptor{
+		Width:      img.Rect.Dx(),
+		Height:     img.Rect.Dy(),
+		Ratio:      Ratio444,
+		Interlaced: interlaced,
+		Pack:       1,
+		Planes:     1,
+	}
+}
+
+// setPlane points p.Data at the part of pix that holds the plane: it starts
+// at the offset of rect's top-left corner and stops right after the last
+// pixel of the last row. p's Pitch, Height, Width and Pack must already be set.
+func setPlane(p *Plane, rect image.Rectangle, offset func(x, y int) int, pix []byte) {
+	start := offset(rect.Min.X, rect.Min.Y)
+	end := start + p.Pitch*(p.Height-1) + p.Width*p.Pack
+	p.Data = pix[start:end]
+}
+
+func getYuvPlanes(img *image.YCbCr, d *Descriptor) []Plane {
+	planes := []Plane{}
+	for i := 0; i < maxPlanes; i++ {
+		p := Plane{
+			Width:  d.GetWidth(i),
+			Height: d.GetHeight(i),
+			Pack:   d.Pack,
+		}
+		switch i {
+		case 0:
+			p.Pitch = img.YStride
+			setPlane(&p, img.Rect, img.YOffset, img.Y)
+		case 1:
+			p.Pitch = img.CStride
+			setPlane(&p, img.Rect, img.COffset, img.Cb)
+		case 2:
+			p.Pitch = img.CStride
+			setPlane(&p, img.Rect, img.COffset, img.Cr)
+		}
+		planes = append(planes, p)
+	}
+	return planes
+}
+
+// getSinglePlane wraps pix as the only plane of a single-plane image, using
+// the size and packing of d and the given pitch.
+func getSinglePlane(d *Descriptor, pitch int, rect image.Rectangle, offset func(x, y int) int, pix []byte) []Plane {
+	planes := make([]Plane, 1)
+	only := &planes[0]
+	only.Width, only.Height = d.Width, d.Height
+	only.Pack, only.Pitch = d.Pack, pitch
+	setPlane(only, rect, offset, pix)
+	return planes
+}
+
+// getRgbaPlane exposes the pixels of an RGBA image as a single plane.
+func getRgbaPlane(img *image.RGBA, d *Descriptor) []Plane {
+	stride, rect, pix := img.Stride, img.Rect, img.Pix
+	return getSinglePlane(d, stride, rect, img.PixOffset, pix)
+}
+
+// getNrgbaPlane exposes the pixels of an NRGBA image as a single plane.
+func getNrgbaPlane(img *image.NRGBA, d *Descriptor) []Plane {
+	stride, rect, pix := img.Stride, img.Rect, img.Pix
+	return getSinglePlane(d, stride, rect, img.PixOffset, pix)
+}
+
+// getGrayPlane exposes the pixels of a gray image as a single plane.
+func getGrayPlane(img *image.Gray, d *Descriptor) []Plane {
+	stride, rect, pix := img.Stride, img.Rect, img.Pix
+	return getSinglePlane(d, stride, rect, img.PixOffset, pix)
+}
+
+// inspectYuv returns the descriptor of a YCbCr image together with its planes.
+func inspectYuv(img *image.YCbCr, interlaced bool) (*Descriptor, []Plane) {
+	desc := new(Descriptor)
+	*desc = getYuvDescriptor(img, interlaced)
+	return desc, getYuvPlanes(img, desc)
+}
+
+// inspectRgba returns the descriptor of an RGBA image together with its plane.
+func inspectRgba(img *image.RGBA, interlaced bool) (*Descriptor, []Plane) {
+	desc := new(Descriptor)
+	*desc = getRgbDescriptor(img.Rect, interlaced)
+	return desc, getRgbaPlane(img, desc)
+}
+
+// inspectNrgba returns the descriptor of an NRGBA image together with its
+// plane.
+func inspectNrgba(img *image.NRGBA, interlaced bool) (*Descriptor, []Plane) {
+	desc := new(Descriptor)
+	*desc = getRgbDescriptor(img.Rect, interlaced)
+	return desc, getNrgbaPlane(img, desc)
+}
+
+// inspectGray returns the descriptor of a gray image together with its plane.
+func inspectGray(img *image.Gray, interlaced bool) (*Descriptor, []Plane) {
+	desc := new(Descriptor)
+	*desc = getGrayDescriptor(img, interlaced)
+	return desc, getGrayPlane(img, desc)
+}
+
+// resizePlane hands the conversion of one plane to dispatch.
+// With both resizers set, the plane is resized vertically into buf and then
+// horizontally from buf into dst. With a single resizer set, it goes straight
+// from src to dst. With none, the plane is copied.
+func resizePlane(group *sync.WaitGroup, threads int, dst, src, buf *Plane, hrez, wrez Resizer) {
+	work := func() {
+		switch {
+		case hrez != nil && wrez != nil:
+			hrez.Resize(buf.Data, src.Data, src.Width, src.Height, buf.Pitch, src.Pitch)
+			wrez.Resize(dst.Data, buf.Data, buf.Width, buf.Height, dst.Pitch, buf.Pitch)
+		case hrez != nil:
+			hrez.Resize(dst.Data, src.Data, src.Width, src.Height, dst.Pitch, src.Pitch)
+		case wrez != nil:
+			wrez.Resize(dst.Data, src.Data, src.Width, src.Height, dst.Pitch, src.Pitch)
+		default:
+			copyPlane(dst.Data, src.Data, src.Width*src.Pack, src.Height, dst.Pitch, src.Pitch)
+		}
+	}
+	dispatch(group, threads, work)
+}
+
+// Convert resizes input into output with the resizers prepared by
+// NewConverter.
+//
+// It returns an error when either image is of an unsupported type, when the
+// two images cannot be converted into one another, or when their layout does
+// not fit the configuration the converter was built for: fewer planes than
+// configured, or a different packing.
+func (ctx *converterContext) Convert(output, input image.Image) error {
+	id, src, err := inspect(input, ctx.Input.Interlaced)
+	if err != nil {
+		return err
+	}
+	od, dst, err := inspect(output, ctx.Output.Interlaced)
+	if err != nil {
+		return err
+	}
+	if err := checkConversion(od, id); err != nil {
+		return err
+	}
+	// The loop below indexes src and dst with the configured plane count;
+	// images with fewer planes would panic with an index out of range.
+	if ctx.Input.Planes > len(src) || ctx.Input.Planes > len(dst) {
+		return fmt.Errorf("unable to convert %v planes with a %v planes converter",
+			id.Planes, ctx.Input.Planes)
+	}
+	// The resizers were built for the configured packing; any other packing
+	// would be processed with the wrong pixel layout.
+	if id.Pack != ctx.Input.Pack {
+		return fmt.Errorf("unable to convert %v input with a %v converter",
+			toPackedString(id.Pack), toPackedString(ctx.Input.Pack))
+	}
+	group := sync.WaitGroup{}
+	for i := 0; i < ctx.Input.Planes; i++ {
+		resizePlane(&group, ctx.Threads, &dst[i], &src[i], ctx.buffer[i], ctx.hrez[i], ctx.wrez[i])
+	}
+	group.Wait()
+	return nil
+}
+
+// PrepareConversion builds the ConverterConfig describing a conversion from
+// input to output. Both images are described as progressive and Threads is
+// left at zero.
+// It returns an error when an image type is not supported or when the two
+// images cannot be converted into one another.
+func PrepareConversion(output, input image.Image) (*ConverterConfig, error) {
+	in, _, err := inspect(input, false)
+	if err != nil {
+		return nil, err
+	}
+	out, _, err := inspect(output, false)
+	if err != nil {
+		return nil, err
+	}
+	if err := checkConversion(out, in); err != nil {
+		return nil, err
+	}
+	cfg := new(ConverterConfig)
+	cfg.Input, cfg.Output = *in, *out
+	return cfg, nil
+}
+
+// Convert is the one-shot entry point: it prepares a configuration for the
+// two images, builds a converter using filter for interpolation, and runs it
+// once.
+// When the same conversion is repeated, building a Converter once and reusing
+// it is faster.
+func Convert(output, input image.Image, filter Filter) error {
+	config, err := PrepareConversion(output, input)
+	if err != nil {
+		return err
+	}
+	conv, err := NewConverter(config, filter)
+	if err != nil {
+		return err
+	}
+	err = conv.Convert(output, input)
+	return err
+}
+
+// Psnr computes the PSNR between two input images and returns one value per
+// plane.
+// Both images must be of a type accepted by this package and must share the
+// same descriptor, otherwise an error is returned.
+func Psnr(a, b image.Image) ([]float64, error) {
+	descA, planesA, err := inspect(a, false)
+	if err != nil {
+		return nil, err
+	}
+	descB, planesB, err := inspect(b, false)
+	if err != nil {
+		return nil, err
+	}
+	if *descA != *descB {
+		return nil, fmt.Errorf("unable to psnr different formats")
+	}
+	psnrs := make([]float64, 0, len(planesB))
+	for i := range planesB {
+		pa, pb := &planesA[i], &planesB[i]
+		psnrs = append(psnrs, psnrPlane(pa.Data, pb.Data, pa.Width*pa.Pack, pa.Height, pa.Pitch, pb.Pitch))
+	}
+	return psnrs, nil
+}
diff --git a/vendor/github.com/bamiaux/rez/kernels.go b/vendor/github.com/bamiaux/rez/kernels.go
new file mode 100644
index 00000000..8d2c7ba5
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/kernels.go
@@ -0,0 +1,226 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+import (
+	"math"
+	"sort"
+)
+
+// kernel is the fixed-point filter built by makeKernel for one resize
+// direction (and, for interlaced vertical resizes, one field).
+type kernel struct {
+	// integer filter weights, possibly re-laid out by prepareCoeffs
+	coeffs   []int16
+	// per-output source offsets; makeKernel turns them into deltas between
+	// consecutive outputs for vertical kernels
+	offsets  []int16
+	// makeKernel stores the number of taps per output here
+	size     int
+	// coefficient replication factor returned by prepareCoeffs (1 when the
+	// coefficients are not replicated)
+	cofscale int // how many more coeffs do we have
+}
+
+func bin(v bool) uint {
+	if v {
+		return 1
+	}
+	return 0
+}
+
+// clip limits v to the range [min, max]. The lower bound is tested first,
+// so min wins when the bounds are inverted and v lies below min.
+func clip(v, min, max int) int {
+	switch {
+	case v < min:
+		return min
+	case v > max:
+		return max
+	default:
+		return v
+	}
+}
+
+// max returns the larger of its two integer arguments.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
+
+// makeDoubleKernel computes the floating-point filter weights for resizing
+// cfg.Input samples into cfg.Output samples with the given filter.
+// field is 1 for interlaced kernels and 0 otherwise; idx selects the field.
+// It returns, in order: the clamped source offset of each output sample,
+// the sum of the weights of each output sample, the weights themselves
+// (taps per output sample), the number of taps and the number of output
+// samples actually produced for this field.
+func makeDoubleKernel(cfg *ResizerConfig, filter Filter, field, idx uint) ([]int16, []float64, []float64, int, int) {
+	scale := float64(cfg.Output) / float64(cfg.Input)
+	// when downscaling (scale < 1) the filter is stretched by 1/scale
+	step := math.Min(1, scale)
+	support := float64(filter.Taps()) / step
+	taps := int(math.Ceil(support)) * 2
+	// horizontal 6-tap kernels are widened to 8 taps when assembly is in use
+	// NOTE(review): presumably because no 6-tap horizontal assembly scaler exists - confirm
+	if !cfg.Vertical && taps == 6 && hasAsm() && !cfg.DisableAsm {
+		taps = 8
+	}
+	// never use more taps than the (per-field) input size rounded down to even
+	taps = min(taps, (cfg.Input>>field)&^1)
+	offsets := make([]int16, cfg.Output)
+	sums := make([]float64, cfg.Output)
+	weights := make([]float64, cfg.Output*taps)
+	// source position of the first output sample: 0.5/scale - 0.5
+	xmid := float64(cfg.Input-cfg.Output) / float64(cfg.Output*2)
+	xstep := 1 / scale
+	// an interlaced resize sees only one field but still uses full resolution pixel positions
+	ftaps := taps << field
+	size := (cfg.Output + int(field*(1-idx))) >> field
+	step /= float64(1 + field)
+	xmid += xstep * float64(field*idx)
+	for i := 0; i < size; i++ {
+		// first source sample covered by the filter, before clamping
+		left := int(math.Ceil(xmid)) - ftaps>>1
+		// clamped start so that the whole window stays inside the input
+		x := clip(left, 0, max(0, cfg.Input-ftaps))
+		offsets[i] = int16(x)
+		for j := 0; j < ftaps; j++ {
+			src := left + j
+			// interlaced: skip source lines whose parity differs from idx
+			if field != 0 && idx^uint(src&1) != 0 {
+				continue
+			}
+			weight := filter.Get(math.Abs(xmid-float64(src)) * step)
+			// samples outside the window are folded onto its nearest edge
+			src = clip(src, x, cfg.Input-1) - x
+			src >>= field
+			weights[i*taps+src] += weight
+			sums[i] += weight
+		}
+		xmid += xstep * float64(1+field)
+	}
+	return offsets, sums, weights, taps, size
+}
+
+// weight pairs a filter weight with the tap position it belongs to.
+type weight struct {
+	weight float64
+	offset int
+}
+
+// weights implements sort.Interface, ordering entries by decreasing
+// absolute weight.
+type weights []weight
+
+// Len reports the number of entries.
+func (w weights) Len() int { return len(w) }
+
+// Less reports whether entry i has a larger magnitude than entry j.
+func (w weights) Less(i, j int) bool {
+	return math.Abs(w[i].weight) > math.Abs(w[j].weight)
+}
+
+// Swap exchanges entries i and j.
+func (w weights) Swap(i, j int) {
+	tmp := w[i]
+	w[i] = w[j]
+	w[j] = tmp
+}
+
+// makeIntegerKernel quantizes floating-point weights to int16 coefficients.
+// For every output sample the taps are normalized by the sample's weight sum
+// and scaled by 1<<Bits, then rounded in order of decreasing magnitude while
+// the rounding error of each tap is carried into the next one.
+// It returns the coefficients (taps per output sample) and the per-sample
+// offsets derived from pos, adjusted by field-idx and shifted right by field.
+func makeIntegerKernel(taps, size int, cof, sums []float64, pos []int16, field, idx uint) ([]int16, []int16) {
+	coeffs := make([]int16, taps*size)
+	offsets := make([]int16, size)
+	ranked := make(weights, taps)
+	shift := int16(field - idx)
+	for i, sum := range sums[:size] {
+		for j, w := range cof[:taps] {
+			ranked[j] = weight{weight: w, offset: j}
+		}
+		sort.Sort(ranked)
+		scale := 1 << Bits / sum
+		row := coeffs[i*taps : (i+1)*taps]
+		carry := float64(0)
+		for _, it := range ranked {
+			exact := it.weight*scale + carry
+			rounded := math.Floor(exact + 0.5)
+			row[it.offset] = int16(rounded)
+			carry = exact - rounded
+		}
+		// move on to the weights of the next output sample
+		cof = cof[taps:]
+		offsets[i] = (pos[i] + shift) >> field
+	}
+	return coeffs, offsets
+}
+
+// makeKernel builds the complete integer kernel for cfg and filter.
+// idx selects the field for interlaced configurations.
+// Vertical kernels get their offsets rewritten as deltas from the previous
+// output; horizontal kernels with Pack > 1 are expanded by unpack. The
+// coefficients are finally laid out by prepareCoeffs.
+func makeKernel(cfg *ResizerConfig, filter Filter, idx uint) kernel {
+	field := bin(cfg.Interlaced)
+	pos, sums, cof, taps, size := makeDoubleKernel(cfg, filter, field, idx)
+	coeffs, offsets := makeIntegerKernel(taps, size, cof, sums, pos, field, idx)
+	//coeffs, offsets = reduceKernel(coeffs, offsets, taps, size)
+	switch {
+	case cfg.Vertical:
+		// walk backwards so each delta is computed from untouched values
+		for i := len(offsets) - 1; i >= 1; i-- {
+			offsets[i] -= offsets[i-1]
+		}
+	case cfg.Pack > 1:
+		coeffs, offsets, taps = unpack(coeffs, offsets, taps, cfg.Pack)
+	}
+	coeffs, cofscale := prepareCoeffs(cfg, coeffs, size, taps)
+	return kernel{
+		coeffs:   coeffs,
+		offsets:  offsets,
+		size:     taps,
+		cofscale: cofscale,
+	}
+}
+
+// prepareCoeffs lays out cof for the scaler that will consume it and returns
+// the coefficients together with their replication factor.
+// Without assembly the coefficients are returned untouched with a factor
+// of 1; otherwise the vertical or horizontal layout helper is applied.
+func prepareCoeffs(cfg *ResizerConfig, cof []int16, size, taps int) ([]int16, int) {
+	switch {
+	case cfg.DisableAsm || !hasAsm():
+		return cof, 1
+	case cfg.Vertical:
+		return prepareVerticalCoeffs(cof, size, taps)
+	default:
+		return prepareHorizontalCoeffs(cof, size*cfg.Pack, taps), 1
+	}
+}
+
+func prepareVerticalCoeffs(cof []int16, size, taps int) ([]int16, int) {
+	xwidth := 16
+	dst := make([]int16, size*taps*xwidth>>1)
+	si := 0
+	di := 0
+	for i := 0; i < size; i++ {
+		for j := 0; j < taps; j += 2 {
+			for k := 0; k < xwidth; k += 2 {
+				dst[di+k+0] = cof[si+0]
+				dst[di+k+1] = cof[si+1]
+			}
+			si += 2
+			di += xwidth
+		}
+	}
+	return dst, xwidth >> 1
+}
+
+func prepareHorizontalCoeffs(cof []int16, size, taps int) []int16 {
+	if taps == 2 || taps == 4 || taps == 8 {
+		return cof
+	}
+	xwidth := 16
+	dst := make([]int16, len(cof))
+	loop := size / xwidth
+	left := (size - loop*xwidth) * taps
+	si := 0
+	di := 0
+	// instead of having all taps contiguous for one destination pixel,
+	// we store 2 taps per pixel and fill one simd-sized buffer with it, then
+	// fill the second register with the following taps until none are left
+	// this way we don't care about the simd register size, we will always be
+	// able to process N pixels at once
+	for i := 0; i < loop; i++ {
+		for j := 0; j*2 < taps; j++ {
+			for k := 0; k < xwidth; k++ {
+				dst[di+k*2+0] = cof[si+k*taps+0]
+				dst[di+k*2+1] = cof[si+k*taps+1]
+			}
+			di += xwidth * 2
+			si += 2
+		}
+		si = di
+	}
+	copy(dst[di:di+left], cof[si:si+left])
+	return dst
+}
+
+// unpack expands a kernel so that it can be applied to pack interleaved
+// values per pixel. Each source row of taps coefficients becomes pack rows
+// of taps*pack coefficients: the original coefficients are spread pack
+// positions apart and every following row is the previous one moved right
+// by one position. Offsets are multiplied by pack and repeated for each of
+// the pack rows.
+// It returns the new coefficients, the new offsets and the new tap count.
+func unpack(coeffs, offsets []int16, taps, pack int) ([]int16, []int16, int) {
+	wide := pack * taps
+	cof := make([]int16, len(coeffs)*pack*pack)
+	off := make([]int16, len(offsets)*pack)
+	scratch := make([]int16, wide*2)
+	blank, row := scratch[:wide], scratch[wide:]
+	for n, offset := range offsets {
+		copy(row, blank)
+		for t, c := range coeffs[n*taps : (n+1)*taps] {
+			row[t*pack] = c
+		}
+		out := cof[n*wide*pack:]
+		for p := 0; p < pack; p++ {
+			off[n*pack+p] = offset * int16(pack)
+			copy(out[p*wide:], row)
+			// move the row right by one position and clear what was vacated
+			copy(row[p+1:], row[p:])
+			copy(row[:p+1], blank)
+		}
+	}
+	return cof, off, wide
+}
diff --git a/vendor/github.com/bamiaux/rez/mkscalers.sh b/vendor/github.com/bamiaux/rez/mkscalers.sh
new file mode 100644
index 00000000..9d18c7cc
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/mkscalers.sh
@@ -0,0 +1,7 @@
+#!/bin/sh
+# regenerates fixedscalers.go and the amd64 scalers; depends on https://github.com/jimmyfrasche/txt
+cat fixedscalers.go.input | txt -json fixedscalers.go.template > fixedscalers.go
+go fmt fixedscalers.go
+go install -v .../rez/rezgen
+rezgen -gen horizontal > hscalers_amd64.s && echo hscalers_amd64.s
+rezgen -gen vertical   > vscalers_amd64.s && echo vscalers_amd64.s
diff --git a/vendor/github.com/bamiaux/rez/resize.go b/vendor/github.com/bamiaux/rez/resize.go
new file mode 100644
index 00000000..9b8d8bb4
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/resize.go
@@ -0,0 +1,184 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+import (
+	"sync"
+)
+
+// ResizerConfig is the configuration used with NewResize. It describes a
+// resize along a single direction.
+type ResizerConfig struct {
+	Depth      int  // bits per pixel; NewResize currently forces this to 8
+	Input      int  // input size in pixels
+	Output     int  // output size in pixels
+	Vertical   bool // true for vertical resizes
+	Interlaced bool // true if input/output is interlaced
+	Pack       int  // pixels per pack [default=1]; NewResize raises values below 1 to 1
+	Threads    int  // number of threads, [default=0]
+	DisableAsm bool // disable asm optimisations
+}
+
+// Resizer is an interface that implements resizes along the single
+// direction it was configured for
+type Resizer interface {
+	// Resize one plane into another
+	// dst, src = destination and source buffer
+	// width, height = plane dimensions in pixels
+	// dstPitch, srcPitch = destination and source pitches/strides in bytes
+	Resize(dst, src []byte, width, height, dstPitch, srcPitch int)
+}
+
+// scaler is the signature shared by every scaling routine, Go or assembly.
+// dst, src = destination and source buffers
+// cof, off = kernel coefficients and offsets
+// taps = number of coefficients per output sample
+// width, height = size of the destination area
+// dstPitch, srcPitch = destination and source pitches in bytes
+type scaler func(dst, src []byte, cof, off []int16,
+	taps, width, height, dstPitch, srcPitch int)
+
+// context is the Resizer implementation returned by NewResize.
+type context struct {
+	// normalized copy of the caller's configuration
+	cfg     ResizerConfig
+	// one kernel, or two (one per field) for interlaced vertical resizes
+	kernels []kernel
+	// scaling routine selected for the kernel tap count
+	scaler  scaler
+}
+
+// getHorizontalScalerGo returns the pure Go horizontal scaler for the given
+// tap count: a dedicated routine for even counts from 2 to 12, the generic
+// one for everything else.
+func getHorizontalScalerGo(taps int) scaler {
+	// indexed by taps/2
+	fixed := [...]scaler{
+		1: h8scale2Go,
+		2: h8scale4Go,
+		3: h8scale6Go,
+		4: h8scale8Go,
+		5: h8scale10Go,
+		6: h8scale12Go,
+	}
+	if taps > 0 && taps%2 == 0 && taps/2 < len(fixed) {
+		return fixed[taps/2]
+	}
+	return h8scaleNGo
+}
+
+// getVerticalScalerGo returns the pure Go vertical scaler for the given tap
+// count: a dedicated routine for even counts from 2 to 12, the generic one
+// for everything else.
+func getVerticalScalerGo(taps int) scaler {
+	// indexed by taps/2
+	fixed := [...]scaler{
+		1: v8scale2Go,
+		2: v8scale4Go,
+		3: v8scale6Go,
+		4: v8scale8Go,
+		5: v8scale10Go,
+		6: v8scale12Go,
+	}
+	if taps > 0 && taps%2 == 0 && taps/2 < len(fixed) {
+		return fixed[taps/2]
+	}
+	return v8scaleNGo
+}
+
+// NewResize returns a new resizer
+// cfg = resize configuration
+// filter = filter used for computing weights
+//
+// The configuration is copied and normalized: Depth is forced to 8, Pack is
+// raised to at least 1 and Threads is raised to at least 1. Without the last
+// step the zero value of Threads would make Resize divide by zero when it
+// splits the plane into slices, and a negative value would resize nothing.
+func NewResize(cfg *ResizerConfig, filter Filter) Resizer {
+	ctx := context{
+		cfg: *cfg,
+	}
+	ctx.cfg.Depth = 8 // only 8-bit for now
+	if ctx.cfg.Pack < 1 {
+		ctx.cfg.Pack = 1
+	}
+	if ctx.cfg.Threads < 1 {
+		ctx.cfg.Threads = 1
+	}
+	ctx.kernels = []kernel{makeKernel(&ctx.cfg, filter, 0)}
+	// kernel.size holds the tap count, which selects the scaling routine
+	taps := ctx.kernels[0].size
+	if !cfg.Vertical {
+		ctx.scaler = getHorizontalScaler(taps, !cfg.DisableAsm)
+		return &ctx
+	}
+	ctx.scaler = getVerticalScaler(taps, !cfg.DisableAsm)
+	if cfg.Interlaced {
+		// interlaced vertical resizes need a second kernel for the other field
+		ctx.kernels = append(ctx.kernels, makeKernel(&ctx.cfg, filter, 1))
+	}
+	return &ctx
+}
+
+// dispatch runs job. With exactly one thread the job runs synchronously in
+// the calling goroutine; for any other thread count it is registered on
+// group and started in a new goroutine, and the caller is expected to wait
+// on group.
+func dispatch(group *sync.WaitGroup, threads int, job func()) {
+	if threads == 1 {
+		job()
+		return
+	}
+	group.Add(1)
+	go func() {
+		job()
+		group.Done()
+	}()
+}
+
+// scaleSlice hands one slice of a plane to scaler through dispatch, so the
+// call runs inline or in its own goroutine depending on threads.
+func scaleSlice(group *sync.WaitGroup, threads int, scaler scaler,
+	dst, src []byte, cof, off []int16, taps, width, height, dp, sp int) {
+	job := func() {
+		scaler(dst, src, cof, off, taps, width, height, dp, sp)
+	}
+	dispatch(group, threads, job)
+}
+
+// scaleSlices splits a plane into horizontal bands of rows and scales each
+// band with scaleSlice.
+// vertical tells whether the kernel applies along rows (coefficients and
+// offsets are then consumed per destination row) or along columns (every
+// band then uses the whole kernel).
+// width and height describe the destination area, dp and sp are the
+// destination and source pitches, cofscale is the coefficient replication
+// factor of the kernel.
+//
+// The number of bands is threads limited to the range [1, height], so no
+// band is ever empty or placed beyond the plane, and nothing is done for an
+// empty plane. When height >= threads >= 1 the split is: height/threads rows
+// per band, with the last band taking the remainder.
+func scaleSlices(group *sync.WaitGroup, scaler scaler,
+	vertical bool, threads, taps, width, height, dp, sp int,
+	dst, src []byte, cof []int16, cofscale int, off []int16) {
+	dispatch(group, threads, func() {
+		parts := threads
+		if parts < 1 {
+			parts = 1
+		}
+		if parts > height {
+			parts = height
+		}
+		if parts < 1 {
+			// height <= 0: nothing to scale
+			return
+		}
+		nh := height / parts
+		di := 0
+		si := 0
+		oi := 0
+		ci := 0
+		for i := 0; i < parts; i++ {
+			last := i+1 == parts
+			ih := nh
+			if last {
+				// the last band also takes the rows left by the division
+				ih = height - nh*(parts-1)
+			}
+			// number of kernel entries consumed by this band
+			next := width
+			if vertical {
+				next = ih
+			}
+			scaleSlice(group, threads, scaler,
+				dst[di:di+dp*(ih-1)+width],
+				src[si:],
+				cof[ci:ci+next*taps*cofscale],
+				off[oi:oi+next],
+				taps, width, ih, dp, sp)
+			if last {
+				break
+			}
+			di += ih * dp
+			if vertical {
+				ci += ih * taps * cofscale
+				// vertical offsets are deltas: add them up to find the
+				// source row where the next band starts
+				for j := 0; j < ih; j++ {
+					si += sp * int(off[oi+j])
+				}
+				oi += ih
+			} else {
+				si += sp * ih
+			}
+		}
+	})
+}
+
+func (c *context) Resize(dst, src []byte, width, height, dp, sp int) {
+	field := bin(c.cfg.Vertical && c.cfg.Interlaced)
+	dwidth := c.cfg.Output
+	dheight := height
+	if c.cfg.Vertical {
+		dwidth = width
+	}
+	pk := c.cfg.Pack
+	group := sync.WaitGroup{}
+	for i, k := range c.kernels[:1+field] {
+		if c.cfg.Vertical {
+			dheight = (c.cfg.Output + (1-i)*int(field)) >> field
+		}
+		scaleSlices(&group, c.scaler, c.cfg.Vertical, c.cfg.Threads,
+			k.size, dwidth*pk, dheight, dp<<field, sp<<field,
+			dst[dp*i:], src[sp*i:], k.coeffs, k.cofscale, k.offsets)
+	}
+	group.Wait()
+}
diff --git a/vendor/github.com/bamiaux/rez/scalers.go b/vendor/github.com/bamiaux/rez/scalers.go
new file mode 100644
index 00000000..c42c2697
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/scalers.go
@@ -0,0 +1,88 @@
+// Copyright 2013 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+import (
+	"math"
+)
+
+const (
+	// Bits exports the number of significant bits used by kernels:
+	// coefficients are scaled by 1<<Bits and scaled sums are shifted right
+	// by Bits after rounding
+	Bits = 14
+)
+
+// u8 converts x to a byte, saturating at 0 and 255.
+func u8(x int) byte {
+	switch {
+	case x < 0:
+		return 0
+	case x > 0xFF:
+		return 0xFF
+	}
+	return byte(x)
+}
+
+// copyPlane copies height rows of width bytes from src to dst.
+// dp and sp are the distances in bytes between the starts of consecutive
+// rows in dst and src.
+func copyPlane(dst, src []byte, width, height, dp, sp int) {
+	for y := 0; y < height; y++ {
+		to, from := y*dp, y*sp
+		copy(dst[to:to+width], src[from:from+width])
+	}
+}
+
+func psnrPlane(dst, src []byte, width, height, dp, sp int) float64 {
+	mse := 0
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		for x, v := range src[si : si+width] {
+			n := int(v) - int(dst[di+x])
+			mse += n * n
+		}
+		di += dp
+		si += sp
+	}
+	fmse := float64(mse) / float64(width*height)
+	return 10 * math.Log10(255*255/fmse)
+}
+
+// h8scaleNGo is the generic pure Go horizontal scaler for 8-bit samples.
+// For each of the height rows and each of the width destination samples it
+// sums taps consecutive source samples, starting at the sample's offset,
+// weighted by the sample's coefficients, then rounds, shifts right by Bits
+// and saturates the result to a byte.
+// dst, src = destination and source buffers
+// cof = taps coefficients per destination sample
+// off = source offset of each destination sample
+// dp, sp = destination and source pitches in bytes
+func h8scaleNGo(dst, src []byte, cof, off []int16,
+	taps, width, height, dp, sp int) {
+	di := 0
+	si := 0
+	for y := 0; y < height; y++ {
+		c := cof
+		s := src[si:]
+		d := dst[di:]
+		for x, xoff := range off[:width] {
+			// widen the offset before adding taps: done in int16 the sum
+			// wraps around for offsets close to the int16 maximum
+			start := int(xoff)
+			pix := 0
+			for i, v := range s[start : start+taps] {
+				pix += int(v) * int(c[i])
+			}
+			d[x] = u8((pix + 1<<(Bits-1)) >> Bits)
+			c = c[taps:]
+		}
+		di += dp
+		si += sp
+	}
+}
+
+func v8scaleNGo(dst, src []byte, cof, off []int16,
+	taps, width, height, dp, sp int) {
+	di := 0
+	for _, yoff := range off[:height] {
+		src = src[sp*int(yoff):]
+		for x := range dst[di : di+width] {
+			pix := 0
+			for i, c := range cof[:taps] {
+				pix += int(c) * int(src[sp*i+x])
+			}
+			dst[di+x] = u8((pix + 1<<(Bits-1)) >> Bits)
+		}
+		cof = cof[taps:]
+		di += dp
+	}
+}
diff --git a/vendor/github.com/bamiaux/rez/scalers_amd64.go b/vendor/github.com/bamiaux/rez/scalers_amd64.go
new file mode 100644
index 00000000..fc9766e0
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/scalers_amd64.go
@@ -0,0 +1,61 @@
+// Copyright 2014 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+// hasAsm reports whether assembly scalers are available; this is the amd64
+// implementation, which always answers true.
+func hasAsm() bool {
+	return true
+}
+
+// Prototypes of the scalers implemented in amd64 assembly. All of them share
+// the scaler signature. The number in each name is the tap count the routine
+// is selected for; the N variants are used for every other tap count.
+
+// horizontal scalers (no 6-tap variant is declared)
+func h8scale2Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func h8scale4Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func h8scale8Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func h8scale10Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func h8scale12Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func h8scaleNAmd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+
+// vertical scalers
+func v8scale2Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func v8scale4Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func v8scale6Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func v8scale8Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func v8scale10Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func v8scale12Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+func v8scaleNAmd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+
+// getHorizontalScaler returns the horizontal scaler for the given tap count.
+// With asm false the pure Go routines are used. Otherwise 2, 4, 8, 10 and 12
+// taps map to their dedicated assembly routine and every other count,
+// 6 included, maps to the generic assembly routine.
+func getHorizontalScaler(taps int, asm bool) scaler {
+	if !asm {
+		return getHorizontalScalerGo(taps)
+	}
+	// indexed by taps/2; entries left nil fall back to the generic routine
+	fixed := [...]scaler{
+		1: h8scale2Amd64,
+		2: h8scale4Amd64,
+		4: h8scale8Amd64,
+		5: h8scale10Amd64,
+		6: h8scale12Amd64,
+	}
+	if taps > 0 && taps%2 == 0 && taps/2 < len(fixed) {
+		if fn := fixed[taps/2]; fn != nil {
+			return fn
+		}
+	}
+	return h8scaleNAmd64
+}
+
+// getVerticalScaler returns the vertical scaler for the given tap count.
+// With asm false the pure Go routines are used. Otherwise even counts from
+// 2 to 12 map to their dedicated assembly routine and every other count maps
+// to the generic assembly routine.
+func getVerticalScaler(taps int, asm bool) scaler {
+	if !asm {
+		return getVerticalScalerGo(taps)
+	}
+	// indexed by taps/2
+	fixed := [...]scaler{
+		1: v8scale2Amd64,
+		2: v8scale4Amd64,
+		3: v8scale6Amd64,
+		4: v8scale8Amd64,
+		5: v8scale10Amd64,
+		6: v8scale12Amd64,
+	}
+	if taps > 0 && taps%2 == 0 && taps/2 < len(fixed) {
+		return fixed[taps/2]
+	}
+	return v8scaleNAmd64
+}
diff --git a/vendor/github.com/bamiaux/rez/scalers_gen.go b/vendor/github.com/bamiaux/rez/scalers_gen.go
new file mode 100644
index 00000000..e972dac6
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/scalers_gen.go
@@ -0,0 +1,17 @@
+// Copyright 2014 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64
+
+package rez
+
+// hasAsm reports whether assembly scalers are available; this is the
+// implementation for non-amd64 builds, which always answers false.
+func hasAsm() bool {
+	return false
+}
+
+// getHorizontalScaler returns the pure Go horizontal scaler for the given
+// tap count. The asm argument is ignored in this build.
+func getHorizontalScaler(taps int, asm bool) scaler {
+	fn := getHorizontalScalerGo(taps)
+	return fn
+}
+
+// getVerticalScaler returns the pure Go vertical scaler for the given tap
+// count. The asm argument is ignored in this build.
+func getVerticalScaler(taps int, asm bool) scaler {
+	fn := getVerticalScalerGo(taps)
+	return fn
+}
diff --git a/vendor/github.com/bamiaux/rez/utils.go b/vendor/github.com/bamiaux/rez/utils.go
new file mode 100644
index 00000000..5af5ec19
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/utils.go
@@ -0,0 +1,43 @@
+// Copyright 2014 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+package rez
+
+import (
+	"fmt"
+	"image"
+	"os"
+)
+
+// dumpPlane writes the rows of plane p to the file "<prefix>_<idx>.raw",
+// without the padding between rows.
+// A row is Width*Pack bytes long, the same row length Psnr compares, so
+// packed planes are dumped completely; a Pack below 1 is treated as 1.
+// It returns the first error met while creating, writing or closing the
+// file.
+func dumpPlane(prefix string, p *Plane, idx int) (err error) {
+	fh, err := os.Create(fmt.Sprintf("%v_%v.raw", prefix, idx))
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// a failed Close can mean lost data: report it unless an earlier
+		// error is already being returned
+		if cerr := fh.Close(); err == nil {
+			err = cerr
+		}
+	}()
+	pack := p.Pack
+	if pack < 1 {
+		pack = 1
+	}
+	rowBytes := p.Width * pack
+	si := 0
+	for y := 0; y < p.Height; y++ {
+		if _, err = fh.Write(p.Data[si : si+rowBytes]); err != nil {
+			return err
+		}
+		si += p.Pitch
+	}
+	return nil
+}
+
+// DumpImage writes every plane of img to its own file. The files are named
+// from prefix and the plane index. It stops at the first error.
+func DumpImage(prefix string, img image.Image) error {
+	_, planes, err := inspect(img, false)
+	if err != nil {
+		return err
+	}
+	for i := range planes {
+		if err := dumpPlane(prefix, &planes[i], i); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/vendor/github.com/bamiaux/rez/vscalers_amd64.s b/vendor/github.com/bamiaux/rez/vscalers_amd64.s
new file mode 100644
index 00000000..33e71ec3
--- /dev/null
+++ b/vendor/github.com/bamiaux/rez/vscalers_amd64.s
@@ -0,0 +1,1530 @@
+// Copyright 2014 Benoît Amiaux. All rights reserved.
+// Use of this source code is governed by a MIT-style
+// license that can be found in the LICENSE file.
+
+// This file is auto-generated - do not modify
+
+// zero_0: 16 zero bytes. The scalers load it into X14 and interleave it
+// with pixel data to widen it to 16-bit words.
+DATA	zero_0<>+0x00(SB)/8, $0x0000000000000000
+DATA	zero_0<>+0x08(SB)/8, $0x0000000000000000
+GLOBL	zero_0<>(SB), 8, $16
+// hbits_1: four 32-bit lanes holding 0x2000 (1<<13). The scalers load it
+// into X13 and add it to every sum before the 14-bit right shift, which
+// rounds the result.
+DATA	hbits_1<>+0x00(SB)/8, $0x0000200000002000
+DATA	hbits_1<>+0x08(SB)/8, $0x0000200000002000
+GLOBL	hbits_1<>(SB), 8, $16
+
+// func v8scale2Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+//
+// Vertical scaler using 2 source rows per destination row, 16 pixels at a
+// time. For every destination row the source pointer first advances by
+// off[y]*sp, then each output byte is
+// (row0*c0 + row1*c1 + 0x2000) >> 14, saturated to 0..255.
+// Each destination row consumes 32 bytes of cof, of which the first 16 are
+// loaded, and one int16 of off.
+// When width is not a multiple of 16, the last block of a row is processed
+// again 16-(width%16) bytes earlier so that it ends exactly at width.
+// NOTE(review): for width < 16 that block starts before the beginning of
+// the row - confirm callers never pass such widths.
+TEXT ·v8scale2Amd64(SB),4,$0-136
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX
+		ANDQ	$15, DX
+		SHRQ	$4, CX
+		// R11 = dp-width, R12 = number of full 16-byte blocks per row
+		MOVQ	BX, R11
+		MOVQ	CX, R12
+		MOVQ	DX, AX
+		ORQ	AX, AX
+		JE	norollback_0
+		SUBQ	$16, DX
+		NEGQ	DX
+norollback_0:
+		// R13 = 16-(width%16), or 0 when width is a multiple of 16
+		MOVQ	DX, R13
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10
+		MOVO	zero_0<>(SB), X14
+		MOVO	hbits_1<>(SB), X13
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9
+		MOVQ	dst+0(FP), DI
+		MOVQ	cof+48(FP), BP
+		MOVQ	sp+128(FP), BX
+yloop_1:
+		// advance the source row pointer kept in R9 by off[y]*sp
+		// (MULQ overwrites DX)
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX
+		MULQ	BX
+		ADDQ	AX, SI
+		MOVQ	SI, R9
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_2
+maxloop_3:
+		// X12 = coefficient pairs, X0/X3 = 16 pixels of rows 0 and 1
+		MOVOU	(BP), X12
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVO	X0, X2
+		PUNPCKLBW	X3, X0
+		PUNPCKHBW	X3, X2
+		MOVO	X0, X1
+		MOVO	X2, X3
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PMADDWL	X12, X0
+		PMADDWL	X12, X1
+		PMADDWL	X12, X2
+		PMADDWL	X12, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_3
+nomaxloop_2:
+		// step back by R13 and process the last, partly overlapping block
+		MOVQ	R13, CX
+		SUBQ	R13, SI
+		SUBQ	R13, DI
+		ORQ	CX, CX
+		JE	nobackroll_4
+		MOVOU	(BP), X12
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVO	X0, X2
+		PUNPCKLBW	X3, X0
+		PUNPCKHBW	X3, X2
+		MOVO	X0, X1
+		MOVO	X2, X3
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PMADDWL	X12, X0
+		PMADDWL	X12, X1
+		PMADDWL	X12, X2
+		PMADDWL	X12, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_4:
+		// next destination row, next coefficients, next offset
+		ADDQ	R11, DI
+		ADDQ	$32, BP
+		ADDQ	$2, R10
+		SUBQ	$1, height+112(FP)
+		JNE	yloop_1
+		RET
+
+// func v8scale4Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+//
+// Vertical scaler using 4 source rows per destination row, 16 pixels at a
+// time. For every destination row the source pointer first advances by
+// off[y]*sp, then each output byte is the sum of the 4 weighted rows plus
+// 0x2000, shifted right by 14 and saturated to 0..255.
+// Each destination row consumes 64 bytes of cof (coefficient pairs read at
+// byte offsets 0 and 32) and one int16 of off.
+// When width is not a multiple of 16, the last block of a row is processed
+// again 16-(width%16) bytes earlier so that it ends exactly at width.
+// NOTE(review): for width < 16 that block starts before the beginning of
+// the row - confirm callers never pass such widths.
+TEXT ·v8scale4Amd64(SB),4,$0-136
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX
+		ANDQ	$15, DX
+		SHRQ	$4, CX
+		// R11 = dp-width, R12 = number of full 16-byte blocks per row
+		MOVQ	BX, R11
+		MOVQ	CX, R12
+		MOVQ	DX, AX
+		ORQ	AX, AX
+		JE	norollback_5
+		SUBQ	$16, DX
+		NEGQ	DX
+norollback_5:
+		// R13 = 16-(width%16), or 0 when width is a multiple of 16
+		MOVQ	DX, R13
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10
+		MOVO	zero_0<>(SB), X14
+		MOVO	hbits_1<>(SB), X13
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9
+		MOVQ	dst+0(FP), DI
+		MOVQ	cof+48(FP), BP
+		MOVQ	sp+128(FP), BX
+yloop_6:
+		// advance the source row pointer kept in R9 by off[y]*sp
+		// (MULQ overwrites DX)
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX
+		MULQ	BX
+		ADDQ	AX, SI
+		MOVQ	SI, R9
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_7
+maxloop_8:
+		// rows 0..3 are read at SI, SI+sp, SI+2*sp and (SI+sp)+2*sp
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_8
+nomaxloop_7:
+		// step back by R13 and process the last, partly overlapping block
+		MOVQ	R13, CX
+		SUBQ	R13, SI
+		SUBQ	R13, DI
+		ORQ	CX, CX
+		JE	nobackroll_9
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_9:
+		// next destination row, next coefficients, next offset
+		ADDQ	R11, DI
+		ADDQ	$64, BP
+		ADDQ	$2, R10
+		SUBQ	$1, height+112(FP)
+		JNE	yloop_6
+		RET
+
+// func v8scale6Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+//
+// Vertical scaler using 6 source rows per destination row, 16 pixels at a
+// time. For every destination row the source pointer first advances by
+// off[y]*sp, then each output byte is the sum of the 6 weighted rows plus
+// 0x2000, shifted right by 14 and saturated to 0..255.
+// Each destination row consumes 96 bytes of cof (coefficient pairs read at
+// byte offsets 0, 32 and 64) and one int16 of off.
+// When width is not a multiple of 16, the last block of a row is processed
+// again 16-(width%16) bytes earlier so that it ends exactly at width.
+// NOTE(review): for width < 16 that block starts before the beginning of
+// the row - confirm callers never pass such widths.
+TEXT ·v8scale6Amd64(SB),4,$0-136
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX
+		ANDQ	$15, DX
+		SHRQ	$4, CX
+		// R11 = dp-width, R12 = number of full 16-byte blocks per row
+		MOVQ	BX, R11
+		MOVQ	CX, R12
+		MOVQ	DX, AX
+		ORQ	AX, AX
+		JE	norollback_10
+		SUBQ	$16, DX
+		NEGQ	DX
+norollback_10:
+		// R13 = 16-(width%16), or 0 when width is a multiple of 16
+		MOVQ	DX, R13
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10
+		MOVO	zero_0<>(SB), X14
+		MOVO	hbits_1<>(SB), X13
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9
+		MOVQ	dst+0(FP), DI
+		MOVQ	cof+48(FP), BP
+		MOVQ	sp+128(FP), BX
+yloop_11:
+		// advance the source row pointer kept in R9 by off[y]*sp
+		// (MULQ overwrites DX)
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX
+		MULQ	BX
+		ADDQ	AX, SI
+		MOVQ	SI, R9
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_12
+maxloop_13:
+		// AX = address of row 4; rows 0..3 are read through SI
+		LEAQ	(SI)(BX*4), AX
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		// rows 4 and 5, weighted by the coefficient pair at 64(BP)
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_13
+nomaxloop_12:
+		// step back by R13 and process the last, partly overlapping block
+		MOVQ	R13, CX
+		SUBQ	R13, SI
+		SUBQ	R13, DI
+		ORQ	CX, CX
+		JE	nobackroll_14
+		LEAQ	(SI)(BX*4), AX
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_14:
+		// next destination row, next coefficients, next offset
+		ADDQ	R11, DI
+		ADDQ	$96, BP
+		ADDQ	$2, R10
+		SUBQ	$1, height+112(FP)
+		JNE	yloop_11
+		RET
+
+// func v8scale8Amd64(dst, src []byte, cof, off []int16, taps, width, height, dp, sp int)
+//
+// Vertical scaler using 8 source rows per destination row, 16 pixels at a
+// time. For every destination row the source pointer first advances by
+// off[y]*sp, then each output byte is the sum of the 8 weighted rows plus
+// 0x2000, shifted right by 14 and saturated to 0..255.
+// Each destination row consumes 128 bytes of cof (coefficient pairs read at
+// byte offsets 0, 32, 64 and 96) and one int16 of off.
+// When width is not a multiple of 16, the last block of a row is processed
+// again 16-(width%16) bytes earlier so that it ends exactly at width.
+// NOTE(review): for width < 16 that block starts before the beginning of
+// the row - confirm callers never pass such widths.
+TEXT ·v8scale8Amd64(SB),4,$0-136
+		MOVQ	dp+120(FP), BX
+		MOVQ	width+104(FP), CX
+		MOVQ	CX, DX
+		SUBQ	CX, BX
+		ANDQ	$15, DX
+		SHRQ	$4, CX
+		// R11 = dp-width, R12 = number of full 16-byte blocks per row
+		MOVQ	BX, R11
+		MOVQ	CX, R12
+		MOVQ	DX, AX
+		ORQ	AX, AX
+		JE	norollback_15
+		SUBQ	$16, DX
+		NEGQ	DX
+norollback_15:
+		// R13 = 16-(width%16), or 0 when width is a multiple of 16
+		MOVQ	DX, R13
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10
+		MOVO	zero_0<>(SB), X14
+		MOVO	hbits_1<>(SB), X13
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9
+		MOVQ	dst+0(FP), DI
+		MOVQ	cof+48(FP), BP
+		MOVQ	sp+128(FP), BX
+yloop_16:
+		// advance the source row pointer kept in R9 by off[y]*sp
+		// (MULQ overwrites DX)
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX
+		MULQ	BX
+		ADDQ	AX, SI
+		MOVQ	SI, R9
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_17
+maxloop_18:
+		// AX = address of row 4; rows 0..3 are read through SI
+		LEAQ	(SI)(BX*4), AX
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		// rows 4 and 5, weighted by the coefficient pair at 64(BP)
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		// rows 6 and 7, weighted by the coefficient pair at 96(BP)
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	96(BP), X4
+		PMADDWL	96(BP), X5
+		PMADDWL	96(BP), X6
+		PMADDWL	96(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_18
+nomaxloop_17:
+		// step back by R13 and process the last, partly overlapping block
+		MOVQ	R13, CX
+		SUBQ	R13, SI
+		SUBQ	R13, DI
+		ORQ	CX, CX
+		JE	nobackroll_19
+		LEAQ	(SI)(BX*4), AX
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	96(BP), X4
+		PMADDWL	96(BP), X5
+		PMADDWL	96(BP), X6
+		PMADDWL	96(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_19:
+		// next destination row, next coefficients, next offset
+		ADDQ	R11, DI
+		ADDQ	$128, BP
+		ADDQ	$2, R10
+		SUBQ	$1, height+112(FP)
+		JNE	yloop_16
+		RET
+
+TEXT ·v8scale10Amd64(SB),4,$0-136	// 10-row vertical filter on 8-bit samples: each output byte is a weighted sum of 10 source rows, 16 bytes per step, `height` rows in total (flag 4 is NOSPLIT in Go's textflag.h; frame 0, 136 bytes of arguments)
+		MOVQ	dp+120(FP), BX	// BX = dp (bytes between destination rows)
+		MOVQ	width+104(FP), CX	// CX = width (bytes written per output row)
+		MOVQ	CX, DX
+		SUBQ	CX, BX	// BX = dp - width
+		ANDQ	$15, DX	// DX = width % 16
+		SHRQ	$4, CX	// CX = width / 16
+		MOVQ	BX, R11	// R11 = dp - width, added to DI at the end of every row
+		MOVQ	CX, R12	// R12 = number of full 16-byte blocks per row
+		MOVQ	DX, AX
+		ORQ	AX, AX	// test width % 16
+		JE	norollback_20	// multiple of 16: no tail block, rollback stays 0
+		SUBQ	$16, DX
+		NEGQ	DX	// DX = 16 - width % 16
+norollback_20:
+		MOVQ	DX, R13	// R13 = bytes to step back so the tail block ends exactly at width
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10	// R10 = cursor into off (one 16-bit entry consumed per output row)
+		MOVO	zero_0<>(SB), X14	// X14 = zero_0 (defined outside this view; presumably all zero), used to widen bytes to words
+		MOVO	hbits_1<>(SB), X13	// X13 = hbits_1 (defined outside this view; presumably the rounding bias for the >>14 below)
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9	// R9 = start of the current source row window
+		MOVQ	dst+0(FP), DI	// DI = output cursor
+		MOVQ	cof+48(FP), BP	// BP = coefficient cursor
+		MOVQ	sp+128(FP), BX	// BX = sp (bytes between source rows)
+yloop_21:	// one output row per iteration
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX	// AX = sign-extended 16-bit offset for this row
+		MULQ	BX	// AX = offset * sp (MULQ also overwrites DX)
+		ADDQ	AX, SI	// move the source window down by that many rows
+		MOVQ	SI, R9	// offsets accumulate: the next row starts from this position
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_22	// no full block (width < 16): only the tail block runs
+maxloop_23:	// one full 16-byte block per iteration
+		LEAQ	(SI)(BX*4), AX	// AX = address of source row 4
+		MOVOU	(SI), X0	// row 0
+		MOVOU	(SI)(BX*1), X3	// row 1
+		MOVOU	(SI)(BX*2), X4	// row 2
+		MOVOU	(BP), X10	// coefficient words applied to rows 0/1
+		MOVOU	32(BP), X11	// coefficient words applied to rows 2/3
+		ADDQ	BX, SI	// step one row down temporarily to address row 3
+		MOVOU	(SI)(BX*2), X7	// row 3
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0	// interleave rows 0/1, bytes 0-7
+		PUNPCKLBW	X7, X4	// interleave rows 2/3, bytes 0-7
+		PUNPCKHBW	X3, X2	// interleave rows 0/1, bytes 8-15
+		PUNPCKHBW	X7, X6	// interleave rows 2/3, bytes 8-15
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI	// restore SI to row 0
+		PUNPCKLBW	X14, X0	// widen interleaved bytes to 16-bit words (X14 supplies the high bytes)
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0	// multiply words by coefficients, add adjacent products into 32-bit lanes (rows 0/1)
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4	// same for rows 2/3
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0	// X0..X3 = running sums for output bytes 0-3, 4-7, 8-11, 12-15
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 4
+		MOVOU	(AX)(BX*1), X7	// row 5
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4	// coefficients for rows 4/5
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX -> row 6
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 6
+		MOVOU	(AX)(BX*1), X7	// row 7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	96(BP), X4	// coefficients for rows 6/7
+		PMADDWL	96(BP), X5
+		PMADDWL	96(BP), X6
+		PMADDWL	96(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX -> row 8
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 8
+		MOVOU	(AX)(BX*1), X7	// row 9
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	128(BP), X4	// coefficients for rows 8/9
+		PMADDWL	128(BP), X5
+		PMADDWL	128(BP), X6
+		PMADDWL	128(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX advanced past row 9; not read again in this block
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0	// add the hbits_1 bias before shifting
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0	// arithmetic shift right by 14 bits
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0	// 32-bit -> 16-bit with signed saturation (output bytes 0-7)
+		PACKSSLW	X3, X2	// output bytes 8-15
+		PACKUSWB	X2, X0	// 16-bit -> 8-bit with unsigned saturation
+		MOVOU	X0, (DI)	// store 16 output bytes
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_23
+nomaxloop_22:	// tail: redo one 16-byte block that ends exactly at column `width`
+		MOVQ	R13, CX
+		SUBQ	R13, SI	// step back by the rollback amount (overlaps bytes already written)
+		SUBQ	R13, DI	// NOTE(review): when width < 16 this points before the row start - presumably callers guarantee width >= 16 or padded buffers; confirm
+		ORQ	CX, CX
+		JE	nobackroll_24	// width was a multiple of 16: nothing left to do for this row
+		LEAQ	(SI)(BX*4), AX	// same computation as the main loop body, executed once
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 4/5
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 6/7
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	96(BP), X4
+		PMADDWL	96(BP), X5
+		PMADDWL	96(BP), X6
+		PMADDWL	96(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 8/9
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	128(BP), X4
+		PMADDWL	128(BP), X5
+		PMADDWL	128(BP), X6
+		PMADDWL	128(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0	// bias, shift, saturate and store exactly as in the main loop
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_24:
+		ADDQ	R11, DI	// DI has advanced by width; adding dp - width lands on the next destination row
+		ADDQ	$160, BP	// next row's coefficients: 160 bytes are consumed per output row
+		ADDQ	$2, R10	// next 16-bit entry of off
+		SUBQ	$1, height+112(FP)	// the row counter lives in the height argument slot and is decremented in place
+		JNE	yloop_21
+		RET
+
+TEXT ·v8scale12Amd64(SB),4,$0-136	// 12-row vertical filter on 8-bit samples: each output byte is a weighted sum of 12 source rows, 16 bytes per step, `height` rows in total (flag 4 is NOSPLIT in Go's textflag.h; frame 0, 136 bytes of arguments)
+		MOVQ	dp+120(FP), BX	// BX = dp (bytes between destination rows)
+		MOVQ	width+104(FP), CX	// CX = width (bytes written per output row)
+		MOVQ	CX, DX
+		SUBQ	CX, BX	// BX = dp - width
+		ANDQ	$15, DX	// DX = width % 16
+		SHRQ	$4, CX	// CX = width / 16
+		MOVQ	BX, R11	// R11 = dp - width, added to DI at the end of every row
+		MOVQ	CX, R12	// R12 = number of full 16-byte blocks per row
+		MOVQ	DX, AX
+		ORQ	AX, AX	// test width % 16
+		JE	norollback_25	// multiple of 16: no tail block, rollback stays 0
+		SUBQ	$16, DX
+		NEGQ	DX	// DX = 16 - width % 16
+norollback_25:
+		MOVQ	DX, R13	// R13 = bytes to step back so the tail block ends exactly at width
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10	// R10 = cursor into off (one 16-bit entry consumed per output row)
+		MOVO	zero_0<>(SB), X14	// X14 = zero_0 (defined outside this view; presumably all zero), used to widen bytes to words
+		MOVO	hbits_1<>(SB), X13	// X13 = hbits_1 (defined outside this view; presumably the rounding bias for the >>14 below)
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9	// R9 = start of the current source row window
+		MOVQ	dst+0(FP), DI	// DI = output cursor
+		MOVQ	cof+48(FP), BP	// BP = coefficient cursor
+		MOVQ	sp+128(FP), BX	// BX = sp (bytes between source rows)
+yloop_26:	// one output row per iteration
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX	// AX = sign-extended 16-bit offset for this row
+		MULQ	BX	// AX = offset * sp (MULQ also overwrites DX)
+		ADDQ	AX, SI	// move the source window down by that many rows
+		MOVQ	SI, R9	// offsets accumulate: the next row starts from this position
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_27	// no full block (width < 16): only the tail block runs
+maxloop_28:	// one full 16-byte block per iteration
+		LEAQ	(SI)(BX*4), AX	// AX = address of source row 4
+		MOVOU	(SI), X0	// row 0
+		MOVOU	(SI)(BX*1), X3	// row 1
+		MOVOU	(SI)(BX*2), X4	// row 2
+		MOVOU	(BP), X10	// coefficient words applied to rows 0/1
+		MOVOU	32(BP), X11	// coefficient words applied to rows 2/3
+		ADDQ	BX, SI	// step one row down temporarily to address row 3
+		MOVOU	(SI)(BX*2), X7	// row 3
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0	// interleave rows 0/1, bytes 0-7
+		PUNPCKLBW	X7, X4	// interleave rows 2/3, bytes 0-7
+		PUNPCKHBW	X3, X2	// interleave rows 0/1, bytes 8-15
+		PUNPCKHBW	X7, X6	// interleave rows 2/3, bytes 8-15
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI	// restore SI to row 0
+		PUNPCKLBW	X14, X0	// widen interleaved bytes to 16-bit words (X14 supplies the high bytes)
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0	// multiply words by coefficients, add adjacent products into 32-bit lanes (rows 0/1)
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4	// same for rows 2/3
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0	// X0..X3 = running sums for output bytes 0-3, 4-7, 8-11, 12-15
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 4
+		MOVOU	(AX)(BX*1), X7	// row 5
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4	// coefficients for rows 4/5
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX -> row 6
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 6
+		MOVOU	(AX)(BX*1), X7	// row 7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	96(BP), X4	// coefficients for rows 6/7
+		PMADDWL	96(BP), X5
+		PMADDWL	96(BP), X6
+		PMADDWL	96(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX -> row 8
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 8
+		MOVOU	(AX)(BX*1), X7	// row 9
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	128(BP), X4	// coefficients for rows 8/9
+		PMADDWL	128(BP), X5
+		PMADDWL	128(BP), X6
+		PMADDWL	128(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX -> row 10
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// row 10
+		MOVOU	(AX)(BX*1), X7	// row 11
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	160(BP), X4	// coefficients for rows 10/11
+		PMADDWL	160(BP), X5
+		PMADDWL	160(BP), X6
+		PMADDWL	160(BP), X7
+		LEAQ	(AX)(BX*2), AX	// AX advanced past row 11; not read again in this block
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0	// add the hbits_1 bias before shifting
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0	// arithmetic shift right by 14 bits
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0	// 32-bit -> 16-bit with signed saturation (output bytes 0-7)
+		PACKSSLW	X3, X2	// output bytes 8-15
+		PACKUSWB	X2, X0	// 16-bit -> 8-bit with unsigned saturation
+		MOVOU	X0, (DI)	// store 16 output bytes
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_28
+nomaxloop_27:	// tail: redo one 16-byte block that ends exactly at column `width`
+		MOVQ	R13, CX
+		SUBQ	R13, SI	// step back by the rollback amount (overlaps bytes already written)
+		SUBQ	R13, DI	// NOTE(review): when width < 16 this points before the row start - presumably callers guarantee width >= 16 or padded buffers; confirm
+		ORQ	CX, CX
+		JE	nobackroll_29	// width was a multiple of 16: nothing left to do for this row
+		LEAQ	(SI)(BX*4), AX	// same computation as the main loop body, executed once
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 4/5
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	64(BP), X4
+		PMADDWL	64(BP), X5
+		PMADDWL	64(BP), X6
+		PMADDWL	64(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 6/7
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	96(BP), X4
+		PMADDWL	96(BP), X5
+		PMADDWL	96(BP), X6
+		PMADDWL	96(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 8/9
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	128(BP), X4
+		PMADDWL	128(BP), X5
+		PMADDWL	128(BP), X6
+		PMADDWL	128(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVOU	(AX), X4	// rows 10/11
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	160(BP), X4
+		PMADDWL	160(BP), X5
+		PMADDWL	160(BP), X6
+		PMADDWL	160(BP), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		PADDL	X13, X0	// bias, shift, saturate and store exactly as in the main loop
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_29:
+		ADDQ	R11, DI	// DI has advanced by width; adding dp - width lands on the next destination row
+		ADDQ	$192, BP	// next row's coefficients: 192 bytes are consumed per output row
+		ADDQ	$2, R10	// next 16-bit entry of off
+		SUBQ	$1, height+112(FP)	// the row counter lives in the height argument slot and is decremented in place
+		JNE	yloop_26
+		RET
+
+TEXT ·v8scaleNAmd64(SB),4,$0-136	// vertical filter on 8-bit samples with a run-time row count `taps`: rows 0-3 are unrolled, the remaining rows are handled two at a time in an inner loop; 16 bytes per step, `height` rows in total (flag 4 is NOSPLIT in Go's textflag.h; frame 0, 136 bytes of arguments)
+		MOVQ	dp+120(FP), BX	// BX = dp (bytes between destination rows)
+		MOVQ	width+104(FP), CX	// CX = width (bytes written per output row)
+		MOVQ	CX, DX
+		SUBQ	CX, BX	// BX = dp - width
+		ANDQ	$15, DX	// DX = width % 16
+		SHRQ	$4, CX	// CX = width / 16
+		MOVQ	BX, R11	// R11 = dp - width, added to DI at the end of every row
+		MOVQ	CX, R12	// R12 = number of full 16-byte blocks per row
+		MOVQ	DX, AX
+		ORQ	AX, AX	// test width % 16
+		JE	norollback_30	// multiple of 16: no tail block, rollback stays 0
+		SUBQ	$16, DX
+		NEGQ	DX	// DX = 16 - width % 16
+norollback_30:
+		MOVQ	DX, R13	// R13 = bytes to step back so the tail block ends exactly at width
+		MOVQ	off+72(FP), CX
+		MOVQ	CX, R10	// R10 = cursor into off (one 16-bit entry consumed per output row)
+		MOVO	zero_0<>(SB), X14	// X14 = zero_0 (defined outside this view; presumably all zero), used to widen bytes to words
+		MOVO	hbits_1<>(SB), X13	// X13 = hbits_1 (defined outside this view; presumably the rounding bias for the >>14 below)
+		MOVQ	taps+96(FP), DX
+		SUBQ	$4, DX	// the first four rows are handled outside the inner loop
+		SHRQ	$1, DX	// two rows per inner-loop iteration
+		MOVQ	DX, R14	// R14 = (taps - 4) / 2 inner iterations. NOTE(review): the inner loop is do-while shaped, so taps == 4 would wrap the counter - presumably callers only pass taps > 4; confirm
+		MOVQ	src+24(FP), SI
+		MOVQ	SI, R9	// R9 = start of the current source row window
+		MOVQ	dst+0(FP), DI	// DI = output cursor
+		MOVQ	cof+48(FP), BP	// BP = coefficient cursor
+		MOVQ	sp+128(FP), BX	// BX = sp (bytes between source rows)
+yloop_31:	// one output row per iteration
+		MOVQ	R9, SI
+		MOVQ	R10, DX
+		MOVWQSX	(DX), AX	// AX = sign-extended 16-bit offset for this row
+		MULQ	BX	// AX = offset * sp (MULQ also overwrites DX)
+		ADDQ	AX, SI	// move the source window down by that many rows
+		MOVQ	SI, R9	// offsets accumulate: the next row starts from this position
+		MOVQ	R12, CX
+		ORQ	CX, CX
+		JE	nomaxloop_32	// no full block (width < 16): only the tail block runs
+maxloop_33:	// one full 16-byte block per iteration
+		LEAQ	(SI)(BX*4), AX	// AX = address of source row 4
+		MOVOU	(SI), X0	// row 0
+		MOVOU	(SI)(BX*1), X3	// row 1
+		MOVOU	(SI)(BX*2), X4	// row 2
+		MOVOU	(BP), X10	// coefficient words applied to rows 0/1
+		MOVOU	32(BP), X11	// coefficient words applied to rows 2/3
+		ADDQ	BX, SI	// step one row down temporarily to address row 3
+		MOVOU	(SI)(BX*2), X7	// row 3
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0	// interleave rows 0/1, bytes 0-7
+		PUNPCKLBW	X7, X4	// interleave rows 2/3, bytes 0-7
+		PUNPCKHBW	X3, X2	// interleave rows 0/1, bytes 8-15
+		PUNPCKHBW	X7, X6	// interleave rows 2/3, bytes 8-15
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI	// restore SI to row 0
+		PUNPCKLBW	X14, X0	// widen interleaved bytes to 16-bit words (X14 supplies the high bytes)
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0	// multiply words by coefficients, add adjacent products into 32-bit lanes (rows 0/1)
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4	// same for rows 2/3
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0	// X0..X3 = running sums for output bytes 0-3, 4-7, 8-11, 12-15
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVQ	R14, R15	// R15 = remaining row pairs for this block
+		MOVQ	BP, DX
+		ADDQ	$32, DX	// DX = BP + 32; the loop pre-increments, so the first pair reads BP + 64
+innerloop_34:	// one further pair of source rows per iteration
+		ADDQ	$32, DX	// advance to this pair's coefficients
+		MOVOU	(AX), X4	// first row of the pair
+		MOVOU	(AX)(BX*1), X7	// second row of the pair
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	(DX), X4
+		PMADDWL	(DX), X5
+		PMADDWL	(DX), X6
+		PMADDWL	(DX), X7
+		LEAQ	(AX)(BX*2), AX	// AX -> next pair of rows
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		SUBQ	$1, R15
+		JNE	innerloop_34
+		PADDL	X13, X0	// add the hbits_1 bias before shifting
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0	// arithmetic shift right by 14 bits
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0	// 32-bit -> 16-bit with signed saturation (output bytes 0-7)
+		PACKSSLW	X3, X2	// output bytes 8-15
+		PACKUSWB	X2, X0	// 16-bit -> 8-bit with unsigned saturation
+		MOVOU	X0, (DI)	// store 16 output bytes
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+		SUBQ	$1, CX
+		JNE	maxloop_33
+nomaxloop_32:	// tail: redo one 16-byte block that ends exactly at column `width`
+		MOVQ	R13, CX
+		SUBQ	R13, SI	// step back by the rollback amount (overlaps bytes already written)
+		SUBQ	R13, DI	// NOTE(review): when width < 16 this points before the row start - presumably callers guarantee width >= 16 or padded buffers; confirm
+		ORQ	CX, CX
+		JE	nobackroll_35	// width was a multiple of 16: nothing left to do for this row
+		LEAQ	(SI)(BX*4), AX	// same computation as the main loop body, executed once
+		MOVOU	(SI), X0
+		MOVOU	(SI)(BX*1), X3
+		MOVOU	(SI)(BX*2), X4
+		MOVOU	(BP), X10
+		MOVOU	32(BP), X11
+		ADDQ	BX, SI
+		MOVOU	(SI)(BX*2), X7
+		MOVO	X0, X2
+		MOVO	X4, X6
+		PUNPCKLBW	X3, X0
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X3, X2
+		PUNPCKHBW	X7, X6
+		MOVO	X0, X1
+		MOVO	X4, X5
+		MOVO	X2, X3
+		MOVO	X6, X7
+		SUBQ	BX, SI
+		PUNPCKLBW	X14, X0
+		PUNPCKHBW	X14, X1
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X2
+		PUNPCKHBW	X14, X3
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	X10, X0
+		PMADDWL	X10, X1
+		PMADDWL	X11, X4
+		PMADDWL	X11, X5
+		PMADDWL	X10, X2
+		PMADDWL	X10, X3
+		PMADDWL	X11, X6
+		PMADDWL	X11, X7
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		MOVQ	R14, R15	// R15 = remaining row pairs for the tail block
+		MOVQ	BP, DX
+		ADDQ	$32, DX
+innerloop_36:	// same row-pair loop as in the main block
+		ADDQ	$32, DX
+		MOVOU	(AX), X4
+		MOVOU	(AX)(BX*1), X7
+		MOVO	X4, X6
+		PUNPCKLBW	X7, X4
+		PUNPCKHBW	X7, X6
+		MOVO	X4, X5
+		MOVO	X6, X7
+		PUNPCKLBW	X14, X4
+		PUNPCKHBW	X14, X5
+		PUNPCKLBW	X14, X6
+		PUNPCKHBW	X14, X7
+		PMADDWL	(DX), X4
+		PMADDWL	(DX), X5
+		PMADDWL	(DX), X6
+		PMADDWL	(DX), X7
+		LEAQ	(AX)(BX*2), AX
+		PADDL	X4, X0
+		PADDL	X5, X1
+		PADDL	X6, X2
+		PADDL	X7, X3
+		SUBQ	$1, R15
+		JNE	innerloop_36
+		PADDL	X13, X0	// bias, shift, saturate and store exactly as in the main loop
+		PADDL	X13, X1
+		PADDL	X13, X2
+		PADDL	X13, X3
+		PSRAL	$14, X0
+		PSRAL	$14, X1
+		PSRAL	$14, X2
+		PSRAL	$14, X3
+		PACKSSLW	X1, X0
+		PACKSSLW	X3, X2
+		PACKUSWB	X2, X0
+		MOVOU	X0, (DI)
+		ADDQ	$16, SI
+		ADDQ	$16, DI
+nobackroll_35:
+		ADDQ	R11, DI	// DI has advanced by width; adding dp - width lands on the next destination row
+		MOVQ	taps+96(FP), DX
+		SHLQ	$4, DX	// taps * 16 bytes of coefficients are consumed per output row
+		ADDQ	DX, BP	// BP -> next row's coefficients
+		ADDQ	$2, R10	// next 16-bit entry of off
+		SUBQ	$1, height+112(FP)	// the row counter lives in the height argument slot and is decremented in place
+		JNE	yloop_31
+		RET
diff --git a/vendor/github.com/nfnt/resize/LICENSE b/vendor/github.com/nfnt/resize/LICENSE
deleted file mode 100644
index 7836cad5..00000000
--- a/vendor/github.com/nfnt/resize/LICENSE
+++ /dev/null
@@ -1,13 +0,0 @@
-Copyright (c) 2012, Jan Schlicht <jan.schlicht@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
diff --git a/vendor/github.com/nfnt/resize/README.md b/vendor/github.com/nfnt/resize/README.md
deleted file mode 100644
index 2aefa75c..00000000
--- a/vendor/github.com/nfnt/resize/README.md
+++ /dev/null
@@ -1,149 +0,0 @@
-Resize
-======
-
-Image resizing for the [Go programming language](http://golang.org) with common interpolation methods.
-
-[![Build Status](https://travis-ci.org/nfnt/resize.svg)](https://travis-ci.org/nfnt/resize)
-
-Installation
-------------
-
-```bash
-$ go get github.com/nfnt/resize
-```
-
-It's that easy!
-
-Usage
------
-
-This package needs at least Go 1.1. Import package with
-
-```go
-import "github.com/nfnt/resize"
-```
-
-The resize package provides 2 functions:
-
-* `resize.Resize` creates a scaled image with new dimensions (`width`, `height`) using the interpolation function `interp`.
-  If either `width` or `height` is set to 0, it will be set to an aspect ratio preserving value.
-* `resize.Thumbnail` downscales an image preserving its aspect ratio to the maximum dimensions (`maxWidth`, `maxHeight`).
-  It will return the original image if original sizes are smaller than the provided dimensions.
-
-```go
-resize.Resize(width, height uint, img image.Image, interp resize.InterpolationFunction) image.Image
-resize.Thumbnail(maxWidth, maxHeight uint, img image.Image, interp resize.InterpolationFunction) image.Image
-```
-
-The provided interpolation functions are (from fast to slow execution time)
-
-- `NearestNeighbor`: [Nearest-neighbor interpolation](http://en.wikipedia.org/wiki/Nearest-neighbor_interpolation)
-- `Bilinear`: [Bilinear interpolation](http://en.wikipedia.org/wiki/Bilinear_interpolation)
-- `Bicubic`: [Bicubic interpolation](http://en.wikipedia.org/wiki/Bicubic_interpolation)
-- `MitchellNetravali`: [Mitchell-Netravali interpolation](http://dl.acm.org/citation.cfm?id=378514)
-- `Lanczos2`: [Lanczos resampling](http://en.wikipedia.org/wiki/Lanczos_resampling) with a=2
-- `Lanczos3`: [Lanczos resampling](http://en.wikipedia.org/wiki/Lanczos_resampling) with a=3
-
-Which of these methods gives the best results depends on your use case.
-
-Sample usage:
-
-```go
-package main
-
-import (
-	"github.com/nfnt/resize"
-	"image/jpeg"
-	"log"
-	"os"
-)
-
-func main() {
-	// open "test.jpg"
-	file, err := os.Open("test.jpg")
-	if err != nil {
-		log.Fatal(err)
-	}
-
-	// decode jpeg into image.Image
-	img, err := jpeg.Decode(file)
-	if err != nil {
-		log.Fatal(err)
-	}
-	file.Close()
-
-	// resize to width 1000 using Lanczos resampling
-	// and preserve aspect ratio
-	m := resize.Resize(1000, 0, img, resize.Lanczos3)
-
-	out, err := os.Create("test_resized.jpg")
-	if err != nil {
-		log.Fatal(err)
-	}
-	defer out.Close()
-
-	// write new image to file
-	jpeg.Encode(out, m, nil)
-}
-```
-
-Caveats
--------
-
-* Optimized access routines are used for `image.RGBA`, `image.NRGBA`, `image.RGBA64`, `image.NRGBA64`, `image.YCbCr`, `image.Gray`, and `image.Gray16` types. All other image types are accessed in a generic way that will result in slow processing speed.
-* JPEG images are stored in `image.YCbCr`. This image format stores data in a way that will decrease processing speed. A resize may be up to 2 times slower than with `image.RGBA`. 
-
-
-Downsizing Samples
--------
-
-Downsizing is not as simple as it might look like. Images have to be filtered before they are scaled down, otherwise aliasing might occur.
-Filtering is highly subjective: Applying too much will blur the whole image, too little will make aliasing become apparent.
-Resize tries to provide sane defaults that should suffice in most cases.
-
-### Artificial sample
-
-Original image
-![Rings](http://nfnt.github.com/img/rings_lg_orig.png)
-
-<table>
-<tr>
-<th><img src="http://nfnt.github.com/img/rings_300_NearestNeighbor.png" /><br>Nearest-Neighbor</th>
-<th><img src="http://nfnt.github.com/img/rings_300_Bilinear.png" /><br>Bilinear</th>
-</tr>
-<tr>
-<th><img src="http://nfnt.github.com/img/rings_300_Bicubic.png" /><br>Bicubic</th>
-<th><img src="http://nfnt.github.com/img/rings_300_MitchellNetravali.png" /><br>Mitchell-Netravali</th>
-</tr>
-<tr>
-<th><img src="http://nfnt.github.com/img/rings_300_Lanczos2.png" /><br>Lanczos2</th>
-<th><img src="http://nfnt.github.com/img/rings_300_Lanczos3.png" /><br>Lanczos3</th>
-</tr>
-</table>
-
-### Real-Life sample
-
-Original image  
-![Original](http://nfnt.github.com/img/IMG_3694_720.jpg)
-
-<table>
-<tr>
-<th><img src="http://nfnt.github.com/img/IMG_3694_300_NearestNeighbor.png" /><br>Nearest-Neighbor</th>
-<th><img src="http://nfnt.github.com/img/IMG_3694_300_Bilinear.png" /><br>Bilinear</th>
-</tr>
-<tr>
-<th><img src="http://nfnt.github.com/img/IMG_3694_300_Bicubic.png" /><br>Bicubic</th>
-<th><img src="http://nfnt.github.com/img/IMG_3694_300_MitchellNetravali.png" /><br>Mitchell-Netravali</th>
-</tr>
-<tr>
-<th><img src="http://nfnt.github.com/img/IMG_3694_300_Lanczos2.png" /><br>Lanczos2</th>
-<th><img src="http://nfnt.github.com/img/IMG_3694_300_Lanczos3.png" /><br>Lanczos3</th>
-</tr>
-</table>
-
-
-License
--------
-
-Copyright (c) 2012 Jan Schlicht <janschlicht@gmail.com>
-Resize is released under a MIT style license.
diff --git a/vendor/github.com/nfnt/resize/converter.go b/vendor/github.com/nfnt/resize/converter.go
deleted file mode 100644
index f9c520d0..00000000
--- a/vendor/github.com/nfnt/resize/converter.go
+++ /dev/null
@@ -1,438 +0,0 @@
-/*
-Copyright (c) 2012, Jan Schlicht <jan.schlicht@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-package resize
-
-import "image"
-
-// Keep value in [0,255] range.
-func clampUint8(in int32) uint8 {
-	// casting a negative int to an uint will result in an overflown
-	// large uint. this behavior will be exploited here and in other functions
-	// to achieve a higher performance.
-	if uint32(in) < 256 {
-		return uint8(in)
-	}
-	if in > 255 {
-		return 255
-	}
-	return 0
-}
-
-// Keep value in [0,65535] range.
-func clampUint16(in int64) uint16 {
-	if uint64(in) < 65536 {
-		return uint16(in)
-	}
-	if in > 65535 {
-		return 65535
-	}
-	return 0
-}
-
-func resizeGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]int64
-			var sum int64
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case xi < 0:
-						xi = 0
-					case xi >= maxX:
-						xi = maxX
-					}
-
-					r, g, b, a := in.At(xi+in.Bounds().Min.X, x+in.Bounds().Min.Y).RGBA()
-
-					rgba[0] += int64(coeff) * int64(r)
-					rgba[1] += int64(coeff) * int64(g)
-					rgba[2] += int64(coeff) * int64(b)
-					rgba[3] += int64(coeff) * int64(a)
-					sum += int64(coeff)
-				}
-			}
-
-			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
-
-			value := clampUint16(rgba[0] / sum)
-			out.Pix[offset+0] = uint8(value >> 8)
-			out.Pix[offset+1] = uint8(value)
-			value = clampUint16(rgba[1] / sum)
-			out.Pix[offset+2] = uint8(value >> 8)
-			out.Pix[offset+3] = uint8(value)
-			value = clampUint16(rgba[2] / sum)
-			out.Pix[offset+4] = uint8(value >> 8)
-			out.Pix[offset+5] = uint8(value)
-			value = clampUint16(rgba[3] / sum)
-			out.Pix[offset+6] = uint8(value >> 8)
-			out.Pix[offset+7] = uint8(value)
-		}
-	}
-}
-
-func resizeRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []int16, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]int32
-			var sum int32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 4
-					case xi >= maxX:
-						xi = 4 * maxX
-					default:
-						xi = 0
-					}
-
-					rgba[0] += int32(coeff) * int32(row[xi+0])
-					rgba[1] += int32(coeff) * int32(row[xi+1])
-					rgba[2] += int32(coeff) * int32(row[xi+2])
-					rgba[3] += int32(coeff) * int32(row[xi+3])
-					sum += int32(coeff)
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
-
-			out.Pix[xo+0] = clampUint8(rgba[0] / sum)
-			out.Pix[xo+1] = clampUint8(rgba[1] / sum)
-			out.Pix[xo+2] = clampUint8(rgba[2] / sum)
-			out.Pix[xo+3] = clampUint8(rgba[3] / sum)
-		}
-	}
-}
-
-func resizeNRGBA(in *image.NRGBA, out *image.RGBA, scale float64, coeffs []int16, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]int32
-			var sum int32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 4
-					case xi >= maxX:
-						xi = 4 * maxX
-					default:
-						xi = 0
-					}
-
-					// Forward alpha-premultiplication
-					a := int32(row[xi+3])
-					r := int32(row[xi+0]) * a
-					r /= 0xff
-					g := int32(row[xi+1]) * a
-					g /= 0xff
-					b := int32(row[xi+2]) * a
-					b /= 0xff
-
-					rgba[0] += int32(coeff) * r
-					rgba[1] += int32(coeff) * g
-					rgba[2] += int32(coeff) * b
-					rgba[3] += int32(coeff) * a
-					sum += int32(coeff)
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
-
-			out.Pix[xo+0] = clampUint8(rgba[0] / sum)
-			out.Pix[xo+1] = clampUint8(rgba[1] / sum)
-			out.Pix[xo+2] = clampUint8(rgba[2] / sum)
-			out.Pix[xo+3] = clampUint8(rgba[3] / sum)
-		}
-	}
-}
-
-func resizeRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]int64
-			var sum int64
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 8
-					case xi >= maxX:
-						xi = 8 * maxX
-					default:
-						xi = 0
-					}
-
-					rgba[0] += int64(coeff) * (int64(row[xi+0])<<8 | int64(row[xi+1]))
-					rgba[1] += int64(coeff) * (int64(row[xi+2])<<8 | int64(row[xi+3]))
-					rgba[2] += int64(coeff) * (int64(row[xi+4])<<8 | int64(row[xi+5]))
-					rgba[3] += int64(coeff) * (int64(row[xi+6])<<8 | int64(row[xi+7]))
-					sum += int64(coeff)
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
-
-			value := clampUint16(rgba[0] / sum)
-			out.Pix[xo+0] = uint8(value >> 8)
-			out.Pix[xo+1] = uint8(value)
-			value = clampUint16(rgba[1] / sum)
-			out.Pix[xo+2] = uint8(value >> 8)
-			out.Pix[xo+3] = uint8(value)
-			value = clampUint16(rgba[2] / sum)
-			out.Pix[xo+4] = uint8(value >> 8)
-			out.Pix[xo+5] = uint8(value)
-			value = clampUint16(rgba[3] / sum)
-			out.Pix[xo+6] = uint8(value >> 8)
-			out.Pix[xo+7] = uint8(value)
-		}
-	}
-}
-
-func resizeNRGBA64(in *image.NRGBA64, out *image.RGBA64, scale float64, coeffs []int32, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]int64
-			var sum int64
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 8
-					case xi >= maxX:
-						xi = 8 * maxX
-					default:
-						xi = 0
-					}
-
-					// Forward alpha-premultiplication
-					a := int64(uint16(row[xi+6])<<8 | uint16(row[xi+7]))
-					r := int64(uint16(row[xi+0])<<8|uint16(row[xi+1])) * a
-					r /= 0xffff
-					g := int64(uint16(row[xi+2])<<8|uint16(row[xi+3])) * a
-					g /= 0xffff
-					b := int64(uint16(row[xi+4])<<8|uint16(row[xi+5])) * a
-					b /= 0xffff
-
-					rgba[0] += int64(coeff) * r
-					rgba[1] += int64(coeff) * g
-					rgba[2] += int64(coeff) * b
-					rgba[3] += int64(coeff) * a
-					sum += int64(coeff)
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
-
-			value := clampUint16(rgba[0] / sum)
-			out.Pix[xo+0] = uint8(value >> 8)
-			out.Pix[xo+1] = uint8(value)
-			value = clampUint16(rgba[1] / sum)
-			out.Pix[xo+2] = uint8(value >> 8)
-			out.Pix[xo+3] = uint8(value)
-			value = clampUint16(rgba[2] / sum)
-			out.Pix[xo+4] = uint8(value >> 8)
-			out.Pix[xo+5] = uint8(value)
-			value = clampUint16(rgba[3] / sum)
-			out.Pix[xo+6] = uint8(value >> 8)
-			out.Pix[xo+7] = uint8(value)
-		}
-	}
-}
-
-func resizeGray(in *image.Gray, out *image.Gray, scale float64, coeffs []int16, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[(x-newBounds.Min.X)*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var gray int32
-			var sum int32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case xi < 0:
-						xi = 0
-					case xi >= maxX:
-						xi = maxX
-					}
-					gray += int32(coeff) * int32(row[xi])
-					sum += int32(coeff)
-				}
-			}
-
-			offset := (y-newBounds.Min.Y)*out.Stride + (x - newBounds.Min.X)
-			out.Pix[offset] = clampUint8(gray / sum)
-		}
-	}
-}
-
-func resizeGray16(in *image.Gray16, out *image.Gray16, scale float64, coeffs []int32, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var gray int64
-			var sum int64
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 2
-					case xi >= maxX:
-						xi = 2 * maxX
-					default:
-						xi = 0
-					}
-					gray += int64(coeff) * int64(uint16(row[xi+0])<<8|uint16(row[xi+1]))
-					sum += int64(coeff)
-				}
-			}
-
-			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*2
-			value := clampUint16(gray / sum)
-			out.Pix[offset+0] = uint8(value >> 8)
-			out.Pix[offset+1] = uint8(value)
-		}
-	}
-}
-
-func resizeYCbCr(in *ycc, out *ycc, scale float64, coeffs []int16, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var p [3]int32
-			var sum int32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				coeff := coeffs[ci+i]
-				if coeff != 0 {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 3
-					case xi >= maxX:
-						xi = 3 * maxX
-					default:
-						xi = 0
-					}
-					p[0] += int32(coeff) * int32(row[xi+0])
-					p[1] += int32(coeff) * int32(row[xi+1])
-					p[2] += int32(coeff) * int32(row[xi+2])
-					sum += int32(coeff)
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*3
-			out.Pix[xo+0] = clampUint8(p[0] / sum)
-			out.Pix[xo+1] = clampUint8(p[1] / sum)
-			out.Pix[xo+2] = clampUint8(p[2] / sum)
-		}
-	}
-}
-
-func nearestYCbCr(in *ycc, out *ycc, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var p [3]float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 3
-					case xi >= maxX:
-						xi = 3 * maxX
-					default:
-						xi = 0
-					}
-					p[0] += float32(row[xi+0])
-					p[1] += float32(row[xi+1])
-					p[2] += float32(row[xi+2])
-					sum++
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*3
-			out.Pix[xo+0] = floatToUint8(p[0] / sum)
-			out.Pix[xo+1] = floatToUint8(p[1] / sum)
-			out.Pix[xo+2] = floatToUint8(p[2] / sum)
-		}
-	}
-}
diff --git a/vendor/github.com/nfnt/resize/filters.go b/vendor/github.com/nfnt/resize/filters.go
deleted file mode 100644
index 4ce04e38..00000000
--- a/vendor/github.com/nfnt/resize/filters.go
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
-Copyright (c) 2012, Jan Schlicht <jan.schlicht@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-package resize
-
-import (
-	"math"
-)
-
-func nearest(in float64) float64 {
-	if in >= -0.5 && in < 0.5 {
-		return 1
-	}
-	return 0
-}
-
-func linear(in float64) float64 {
-	in = math.Abs(in)
-	if in <= 1 {
-		return 1 - in
-	}
-	return 0
-}
-
-func cubic(in float64) float64 {
-	in = math.Abs(in)
-	if in <= 1 {
-		return in*in*(1.5*in-2.5) + 1.0
-	}
-	if in <= 2 {
-		return in*(in*(2.5-0.5*in)-4.0) + 2.0
-	}
-	return 0
-}
-
-func mitchellnetravali(in float64) float64 {
-	in = math.Abs(in)
-	if in <= 1 {
-		return (7.0*in*in*in - 12.0*in*in + 5.33333333333) * 0.16666666666
-	}
-	if in <= 2 {
-		return (-2.33333333333*in*in*in + 12.0*in*in - 20.0*in + 10.6666666667) * 0.16666666666
-	}
-	return 0
-}
-
-func sinc(x float64) float64 {
-	x = math.Abs(x) * math.Pi
-	if x >= 1.220703e-4 {
-		return math.Sin(x) / x
-	}
-	return 1
-}
-
-func lanczos2(in float64) float64 {
-	if in > -2 && in < 2 {
-		return sinc(in) * sinc(in*0.5)
-	}
-	return 0
-}
-
-func lanczos3(in float64) float64 {
-	if in > -3 && in < 3 {
-		return sinc(in) * sinc(in*0.3333333333333333)
-	}
-	return 0
-}
-
-// range [-256,256]
-func createWeights8(dy, filterLength int, blur, scale float64, kernel func(float64) float64) ([]int16, []int, int) {
-	filterLength = filterLength * int(math.Max(math.Ceil(blur*scale), 1))
-	filterFactor := math.Min(1./(blur*scale), 1)
-
-	coeffs := make([]int16, dy*filterLength)
-	start := make([]int, dy)
-	for y := 0; y < dy; y++ {
-		interpX := scale*(float64(y)+0.5) - 0.5
-		start[y] = int(interpX) - filterLength/2 + 1
-		interpX -= float64(start[y])
-		for i := 0; i < filterLength; i++ {
-			in := (interpX - float64(i)) * filterFactor
-			coeffs[y*filterLength+i] = int16(kernel(in) * 256)
-		}
-	}
-
-	return coeffs, start, filterLength
-}
-
-// range [-65536,65536]
-func createWeights16(dy, filterLength int, blur, scale float64, kernel func(float64) float64) ([]int32, []int, int) {
-	filterLength = filterLength * int(math.Max(math.Ceil(blur*scale), 1))
-	filterFactor := math.Min(1./(blur*scale), 1)
-
-	coeffs := make([]int32, dy*filterLength)
-	start := make([]int, dy)
-	for y := 0; y < dy; y++ {
-		interpX := scale*(float64(y)+0.5) - 0.5
-		start[y] = int(interpX) - filterLength/2 + 1
-		interpX -= float64(start[y])
-		for i := 0; i < filterLength; i++ {
-			in := (interpX - float64(i)) * filterFactor
-			coeffs[y*filterLength+i] = int32(kernel(in) * 65536)
-		}
-	}
-
-	return coeffs, start, filterLength
-}
-
-func createWeightsNearest(dy, filterLength int, blur, scale float64) ([]bool, []int, int) {
-	filterLength = filterLength * int(math.Max(math.Ceil(blur*scale), 1))
-	filterFactor := math.Min(1./(blur*scale), 1)
-
-	coeffs := make([]bool, dy*filterLength)
-	start := make([]int, dy)
-	for y := 0; y < dy; y++ {
-		interpX := scale*(float64(y)+0.5) - 0.5
-		start[y] = int(interpX) - filterLength/2 + 1
-		interpX -= float64(start[y])
-		for i := 0; i < filterLength; i++ {
-			in := (interpX - float64(i)) * filterFactor
-			if in >= -0.5 && in < 0.5 {
-				coeffs[y*filterLength+i] = true
-			} else {
-				coeffs[y*filterLength+i] = false
-			}
-		}
-	}
-
-	return coeffs, start, filterLength
-}
diff --git a/vendor/github.com/nfnt/resize/nearest.go b/vendor/github.com/nfnt/resize/nearest.go
deleted file mode 100644
index 888039d8..00000000
--- a/vendor/github.com/nfnt/resize/nearest.go
+++ /dev/null
@@ -1,318 +0,0 @@
-/*
-Copyright (c) 2014, Charlie Vieth <charlie.vieth@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-package resize
-
-import "image"
-
-func floatToUint8(x float32) uint8 {
-	// Nearest-neighbor values are always
-	// positive no need to check lower-bound.
-	if x > 0xfe {
-		return 0xff
-	}
-	return uint8(x)
-}
-
-func floatToUint16(x float32) uint16 {
-	if x > 0xfffe {
-		return 0xffff
-	}
-	return uint16(x)
-}
-
-func nearestGeneric(in image.Image, out *image.RGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case xi < 0:
-						xi = 0
-					case xi >= maxX:
-						xi = maxX
-					}
-					r, g, b, a := in.At(xi+in.Bounds().Min.X, x+in.Bounds().Min.Y).RGBA()
-					rgba[0] += float32(r)
-					rgba[1] += float32(g)
-					rgba[2] += float32(b)
-					rgba[3] += float32(a)
-					sum++
-				}
-			}
-
-			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
-			value := floatToUint16(rgba[0] / sum)
-			out.Pix[offset+0] = uint8(value >> 8)
-			out.Pix[offset+1] = uint8(value)
-			value = floatToUint16(rgba[1] / sum)
-			out.Pix[offset+2] = uint8(value >> 8)
-			out.Pix[offset+3] = uint8(value)
-			value = floatToUint16(rgba[2] / sum)
-			out.Pix[offset+4] = uint8(value >> 8)
-			out.Pix[offset+5] = uint8(value)
-			value = floatToUint16(rgba[3] / sum)
-			out.Pix[offset+6] = uint8(value >> 8)
-			out.Pix[offset+7] = uint8(value)
-		}
-	}
-}
-
-func nearestRGBA(in *image.RGBA, out *image.RGBA, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 4
-					case xi >= maxX:
-						xi = 4 * maxX
-					default:
-						xi = 0
-					}
-					rgba[0] += float32(row[xi+0])
-					rgba[1] += float32(row[xi+1])
-					rgba[2] += float32(row[xi+2])
-					rgba[3] += float32(row[xi+3])
-					sum++
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
-			out.Pix[xo+0] = floatToUint8(rgba[0] / sum)
-			out.Pix[xo+1] = floatToUint8(rgba[1] / sum)
-			out.Pix[xo+2] = floatToUint8(rgba[2] / sum)
-			out.Pix[xo+3] = floatToUint8(rgba[3] / sum)
-		}
-	}
-}
-
-func nearestNRGBA(in *image.NRGBA, out *image.NRGBA, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 4
-					case xi >= maxX:
-						xi = 4 * maxX
-					default:
-						xi = 0
-					}
-					rgba[0] += float32(row[xi+0])
-					rgba[1] += float32(row[xi+1])
-					rgba[2] += float32(row[xi+2])
-					rgba[3] += float32(row[xi+3])
-					sum++
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*4
-			out.Pix[xo+0] = floatToUint8(rgba[0] / sum)
-			out.Pix[xo+1] = floatToUint8(rgba[1] / sum)
-			out.Pix[xo+2] = floatToUint8(rgba[2] / sum)
-			out.Pix[xo+3] = floatToUint8(rgba[3] / sum)
-		}
-	}
-}
-
-func nearestRGBA64(in *image.RGBA64, out *image.RGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 8
-					case xi >= maxX:
-						xi = 8 * maxX
-					default:
-						xi = 0
-					}
-					rgba[0] += float32(uint16(row[xi+0])<<8 | uint16(row[xi+1]))
-					rgba[1] += float32(uint16(row[xi+2])<<8 | uint16(row[xi+3]))
-					rgba[2] += float32(uint16(row[xi+4])<<8 | uint16(row[xi+5]))
-					rgba[3] += float32(uint16(row[xi+6])<<8 | uint16(row[xi+7]))
-					sum++
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
-			value := floatToUint16(rgba[0] / sum)
-			out.Pix[xo+0] = uint8(value >> 8)
-			out.Pix[xo+1] = uint8(value)
-			value = floatToUint16(rgba[1] / sum)
-			out.Pix[xo+2] = uint8(value >> 8)
-			out.Pix[xo+3] = uint8(value)
-			value = floatToUint16(rgba[2] / sum)
-			out.Pix[xo+4] = uint8(value >> 8)
-			out.Pix[xo+5] = uint8(value)
-			value = floatToUint16(rgba[3] / sum)
-			out.Pix[xo+6] = uint8(value >> 8)
-			out.Pix[xo+7] = uint8(value)
-		}
-	}
-}
-
-func nearestNRGBA64(in *image.NRGBA64, out *image.NRGBA64, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var rgba [4]float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 8
-					case xi >= maxX:
-						xi = 8 * maxX
-					default:
-						xi = 0
-					}
-					rgba[0] += float32(uint16(row[xi+0])<<8 | uint16(row[xi+1]))
-					rgba[1] += float32(uint16(row[xi+2])<<8 | uint16(row[xi+3]))
-					rgba[2] += float32(uint16(row[xi+4])<<8 | uint16(row[xi+5]))
-					rgba[3] += float32(uint16(row[xi+6])<<8 | uint16(row[xi+7]))
-					sum++
-				}
-			}
-
-			xo := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*8
-			value := floatToUint16(rgba[0] / sum)
-			out.Pix[xo+0] = uint8(value >> 8)
-			out.Pix[xo+1] = uint8(value)
-			value = floatToUint16(rgba[1] / sum)
-			out.Pix[xo+2] = uint8(value >> 8)
-			out.Pix[xo+3] = uint8(value)
-			value = floatToUint16(rgba[2] / sum)
-			out.Pix[xo+4] = uint8(value >> 8)
-			out.Pix[xo+5] = uint8(value)
-			value = floatToUint16(rgba[3] / sum)
-			out.Pix[xo+6] = uint8(value >> 8)
-			out.Pix[xo+7] = uint8(value)
-		}
-	}
-}
-
-func nearestGray(in *image.Gray, out *image.Gray, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var gray float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case xi < 0:
-						xi = 0
-					case xi >= maxX:
-						xi = maxX
-					}
-					gray += float32(row[xi])
-					sum++
-				}
-			}
-
-			offset := (y-newBounds.Min.Y)*out.Stride + (x - newBounds.Min.X)
-			out.Pix[offset] = floatToUint8(gray / sum)
-		}
-	}
-}
-
-func nearestGray16(in *image.Gray16, out *image.Gray16, scale float64, coeffs []bool, offset []int, filterLength int) {
-	newBounds := out.Bounds()
-	maxX := in.Bounds().Dx() - 1
-
-	for x := newBounds.Min.X; x < newBounds.Max.X; x++ {
-		row := in.Pix[x*in.Stride:]
-		for y := newBounds.Min.Y; y < newBounds.Max.Y; y++ {
-			var gray float32
-			var sum float32
-			start := offset[y]
-			ci := y * filterLength
-			for i := 0; i < filterLength; i++ {
-				if coeffs[ci+i] {
-					xi := start + i
-					switch {
-					case uint(xi) < uint(maxX):
-						xi *= 2
-					case xi >= maxX:
-						xi = 2 * maxX
-					default:
-						xi = 0
-					}
-					gray += float32(uint16(row[xi+0])<<8 | uint16(row[xi+1]))
-					sum++
-				}
-			}
-
-			offset := (y-newBounds.Min.Y)*out.Stride + (x-newBounds.Min.X)*2
-			value := floatToUint16(gray / sum)
-			out.Pix[offset+0] = uint8(value >> 8)
-			out.Pix[offset+1] = uint8(value)
-		}
-	}
-}
diff --git a/vendor/github.com/nfnt/resize/resize.go b/vendor/github.com/nfnt/resize/resize.go
deleted file mode 100644
index 57bd1fcd..00000000
--- a/vendor/github.com/nfnt/resize/resize.go
+++ /dev/null
@@ -1,614 +0,0 @@
-/*
-Copyright (c) 2012, Jan Schlicht <jan.schlicht@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-// Package resize implements various image resizing methods.
-//
-// The package works with the Image interface described in the image package.
-// Various interpolation methods are provided and multiple processors may be
-// utilized in the computations.
-//
-// Example:
-//     imgResized := resize.Resize(1000, 0, imgOld, resize.MitchellNetravali)
-package resize
-
-import (
-	"image"
-	"runtime"
-	"sync"
-)
-
-// An InterpolationFunction provides the parameters that describe an
-// interpolation kernel. It returns the number of samples to take
-// and the kernel function to use for sampling.
-type InterpolationFunction int
-
-// InterpolationFunction constants
-const (
-	// Nearest-neighbor interpolation
-	NearestNeighbor InterpolationFunction = iota
-	// Bilinear interpolation
-	Bilinear
-	// Bicubic interpolation (with cubic hermite spline)
-	Bicubic
-	// Mitchell-Netravali interpolation
-	MitchellNetravali
-	// Lanczos interpolation (a=2)
-	Lanczos2
-	// Lanczos interpolation (a=3)
-	Lanczos3
-)
-
-// kernal, returns an InterpolationFunctions taps and kernel.
-func (i InterpolationFunction) kernel() (int, func(float64) float64) {
-	switch i {
-	case Bilinear:
-		return 2, linear
-	case Bicubic:
-		return 4, cubic
-	case MitchellNetravali:
-		return 4, mitchellnetravali
-	case Lanczos2:
-		return 4, lanczos2
-	case Lanczos3:
-		return 6, lanczos3
-	default:
-		// Default to NearestNeighbor.
-		return 2, nearest
-	}
-}
-
-// values <1 will sharpen the image
-var blur = 1.0
-
-// Resize scales an image to new width and height using the interpolation function interp.
-// A new image with the given dimensions will be returned.
-// If one of the parameters width or height is set to 0, its size will be calculated so that
-// the aspect ratio is that of the originating image.
-// The resizing algorithm uses channels for parallel computation.
-func Resize(width, height uint, img image.Image, interp InterpolationFunction) image.Image {
-	scaleX, scaleY := calcFactors(width, height, float64(img.Bounds().Dx()), float64(img.Bounds().Dy()))
-	if width == 0 {
-		width = uint(0.7 + float64(img.Bounds().Dx())/scaleX)
-	}
-	if height == 0 {
-		height = uint(0.7 + float64(img.Bounds().Dy())/scaleY)
-	}
-
-	// Trivial case: return input image
-	if int(width) == img.Bounds().Dx() && int(height) == img.Bounds().Dy() {
-		return img
-	}
-
-	if interp == NearestNeighbor {
-		return resizeNearest(width, height, scaleX, scaleY, img, interp)
-	}
-
-	taps, kernel := interp.kernel()
-	cpus := runtime.GOMAXPROCS(0)
-	wg := sync.WaitGroup{}
-
-	// Generic access to image.Image is slow in tight loops.
-	// The optimal access has to be determined from the concrete image type.
-	switch input := img.(type) {
-	case *image.RGBA:
-		// 8-bit precision
-		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA)
-			go func() {
-				defer wg.Done()
-				resizeRGBA(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA)
-			go func() {
-				defer wg.Done()
-				resizeRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.NRGBA:
-		// 8-bit precision
-		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA)
-			go func() {
-				defer wg.Done()
-				resizeNRGBA(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA)
-			go func() {
-				defer wg.Done()
-				resizeRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-
-	case *image.YCbCr:
-		// 8-bit precision
-		// accessing the YCbCr arrays in a tight loop is slow.
-		// converting the image to ycc increases performance by 2x.
-		temp := newYCC(image.Rect(0, 0, input.Bounds().Dy(), int(width)), input.SubsampleRatio)
-		result := newYCC(image.Rect(0, 0, int(width), int(height)), image.YCbCrSubsampleRatio444)
-
-		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		in := imageYCbCrToYCC(input)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*ycc)
-			go func() {
-				defer wg.Done()
-				resizeYCbCr(in, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*ycc)
-			go func() {
-				defer wg.Done()
-				resizeYCbCr(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result.YCbCr()
-	case *image.RGBA64:
-		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				resizeRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				resizeRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.NRGBA64:
-		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				resizeNRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				resizeRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.Gray:
-		// 8-bit precision
-		temp := image.NewGray(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewGray(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights8(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.Gray)
-			go func() {
-				defer wg.Done()
-				resizeGray(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights8(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.Gray)
-			go func() {
-				defer wg.Done()
-				resizeGray(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.Gray16:
-		// 16-bit precision
-		temp := image.NewGray16(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewGray16(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.Gray16)
-			go func() {
-				defer wg.Done()
-				resizeGray16(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.Gray16)
-			go func() {
-				defer wg.Done()
-				resizeGray16(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	default:
-		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, img.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeights16(temp.Bounds().Dy(), taps, blur, scaleX, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				resizeGeneric(img, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeights16(result.Bounds().Dy(), taps, blur, scaleY, kernel)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				resizeRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	}
-}
-
-func resizeNearest(width, height uint, scaleX, scaleY float64, img image.Image, interp InterpolationFunction) image.Image {
-	taps, _ := interp.kernel()
-	cpus := runtime.GOMAXPROCS(0)
-	wg := sync.WaitGroup{}
-
-	switch input := img.(type) {
-	case *image.RGBA:
-		// 8-bit precision
-		temp := image.NewRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA)
-			go func() {
-				defer wg.Done()
-				nearestRGBA(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA)
-			go func() {
-				defer wg.Done()
-				nearestRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.NRGBA:
-		// 8-bit precision
-		temp := image.NewNRGBA(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewNRGBA(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.NRGBA)
-			go func() {
-				defer wg.Done()
-				nearestNRGBA(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.NRGBA)
-			go func() {
-				defer wg.Done()
-				nearestNRGBA(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.YCbCr:
-		// 8-bit precision
-		// accessing the YCbCr arrays in a tight loop is slow.
-		// converting the image to ycc increases performance by 2x.
-		temp := newYCC(image.Rect(0, 0, input.Bounds().Dy(), int(width)), input.SubsampleRatio)
-		result := newYCC(image.Rect(0, 0, int(width), int(height)), image.YCbCrSubsampleRatio444)
-
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		in := imageYCbCrToYCC(input)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*ycc)
-			go func() {
-				defer wg.Done()
-				nearestYCbCr(in, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*ycc)
-			go func() {
-				defer wg.Done()
-				nearestYCbCr(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result.YCbCr()
-	case *image.RGBA64:
-		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				nearestRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				nearestRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.NRGBA64:
-		// 16-bit precision
-		temp := image.NewNRGBA64(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewNRGBA64(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.NRGBA64)
-			go func() {
-				defer wg.Done()
-				nearestNRGBA64(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.NRGBA64)
-			go func() {
-				defer wg.Done()
-				nearestNRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.Gray:
-		// 8-bit precision
-		temp := image.NewGray(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewGray(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.Gray)
-			go func() {
-				defer wg.Done()
-				nearestGray(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.Gray)
-			go func() {
-				defer wg.Done()
-				nearestGray(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	case *image.Gray16:
-		// 16-bit precision
-		temp := image.NewGray16(image.Rect(0, 0, input.Bounds().Dy(), int(width)))
-		result := image.NewGray16(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.Gray16)
-			go func() {
-				defer wg.Done()
-				nearestGray16(input, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.Gray16)
-			go func() {
-				defer wg.Done()
-				nearestGray16(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	default:
-		// 16-bit precision
-		temp := image.NewRGBA64(image.Rect(0, 0, img.Bounds().Dy(), int(width)))
-		result := image.NewRGBA64(image.Rect(0, 0, int(width), int(height)))
-
-		// horizontal filter, results in transposed temporary image
-		coeffs, offset, filterLength := createWeightsNearest(temp.Bounds().Dy(), taps, blur, scaleX)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(temp, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				nearestGeneric(img, slice, scaleX, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-
-		// horizontal filter on transposed image, result is not transposed
-		coeffs, offset, filterLength = createWeightsNearest(result.Bounds().Dy(), taps, blur, scaleY)
-		wg.Add(cpus)
-		for i := 0; i < cpus; i++ {
-			slice := makeSlice(result, i, cpus).(*image.RGBA64)
-			go func() {
-				defer wg.Done()
-				nearestRGBA64(temp, slice, scaleY, coeffs, offset, filterLength)
-			}()
-		}
-		wg.Wait()
-		return result
-	}
-
-}
-
-// Calculates scaling factors using old and new image dimensions.
-func calcFactors(width, height uint, oldWidth, oldHeight float64) (scaleX, scaleY float64) {
-	if width == 0 {
-		if height == 0 {
-			scaleX = 1.0
-			scaleY = 1.0
-		} else {
-			scaleY = oldHeight / float64(height)
-			scaleX = scaleY
-		}
-	} else {
-		scaleX = oldWidth / float64(width)
-		if height == 0 {
-			scaleY = scaleX
-		} else {
-			scaleY = oldHeight / float64(height)
-		}
-	}
-	return
-}
-
-type imageWithSubImage interface {
-	image.Image
-	SubImage(image.Rectangle) image.Image
-}
-
-func makeSlice(img imageWithSubImage, i, n int) image.Image {
-	return img.SubImage(image.Rect(img.Bounds().Min.X, img.Bounds().Min.Y+i*img.Bounds().Dy()/n, img.Bounds().Max.X, img.Bounds().Min.Y+(i+1)*img.Bounds().Dy()/n))
-}
diff --git a/vendor/github.com/nfnt/resize/thumbnail.go b/vendor/github.com/nfnt/resize/thumbnail.go
deleted file mode 100644
index 9efc246b..00000000
--- a/vendor/github.com/nfnt/resize/thumbnail.go
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
-Copyright (c) 2012, Jan Schlicht <jan.schlicht@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-package resize
-
-import (
-	"image"
-)
-
-// Thumbnail will downscale provided image to max width and height preserving
-// original aspect ratio and using the interpolation function interp.
-// It will return original image, without processing it, if original sizes
-// are already smaller than provided constraints.
-func Thumbnail(maxWidth, maxHeight uint, img image.Image, interp InterpolationFunction) image.Image {
-	origBounds := img.Bounds()
-	origWidth := uint(origBounds.Dx())
-	origHeight := uint(origBounds.Dy())
-	newWidth, newHeight := origWidth, origHeight
-
-	// Return original image if it have same or smaller size as constraints
-	if maxWidth >= origWidth && maxHeight >= origHeight {
-		return img
-	}
-
-	// Preserve aspect ratio
-	if origWidth > maxWidth {
-		newHeight = uint(origHeight * maxWidth / origWidth)
-		if newHeight < 1 {
-			newHeight = 1
-		}
-		newWidth = maxWidth
-	}
-
-	if newHeight > maxHeight {
-		newWidth = uint(newWidth * maxHeight / newHeight)
-		if newWidth < 1 {
-			newWidth = 1
-		}
-		newHeight = maxHeight
-	}
-	return Resize(newWidth, newHeight, img, interp)
-}
diff --git a/vendor/github.com/nfnt/resize/ycc.go b/vendor/github.com/nfnt/resize/ycc.go
deleted file mode 100644
index 10415995..00000000
--- a/vendor/github.com/nfnt/resize/ycc.go
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-Copyright (c) 2014, Charlie Vieth <charlie.vieth@gmail.com>
-
-Permission to use, copy, modify, and/or distribute this software for any purpose
-with or without fee is hereby granted, provided that the above copyright notice
-and this permission notice appear in all copies.
-
-THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
-REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
-FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
-INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
-OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
-TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
-THIS SOFTWARE.
-*/
-
-package resize
-
-import (
-	"image"
-	"image/color"
-)
-
-// ycc is an in memory YCbCr image.  The Y, Cb and Cr samples are held in a
-// single slice to increase resizing performance.
-type ycc struct {
-	// Pix holds the image's pixels, in Y, Cb, Cr order. The pixel at
-	// (x, y) starts at Pix[(y-Rect.Min.Y)*Stride + (x-Rect.Min.X)*3].
-	Pix []uint8
-	// Stride is the Pix stride (in bytes) between vertically adjacent pixels.
-	Stride int
-	// Rect is the image's bounds.
-	Rect image.Rectangle
-	// SubsampleRatio is the subsample ratio of the original YCbCr image.
-	SubsampleRatio image.YCbCrSubsampleRatio
-}
-
-// PixOffset returns the index of the first element of Pix that corresponds to
-// the pixel at (x, y).
-func (p *ycc) PixOffset(x, y int) int {
-	return (y-p.Rect.Min.Y)*p.Stride + (x-p.Rect.Min.X)*3
-}
-
-func (p *ycc) Bounds() image.Rectangle {
-	return p.Rect
-}
-
-func (p *ycc) ColorModel() color.Model {
-	return color.YCbCrModel
-}
-
-func (p *ycc) At(x, y int) color.Color {
-	if !(image.Point{x, y}.In(p.Rect)) {
-		return color.YCbCr{}
-	}
-	i := p.PixOffset(x, y)
-	return color.YCbCr{
-		p.Pix[i+0],
-		p.Pix[i+1],
-		p.Pix[i+2],
-	}
-}
-
-func (p *ycc) Opaque() bool {
-	return true
-}
-
-// SubImage returns an image representing the portion of the image p visible
-// through r. The returned value shares pixels with the original image.
-func (p *ycc) SubImage(r image.Rectangle) image.Image {
-	r = r.Intersect(p.Rect)
-	if r.Empty() {
-		return &ycc{SubsampleRatio: p.SubsampleRatio}
-	}
-	i := p.PixOffset(r.Min.X, r.Min.Y)
-	return &ycc{
-		Pix:            p.Pix[i:],
-		Stride:         p.Stride,
-		Rect:           r,
-		SubsampleRatio: p.SubsampleRatio,
-	}
-}
-
-// newYCC returns a new ycc with the given bounds and subsample ratio.
-func newYCC(r image.Rectangle, s image.YCbCrSubsampleRatio) *ycc {
-	w, h := r.Dx(), r.Dy()
-	buf := make([]uint8, 3*w*h)
-	return &ycc{Pix: buf, Stride: 3 * w, Rect: r, SubsampleRatio: s}
-}
-
-// YCbCr converts ycc to a YCbCr image with the same subsample ratio
-// as the YCbCr image that ycc was generated from.
-func (p *ycc) YCbCr() *image.YCbCr {
-	ycbcr := image.NewYCbCr(p.Rect, p.SubsampleRatio)
-	var off int
-
-	switch ycbcr.SubsampleRatio {
-	case image.YCbCrSubsampleRatio422:
-		for y := ycbcr.Rect.Min.Y; y < ycbcr.Rect.Max.Y; y++ {
-			yy := (y - ycbcr.Rect.Min.Y) * ycbcr.YStride
-			cy := (y - ycbcr.Rect.Min.Y) * ycbcr.CStride
-			for x := ycbcr.Rect.Min.X; x < ycbcr.Rect.Max.X; x++ {
-				xx := (x - ycbcr.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx/2
-				ycbcr.Y[yi] = p.Pix[off+0]
-				ycbcr.Cb[ci] = p.Pix[off+1]
-				ycbcr.Cr[ci] = p.Pix[off+2]
-				off += 3
-			}
-		}
-	case image.YCbCrSubsampleRatio420:
-		for y := ycbcr.Rect.Min.Y; y < ycbcr.Rect.Max.Y; y++ {
-			yy := (y - ycbcr.Rect.Min.Y) * ycbcr.YStride
-			cy := (y/2 - ycbcr.Rect.Min.Y/2) * ycbcr.CStride
-			for x := ycbcr.Rect.Min.X; x < ycbcr.Rect.Max.X; x++ {
-				xx := (x - ycbcr.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx/2
-				ycbcr.Y[yi] = p.Pix[off+0]
-				ycbcr.Cb[ci] = p.Pix[off+1]
-				ycbcr.Cr[ci] = p.Pix[off+2]
-				off += 3
-			}
-		}
-	case image.YCbCrSubsampleRatio440:
-		for y := ycbcr.Rect.Min.Y; y < ycbcr.Rect.Max.Y; y++ {
-			yy := (y - ycbcr.Rect.Min.Y) * ycbcr.YStride
-			cy := (y/2 - ycbcr.Rect.Min.Y/2) * ycbcr.CStride
-			for x := ycbcr.Rect.Min.X; x < ycbcr.Rect.Max.X; x++ {
-				xx := (x - ycbcr.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx
-				ycbcr.Y[yi] = p.Pix[off+0]
-				ycbcr.Cb[ci] = p.Pix[off+1]
-				ycbcr.Cr[ci] = p.Pix[off+2]
-				off += 3
-			}
-		}
-	default:
-		// Default to 4:4:4 subsampling.
-		for y := ycbcr.Rect.Min.Y; y < ycbcr.Rect.Max.Y; y++ {
-			yy := (y - ycbcr.Rect.Min.Y) * ycbcr.YStride
-			cy := (y - ycbcr.Rect.Min.Y) * ycbcr.CStride
-			for x := ycbcr.Rect.Min.X; x < ycbcr.Rect.Max.X; x++ {
-				xx := (x - ycbcr.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx
-				ycbcr.Y[yi] = p.Pix[off+0]
-				ycbcr.Cb[ci] = p.Pix[off+1]
-				ycbcr.Cr[ci] = p.Pix[off+2]
-				off += 3
-			}
-		}
-	}
-	return ycbcr
-}
-
-// imageYCbCrToYCC converts a YCbCr image to a ycc image for resizing.
-func imageYCbCrToYCC(in *image.YCbCr) *ycc {
-	w, h := in.Rect.Dx(), in.Rect.Dy()
-	r := image.Rect(0, 0, w, h)
-	buf := make([]uint8, 3*w*h)
-	p := ycc{Pix: buf, Stride: 3 * w, Rect: r, SubsampleRatio: in.SubsampleRatio}
-	var off int
-
-	switch in.SubsampleRatio {
-	case image.YCbCrSubsampleRatio422:
-		for y := in.Rect.Min.Y; y < in.Rect.Max.Y; y++ {
-			yy := (y - in.Rect.Min.Y) * in.YStride
-			cy := (y - in.Rect.Min.Y) * in.CStride
-			for x := in.Rect.Min.X; x < in.Rect.Max.X; x++ {
-				xx := (x - in.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx/2
-				p.Pix[off+0] = in.Y[yi]
-				p.Pix[off+1] = in.Cb[ci]
-				p.Pix[off+2] = in.Cr[ci]
-				off += 3
-			}
-		}
-	case image.YCbCrSubsampleRatio420:
-		for y := in.Rect.Min.Y; y < in.Rect.Max.Y; y++ {
-			yy := (y - in.Rect.Min.Y) * in.YStride
-			cy := (y/2 - in.Rect.Min.Y/2) * in.CStride
-			for x := in.Rect.Min.X; x < in.Rect.Max.X; x++ {
-				xx := (x - in.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx/2
-				p.Pix[off+0] = in.Y[yi]
-				p.Pix[off+1] = in.Cb[ci]
-				p.Pix[off+2] = in.Cr[ci]
-				off += 3
-			}
-		}
-	case image.YCbCrSubsampleRatio440:
-		for y := in.Rect.Min.Y; y < in.Rect.Max.Y; y++ {
-			yy := (y - in.Rect.Min.Y) * in.YStride
-			cy := (y/2 - in.Rect.Min.Y/2) * in.CStride
-			for x := in.Rect.Min.X; x < in.Rect.Max.X; x++ {
-				xx := (x - in.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx
-				p.Pix[off+0] = in.Y[yi]
-				p.Pix[off+1] = in.Cb[ci]
-				p.Pix[off+2] = in.Cr[ci]
-				off += 3
-			}
-		}
-	default:
-		// Default to 4:4:4 subsampling.
-		for y := in.Rect.Min.Y; y < in.Rect.Max.Y; y++ {
-			yy := (y - in.Rect.Min.Y) * in.YStride
-			cy := (y - in.Rect.Min.Y) * in.CStride
-			for x := in.Rect.Min.X; x < in.Rect.Max.X; x++ {
-				xx := (x - in.Rect.Min.X)
-				yi := yy + xx
-				ci := cy + xx
-				p.Pix[off+0] = in.Y[yi]
-				p.Pix[off+1] = in.Cb[ci]
-				p.Pix[off+2] = in.Cr[ci]
-				off += 3
-			}
-		}
-	}
-	return &p
-}
diff --git a/vendor/vendor.json b/vendor/vendor.json
index 19285f3c..b7831a26 100644
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@@ -58,6 +58,12 @@
 			"revision": "349dd0209470eabd9514242c688c403c0926d266",
 			"revisionTime": "2016-12-24T14:14:13Z"
 		},
+		{
+			"checksumSHA1": "FOqUoijHGUZuS1T4pTcdlrNNlvc=",
+			"path": "github.com/bamiaux/rez",
+			"revision": "29f4463c688b986c11f166b12734f69b58b5555f",
+			"revisionTime": "2017-07-31T18:41:18Z"
+		},
 		{
 			"checksumSHA1": "usT4LCSQItkFvFOQT7cBlkCuGaE=",
 			"path": "github.com/beevik/etree",
@@ -112,12 +118,6 @@
 			"revision": "95345c4e1c0ebc9d16a3284177f09360f4d20fab",
 			"revisionTime": "2017-01-24T11:57:57Z"
 		},
-		{
-			"checksumSHA1": "r5eQHkttko6kxroDEENXbmXKrSs=",
-			"path": "github.com/nfnt/resize",
-			"revision": "891127d8d1b52734debe1b3c3d7e747502b6c366",
-			"revisionTime": "2016-07-24T20:39:20Z"
-		},
 		{
 			"checksumSHA1": "LuFv4/jlrmFNnDb/5SCSEPAM9vU=",
 			"path": "github.com/pmezard/go-difflib/difflib",