Improve performance of simple convolution functions

octu0 committed on Apr 12, 2021 · 7661853
1 parent efef904
commit 7661853
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 52 deletions.
diff --git a/README.md b/README.md
@@ -383,40 +383,40 @@ darwin/amd64 Intel(R) Core(TM) i7-8569U CPU @ 2.80GHz
 
 ```
 src 320x240
-BenchmarkJIT/cloneimg : 0.01952ms
-BenchmarkJIT/rotate0 : 0.01952ms
-BenchmarkJIT/rotate90 : 0.08169ms
-BenchmarkJIT/rotate180 : 0.02266ms
-BenchmarkJIT/rotate270 : 0.09420ms
-BenchmarkJIT/grayscale : 0.08012ms
-BenchmarkJIT/invert : 0.06201ms
-BenchmarkJIT/brightness : 0.06828ms
-BenchmarkJIT/gammacorrection : 0.11261ms
-BenchmarkJIT/contrast : 0.06815ms
-BenchmarkJIT/boxblur : 0.18756ms
-BenchmarkJIT/gaussianblur : 0.16728ms
-BenchmarkJIT/blockmozaic : 0.31110ms
-BenchmarkJIT/erosion : 0.10399ms
-BenchmarkJIT/dilation : 0.09312ms
-BenchmarkJIT/morphology_open : 0.13103ms
-BenchmarkJIT/morphology_close : 0.12874ms
-BenchmarkJIT/morphology_gradient : 0.07516ms
-BenchmarkJIT/emboss$1 : 0.15934ms
-BenchmarkJIT/laplacian : 0.13013ms
-BenchmarkJIT/highpass : 0.13032ms
-BenchmarkJIT/gradient : 0.12489ms
-BenchmarkJIT/edge : 0.10574ms
-BenchmarkJIT/sobel : 0.11102ms
-BenchmarkJIT/canny : 0.59503ms
-BenchmarkJIT/canny_dilate : 0.62175ms
-BenchmarkJIT/canny_morphology_open : 0.72361ms
-BenchmarkJIT/canny_morphology_close : 0.70342ms
-BenchmarkJIT/match_template_sad : 5.26705ms
-BenchmarkJIT/match_template_ssd : 4.14713ms
-BenchmarkJIT/match_template_ncc : 8.03261ms
-BenchmarkJIT/prepared_match_template_ncc : 5.83674ms
-BenchmarkJIT/match_template_zncc : 11.69895ms
-BenchmarkJIT/prepared_match_template_zncc : 10.88168ms
+BenchmarkJIT/cloneimg : 0.01065ms
+BenchmarkJIT/rotate0 : 0.01089ms
+BenchmarkJIT/rotate90 : 0.06175ms
+BenchmarkJIT/rotate180 : 0.01150ms
+BenchmarkJIT/rotate270 : 0.06265ms
+BenchmarkJIT/grayscale : 0.05361ms
+BenchmarkJIT/invert : 0.06487ms
+BenchmarkJIT/brightness : 0.05570ms
+BenchmarkJIT/gammacorrection : 0.09380ms
+BenchmarkJIT/contrast : 0.06393ms
+BenchmarkJIT/boxblur : 0.22507ms
+BenchmarkJIT/gaussianblur : 0.16436ms
+BenchmarkJIT/blockmozaic : 0.31727ms
+BenchmarkJIT/erosion : 0.07028ms
+BenchmarkJIT/dilation : 0.06876ms
+BenchmarkJIT/morphology_open : 0.13085ms
+BenchmarkJIT/morphology_close : 0.12909ms
+BenchmarkJIT/morphology_gradient : 0.08012ms
+BenchmarkJIT/emboss$1 : 0.16315ms
+BenchmarkJIT/laplacian : 0.09986ms
+BenchmarkJIT/highpass : 0.10241ms
+BenchmarkJIT/gradient : 0.09807ms
+BenchmarkJIT/edge : 0.10242ms
+BenchmarkJIT/sobel : 0.11067ms
+BenchmarkJIT/canny : 0.59974ms
+BenchmarkJIT/canny_dilate : 0.61581ms
+BenchmarkJIT/canny_morphology_open : 0.71039ms
+BenchmarkJIT/canny_morphology_close : 0.70871ms
+BenchmarkJIT/match_template_sad : 5.30760ms
+BenchmarkJIT/match_template_ssd : 4.17522ms
+BenchmarkJIT/match_template_ncc : 8.02835ms
+BenchmarkJIT/prepared_match_template_ncc : 5.92526ms
+BenchmarkJIT/match_template_zncc : 11.82611ms
+BenchmarkJIT/prepared_match_template_zncc : 10.78653ms
 ```
 
 ## AOT benchmarks

diff --git a/blurry.cpp b/blurry.cpp
@@ -241,12 +241,15 @@ Func filter2d_gray(
  conv.compute_root()
  .vectorize(x, 32);
 
- gradient.compute_root()
+ gradient.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
+ in.compute_root()
+ .unroll(y, 4)
+ .vectorize(x, 16);
  return gradient;
 }
 
@@ -461,6 +464,9 @@ Func cloneimg_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  Func cloneimg = Func("cloneimg");
  cloneimg(x, y, ch) = in(x, y, ch);
 
+ cloneimg.compute_at(in, x)
+ .unroll(y, 8)
+ .vectorize(x, 16);
  return cloneimg;
 }
 
@@ -474,6 +480,9 @@ Func rotate0_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  Func rotate = Func("rotate0");
  rotate(x, y, ch) = in(x, y, ch);
 
+ rotate.compute_at(in, x)
+ .unroll(y, 8)
+ .vectorize(x, 16);
  return rotate;
 }
 
@@ -485,6 +494,9 @@ Func rotate90_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  Func rotate = Func("rotate90");
  rotate(x, y, ch) = in(y, (height - 1) - x, ch);
 
+ rotate.compute_at(in, x)
+ .unroll(x, 4)
+ .vectorize(y, 16);
  return rotate;
 }
 
@@ -496,6 +508,9 @@ Func rotate180_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  Func rotate = Func("rotate180");
  rotate(x, y, ch) = in((width - 1) - x, (height - 1) - y, ch);
 
+ rotate.compute_at(in, x)
+ .unroll(y, 8)
+ .vectorize(x, 16);
  return rotate;
 }
 
@@ -507,6 +522,9 @@ Func rotate270_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  Func rotate = Func("rotate270");
  rotate(x, y, ch) = in((width - 1) - y, x, ch);
 
+ rotate.compute_at(in, x)
+ .unroll(x, 4)
+ .vectorize(y, 16);
  return rotate;
 }
 
@@ -524,11 +542,11 @@ Func erosion_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<u
  Expr value = in(x + rd.x, y + rd.y, ch);
  erosion(x, y, ch) = minimum(value);
 
- erosion.compute_root()
+ erosion.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
  in.compute_root();
@@ -553,7 +571,7 @@ Func dilation_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
  in.compute_root();
@@ -634,11 +652,11 @@ Func grayscale_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  cast<uint8_t>(value)
  );
 
- grayscale.compute_root()
+ grayscale.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
  in.compute_root();
@@ -662,13 +680,15 @@ Func invert_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
  );
  invert(x, y, ch) = value;
 
- invert.compute_root()
+ invert.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
+ in.compute_root();
+
  return invert;
 }
 
@@ -688,13 +708,15 @@ Func brightness_fn(Func input, Param<int32_t> width, Param<int32_t> height, Para
 
  brightness(x, y, ch) = cast<uint8_t>(value);
 
- brightness.compute_root()
+ brightness.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
+ in.compute_root();
+
  return brightness;
 }
 
@@ -719,13 +741,15 @@ Func gammacorrection_fn(Func input, Param<int32_t> width, Param<int32_t> height,
  cast<uint8_t>(value)
  );
 
- gammacorrection.compute_root()
+ gammacorrection.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
+ in.compute_root();
+
  return gammacorrection;
 }
 
@@ -749,13 +773,15 @@ Func contrast_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<
 
  contrast(x, y, ch) = cast<uint8_t>(value);
 
- contrast.compute_root()
+ contrast.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti)
+ .parallel(ti, 4)
  .vectorize(xi, 32);
 
+ in.compute_root();
+
  return contrast;
 }
 
@@ -788,12 +814,11 @@ Func boxblur_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<u
  .parallel(y, 8)
  .vectorize(x, 32);
 
- boxblur.compute_root()
- .async()
+ boxblur.compute_at(in, x)
  .tile(x, y, xo, yo, xi, yi, 32, 32)
  .fuse(xo, yo, ti)
  .parallel(ch)
- .parallel(ti, 8)
+ .parallel(ti)
  .vectorize(xi, 32);
 
  in.compute_root();