Skip to content

Commit

Permalink
Improve performance of simple convolution functions
Browse files (browse the repository at this point in the history)
  • Loading branch information
octu0 committed Apr 12, 2021
1 parent efef904 commit 7661853
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 52 deletions.
68 changes: 34 additions & 34 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -383,40 +383,40 @@ darwin/amd64 Intel(R) Core(TM) i7-8569U CPU @ 2.80GHz

```
src 320x240
BenchmarkJIT/cloneimg : 0.01952ms
BenchmarkJIT/rotate0 : 0.01952ms
BenchmarkJIT/rotate90 : 0.08169ms
BenchmarkJIT/rotate180 : 0.02266ms
BenchmarkJIT/rotate270 : 0.09420ms
BenchmarkJIT/grayscale : 0.08012ms
BenchmarkJIT/invert : 0.06201ms
BenchmarkJIT/brightness : 0.06828ms
BenchmarkJIT/gammacorrection : 0.11261ms
BenchmarkJIT/contrast : 0.06815ms
BenchmarkJIT/boxblur : 0.18756ms
BenchmarkJIT/gaussianblur : 0.16728ms
BenchmarkJIT/blockmozaic : 0.31110ms
BenchmarkJIT/erosion : 0.10399ms
BenchmarkJIT/dilation : 0.09312ms
BenchmarkJIT/morphology_open : 0.13103ms
BenchmarkJIT/morphology_close : 0.12874ms
BenchmarkJIT/morphology_gradient : 0.07516ms
BenchmarkJIT/emboss$1 : 0.15934ms
BenchmarkJIT/laplacian : 0.13013ms
BenchmarkJIT/highpass : 0.13032ms
BenchmarkJIT/gradient : 0.12489ms
BenchmarkJIT/edge : 0.10574ms
BenchmarkJIT/sobel : 0.11102ms
BenchmarkJIT/canny : 0.59503ms
BenchmarkJIT/canny_dilate : 0.62175ms
BenchmarkJIT/canny_morphology_open : 0.72361ms
BenchmarkJIT/canny_morphology_close : 0.70342ms
BenchmarkJIT/match_template_sad : 5.26705ms
BenchmarkJIT/match_template_ssd : 4.14713ms
BenchmarkJIT/match_template_ncc : 8.03261ms
BenchmarkJIT/prepared_match_template_ncc : 5.83674ms
BenchmarkJIT/match_template_zncc : 11.69895ms
BenchmarkJIT/prepared_match_template_zncc : 10.88168ms
BenchmarkJIT/cloneimg : 0.01065ms
BenchmarkJIT/rotate0 : 0.01089ms
BenchmarkJIT/rotate90 : 0.06175ms
BenchmarkJIT/rotate180 : 0.01150ms
BenchmarkJIT/rotate270 : 0.06265ms
BenchmarkJIT/grayscale : 0.05361ms
BenchmarkJIT/invert : 0.06487ms
BenchmarkJIT/brightness : 0.05570ms
BenchmarkJIT/gammacorrection : 0.09380ms
BenchmarkJIT/contrast : 0.06393ms
BenchmarkJIT/boxblur : 0.22507ms
BenchmarkJIT/gaussianblur : 0.16436ms
BenchmarkJIT/blockmozaic : 0.31727ms
BenchmarkJIT/erosion : 0.07028ms
BenchmarkJIT/dilation : 0.06876ms
BenchmarkJIT/morphology_open : 0.13085ms
BenchmarkJIT/morphology_close : 0.12909ms
BenchmarkJIT/morphology_gradient : 0.08012ms
BenchmarkJIT/emboss$1 : 0.16315ms
BenchmarkJIT/laplacian : 0.09986ms
BenchmarkJIT/highpass : 0.10241ms
BenchmarkJIT/gradient : 0.09807ms
BenchmarkJIT/edge : 0.10242ms
BenchmarkJIT/sobel : 0.11067ms
BenchmarkJIT/canny : 0.59974ms
BenchmarkJIT/canny_dilate : 0.61581ms
BenchmarkJIT/canny_morphology_open : 0.71039ms
BenchmarkJIT/canny_morphology_close : 0.70871ms
BenchmarkJIT/match_template_sad : 5.30760ms
BenchmarkJIT/match_template_ssd : 4.17522ms
BenchmarkJIT/match_template_ncc : 8.02835ms
BenchmarkJIT/prepared_match_template_ncc : 5.92526ms
BenchmarkJIT/match_template_zncc : 11.82611ms
BenchmarkJIT/prepared_match_template_zncc : 10.78653ms
```

## AOT benchmarks
Expand Down
61 changes: 43 additions & 18 deletions blurry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -241,12 +241,15 @@ Func filter2d_gray(
conv.compute_root()
.vectorize(x, 32);

gradient.compute_root()
gradient.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root()
.unroll(y, 4)
.vectorize(x, 16);
return gradient;
}

Expand Down Expand Up @@ -461,6 +464,9 @@ Func cloneimg_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
Func cloneimg = Func("cloneimg");
cloneimg(x, y, ch) = in(x, y, ch);

cloneimg.compute_at(in, x)
.unroll(y, 8)
.vectorize(x, 16);
return cloneimg;
}

Expand All @@ -474,6 +480,9 @@ Func rotate0_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
Func rotate = Func("rotate0");
rotate(x, y, ch) = in(x, y, ch);

rotate.compute_at(in, x)
.unroll(y, 8)
.vectorize(x, 16);
return rotate;
}

Expand All @@ -485,6 +494,9 @@ Func rotate90_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
Func rotate = Func("rotate90");
rotate(x, y, ch) = in(y, (height - 1) - x, ch);

rotate.compute_at(in, x)
.unroll(x, 4)
.vectorize(y, 16);
return rotate;
}

Expand All @@ -496,6 +508,9 @@ Func rotate180_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
Func rotate = Func("rotate180");
rotate(x, y, ch) = in((width - 1) - x, (height - 1) - y, ch);

rotate.compute_at(in, x)
.unroll(y, 8)
.vectorize(x, 16);
return rotate;
}

Expand All @@ -507,6 +522,9 @@ Func rotate270_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
Func rotate = Func("rotate270");
rotate(x, y, ch) = in((width - 1) - y, x, ch);

rotate.compute_at(in, x)
.unroll(x, 4)
.vectorize(y, 16);
return rotate;
}

Expand All @@ -524,11 +542,11 @@ Func erosion_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<u
Expr value = in(x + rd.x, y + rd.y, ch);
erosion(x, y, ch) = minimum(value);

erosion.compute_root()
erosion.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();
Expand All @@ -553,7 +571,7 @@ Func dilation_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();
Expand Down Expand Up @@ -634,11 +652,11 @@ Func grayscale_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
cast<uint8_t>(value)
);

grayscale.compute_root()
grayscale.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();
Expand All @@ -662,13 +680,15 @@ Func invert_fn(Func input, Param<int32_t> width, Param<int32_t> height) {
);
invert(x, y, ch) = value;

invert.compute_root()
invert.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();

return invert;
}

Expand All @@ -688,13 +708,15 @@ Func brightness_fn(Func input, Param<int32_t> width, Param<int32_t> height, Para

brightness(x, y, ch) = cast<uint8_t>(value);

brightness.compute_root()
brightness.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();

return brightness;
}

Expand All @@ -719,13 +741,15 @@ Func gammacorrection_fn(Func input, Param<int32_t> width, Param<int32_t> height,
cast<uint8_t>(value)
);

gammacorrection.compute_root()
gammacorrection.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();

return gammacorrection;
}

Expand All @@ -749,13 +773,15 @@ Func contrast_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<

contrast(x, y, ch) = cast<uint8_t>(value);

contrast.compute_root()
contrast.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti)
.parallel(ti, 4)
.vectorize(xi, 32);

in.compute_root();

return contrast;
}

Expand Down Expand Up @@ -788,12 +814,11 @@ Func boxblur_fn(Func input, Param<int32_t> width, Param<int32_t> height, Param<u
.parallel(y, 8)
.vectorize(x, 32);

boxblur.compute_root()
.async()
boxblur.compute_at(in, x)
.tile(x, y, xo, yo, xi, yi, 32, 32)
.fuse(xo, yo, ti)
.parallel(ch)
.parallel(ti, 8)
.parallel(ti)
.vectorize(xi, 32);

in.compute_root();
Expand Down

0 comments on commit 7661853

Please sign in to comment.