diff --git a/BUILD b/BUILD
new file mode 100755
index 00000000..a4fe1e06
--- /dev/null
+++ b/BUILD
@@ -0,0 +1,33 @@
+# Description:
+#   Bazel build file for Guetzli.
+
+cc_library(
+    name = "guetzli_lib",
+    srcs = glob(
+        [
+            "guetzli/*.h",
+            "guetzli/*.cc",
+            "guetzli/*.inc",
+        ],
+        exclude = ["guetzli/guetzli.cc"],  # Compiled by the :guetzli binary.
+    ),
+    copts = [ "-Wno-sign-compare" ],  # Turns off sign-compare warnings.
+    deps = [
+        "@butteraugli//:butteraugli_lib",  # Repository declared in WORKSPACE.
+    ],
+)
+
+cc_binary(
+    name = "guetzli",
+    srcs = ["guetzli/guetzli.cc"],  # Kept out of :guetzli_lib's glob.
+    linkopts = [
+        # TODO(robryk): Remove once https://github.com/gflags/gflags/issues/176
+        # is fixed
+        "-lpthread",
+    ],
+    deps = [
+        ":guetzli_lib",
+        "//external:gflags",  # Bound to @gflags_git//:gflags in WORKSPACE.
+        "@png_archive//:png",  # libpng archive declared in WORKSPACE.
+    ],
+)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100755
index 00000000..ff0a8318
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,27 @@
+Want to contribute bugfixes or non-output-modifying changes (e.g. speedups)?
+Great! First, read this page (including the small print at the end). Want to
+contribute changes that modify the generated JPEG file? Talk to us first.
+
+### Before you contribute
+Before we can use your code, you must sign the
+[Google Individual Contributor License Agreement](https://cla.developers.google.com/about/google-individual)
+(CLA), which you can do online. The CLA is necessary mainly because you own the
+copyright to your changes, even after your contribution becomes part of our
+codebase, so we need your permission to use and distribute your code. We also
+need to be sure of various other things—for instance that you'll tell us if you
+know that your code infringes on other people's patents. You don't have to sign
+the CLA until after you've submitted your code for review and a member has
+approved it, but you must do it before we can put your code into our codebase.
+Before you start working on a larger contribution, you should get in touch with
+us first through the issue tracker with your idea so that we can help out and
+possibly guide you. Coordinating up front makes it much easier to avoid
+frustration later on.
+
+### Code reviews
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose.
+
+### The small print
+Contributions made by corporations are covered by a different agreement than
+the one above, the
+[Software Grant and Corporate Contributor License Agreement](https://cla.developers.google.com/about/google-corporate).
diff --git a/LICENSE b/LICENSE
new file mode 100755
index 00000000..d6456956
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/README.md b/README.md
new file mode 100755
index 00000000..f1c771b8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# Introduction
+
+Guetzli is a JPEG encoder that aims for excellent compression density at high
+visual quality. Guetzli-generated images are typically 20-30% smaller than
+images of equivalent quality generated by libjpeg. Guetzli generates only
+sequential (nonprogressive) JPEGs because of their faster decompression speed.
+
+# Building
+
+1.  Get a copy of the source code, either by cloning this repository, or by
+    downloading an
+    [archive](https://github.com/google/guetzli/archive/master.zip) and
+    unpacking it.
+2.  Install [Bazel](https://www.bazel.io) by following its [installation
+    instructions](https://www.bazel.io/versions/master/docs/install.html).
+3.  Run `bazel build -c opt :guetzli` in the directory this README file is in.
+
+# Using
+
+To try out Guetzli you need to [build](#building) or
+[download](https://github.com/google/guetzli/releases) the Guetzli binary. The
+binary reads a PNG or JPEG image and creates an optimized JPEG image:
+
+```bash
+bazel-bin/guetzli original.png output.jpg
+bazel-bin/guetzli original.jpg output.jpg
+```
+
+You can pass a `--quality Q` parameter to set quality in units equivalent to
+libjpeg quality. You can also pass a `--verbose` flag to see a trace of encoding
+attempts made.
+
+Please note that JPEG images do not support an alpha channel (transparency). If
+the input is a PNG with an alpha channel, it will be overlaid on a black
+background before encoding.
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100755
index 00000000..4d15742d
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,37 @@
+# Description:
+#   Bazel workspace file for Guetzli.
+
+workspace(name = "guetzli")
+
+new_http_archive(
+    name = "png_archive",  # libpng; BUILD uses it as @png_archive//:png.
+    build_file = "png.BUILD",
+    sha256 = "c35bcc6387495ee6e757507a68ba036d38ad05b415c2553b3debe2a57647a692",
+    strip_prefix = "libpng-1.2.53",
+    url = "http://github.com/glennrp/libpng/archive/v1.2.53.zip",
+)
+
+new_http_archive(
+    name = "zlib_archive",  # zlib; presumably needed by png.BUILD - confirm.
+    build_file = "zlib.BUILD",
+    sha256 = "36658cb768a54c1d4dec43c3116c27ed893e88b02ecfcb44f2166f9c0b7f2a0d",
+    strip_prefix = "zlib-1.2.8",
+    url = "http://zlib.net/zlib-1.2.8.tar.gz",
+)
+
+git_repository(
+    name = "gflags_git",
+    commit = "cce68f0c9c5d054017425e6e6fd54f696d36e8ee",  # Pinned revision.
+    remote = "https://github.com/gflags/gflags.git",
+)
+
+bind(
+    name = "gflags",  # Makes the library available as //external:gflags.
+    actual = "@gflags_git//:gflags",
+)
+
+git_repository(
+    name = "butteraugli",  # Supplies @butteraugli//:butteraugli_lib to BUILD.
+    commit = "037eff792f5b1cc7c21cc2cdd325de712c387e11",  # Pinned revision.
+    remote = "https://github.com/google/butteraugli",
+)
diff --git a/guetzli/butteraugli_comparator.cc b/guetzli/butteraugli_comparator.cc
new file mode 100755
index 00000000..da642959
--- /dev/null
+++ b/guetzli/butteraugli_comparator.cc
@@ -0,0 +1,229 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/butteraugli_comparator.h"
+
+#include "guetzli/debug_print.h"
+#include "guetzli/gamma_correct.h"
+#include "guetzli/score.h"
+
+namespace guetzli {
+
+namespace {
+using ::butteraugli::ConstRestrict;
+using ::butteraugli::ImageF;
+using ::butteraugli::CreatePlanes;
+using ::butteraugli::PlanesFromPacked;
+using ::butteraugli::PackedFromPlanes;
+
+// Expands interleaved 8-bit RGB through Srgb8ToLinearTable() into 3 planes.
+std::vector<ImageF> LinearRgb(const size_t xsize, const size_t ysize,
+                              const std::vector<uint8_t>& rgb) {
+  const double* const to_linear = Srgb8ToLinearTable();
+  std::vector<ImageF> result = CreatePlanes<float>(xsize, ysize, 3);
+  for (size_t row = 0; row < ysize; ++row) {
+    for (int ch = 0; ch < 3; ++ch) {
+      ConstRestrict<float*> dst = result[ch].Row(row);
+      for (size_t col = 0; col < xsize; ++col) {
+        dst[col] = to_linear[rgb[3 * (xsize * row + col) + ch]];
+      }
+    }
+  }
+  return result;
+}
+
+}  // namespace
+
+// Precomputes from the original `rgb`: whole-image opsin-dynamics planes, a
+// per-8x8-block opsin-dynamics copy, and the ::butteraugli::Mask() output.
+ButteraugliComparator::ButteraugliComparator(const int width, const int height,
+                                             const std::vector<uint8_t>& rgb,
+                                             const float target_distance,
+                                             ProcessStats* stats)
+    : width_(width),
+      height_(height),
+      target_distance_(target_distance),
+      comparator_(width_, height_, kButteraugliStep),
+      distance_(0.0),
+      // One zeroed float per pixel, not `width_` copies of `height_`.
+      distmap_(static_cast<size_t>(width_) * height_),
+      stats_(stats) {
+  rgb_linear_pregamma_ = LinearRgb(width, height, rgb);
+  const int block_w = (width_ + 7) / 8;
+  const int block_h = (height_ + 7) / 8;
+  per_block_pregamma_.resize(block_w * block_h);
+  for (int block_y = 0, bx = 0; block_y < block_h; ++block_y) {
+    for (int block_x = 0; block_x < block_w; ++block_x, ++bx) {
+      per_block_pregamma_[bx].resize(3, std::vector<float>(kDCTBlockSize));
+      for (int iy = 0, i = 0; iy < 8; ++iy) {
+        for (int ix = 0; ix < 8; ++ix, ++i) {
+          // Clamp so that partial edge blocks repeat the border pixels.
+          int x = std::min(8 * block_x + ix, width_ - 1);
+          int y = std::min(8 * block_y + iy, height_ - 1);
+          for (int c = 0; c < 3; ++c) {
+            per_block_pregamma_[bx][c][i] = rgb_linear_pregamma_[c].Row(y)[x];
+          }
+        }
+      }
+      ::butteraugli::OpsinDynamicsImage(8, 8, per_block_pregamma_[bx]);
+    }
+  }
+  std::vector<std::vector<float>> pregamma =
+      PackedFromPlanes(rgb_linear_pregamma_);
+  ::butteraugli::OpsinDynamicsImage(width_, height_, pregamma);
+  rgb_linear_pregamma_ = PlanesFromPacked(width_, height_, pregamma);
+  std::vector<std::vector<float> > dummy(3);  // Second Mask output is unused.
+  ::butteraugli::Mask(pregamma, pregamma, width_, height_, &mask_xyz_, &dummy);
+}
+
+// Diffs all of `img` against the original; updates distmap_ and distance_.
+void ButteraugliComparator::Compare(const OutputImage& img) {
+  std::vector<std::vector<float> > lin(3, std::vector<float>(width_ * height_));
+  img.ToLinearRGB(&lin);
+  ::butteraugli::OpsinDynamicsImage(width_, height_, lin);
+  const std::vector<ImageF> planes = PlanesFromPacked(width_, height_, lin);
+  ImageF diffmap;
+  comparator_.DiffmapOpsinDynamicsImage(rgb_linear_pregamma_, planes, diffmap);
+  distmap_.resize(width_ * height_);
+  CopyToPacked(diffmap, &distmap_);
+  distance_ = ::butteraugli::ButteraugliScoreFromDiffmap(diffmap);
+  GUETZLI_LOG(stats_, " BA[100.00%%] D[%6.4f]", distance_);
+}
+
+double ButteraugliComparator::CompareBlock(
+    const OutputImage& img, int block_x, int block_y) const {
+  int xmin = 8 * block_x;
+  int ymin = 8 * block_y;
+  int block_ix = block_y * ((width_ + 7) / 8) + block_x;
+  const std::vector<std::vector<float> >& rgb0_c =
+      per_block_pregamma_[block_ix];
+
+  std::vector<std::vector<float> > rgb1_c(3, std::vector<float>(kDCTBlockSize));
+  img.ToLinearRGB(xmin, ymin, 8, 8, &rgb1_c);
+  ::butteraugli::OpsinDynamicsImage(8, 8, rgb1_c);
+
+  std::vector<std::vector<float> > rgb0 = rgb0_c;
+  std::vector<std::vector<float> > rgb1 = rgb1_c;
+
+  ::butteraugli::MaskHighIntensityChange(8, 8, rgb0_c, rgb1_c, rgb0, rgb1);
+
+  double b0[3 * kDCTBlockSize];
+  double b1[3 * kDCTBlockSize];
+  for (int c = 0; c < 3; ++c) {
+    for (int ix = 0; ix < kDCTBlockSize; ++ix) {
+      b0[c * kDCTBlockSize + ix] = rgb0[c][ix];
+      b1[c * kDCTBlockSize + ix] = rgb1[c][ix];
+    }
+  }
+  double diff_xyz_dc[3] = { 0.0 };
+  double diff_xyz_ac[3] = { 0.0 };
+  double diff_xyz_edge_dc[3] = { 0.0 };
+  ::butteraugli::ButteraugliBlockDiff(
+       b0, b1, diff_xyz_dc, diff_xyz_ac, diff_xyz_edge_dc);
+
+  double scale[3];
+  for (int c = 0; c < 3; ++c) {
+    scale[c] = mask_xyz_[c][ymin * width_ + xmin];
+  }
+
+  static const double kEdgeWeight = 0.05;
+
+  double diff = 0.0;
+  double diff_edge = 0.0;
+  for (int c = 0; c < 3; ++c) {
+    diff += diff_xyz_dc[c] * scale[c];
+    diff += diff_xyz_ac[c] * scale[c];
+    diff_edge += diff_xyz_edge_dc[c] * scale[c];
+  }
+  return sqrt((1 - kEdgeWeight) * diff + kEdgeWeight * diff_edge);
+}
+
+float ButteraugliComparator::BlockErrorLimit() const {
+  return target_distance_;  // The limit is the constructor's target distance.
+}
+
+// Updates `block_weight` (row-major, one entry per block of 8 * factor_x by
+// 8 * factor_y pixels) from the per-pixel `distmap` (row stride width_).
+// direction > 0: a block within the target whose window stays within 1.1x the
+// target gets weight 1.0. Otherwise: a block above the midpoint of the target
+// and its window maximum raises window weights to at least 1 / (1 + d).
+void ButteraugliComparator::ComputeBlockErrorAdjustmentWeights(
+    int direction, int max_block_dist, double target_mul, int factor_x,
+    int factor_y, const std::vector<float>& distmap,
+    std::vector<float>* block_weight) {
+  const double target_distance = target_distance_ * target_mul;
+  const int sizex = 8 * factor_x;
+  const int sizey = 8 * factor_y;
+  const int block_width = (width_ + sizex - 1) / sizex;
+  const int block_height = (height_ + sizey - 1) / sizey;
+  // Pass 1: largest per-pixel distance inside every block.
+  std::vector<float> block_max(block_width * block_height);
+  for (int by = 0; by < block_height; ++by) {
+    for (int bx = 0; bx < block_width; ++bx) {
+      const int x_end = std::min(width_, sizex * (bx + 1));
+      const int y_end = std::min(height_, sizey * (by + 1));
+      float worst = 0.0;
+      for (int y = sizey * by; y < y_end; ++y) {
+        for (int x = sizex * bx; x < x_end; ++x) {
+          worst = std::max(worst, distmap[y * width_ + x]);
+        }
+      }
+      block_max[by * block_width + bx] = worst;
+    }
+  }
+  // Pass 2: judge each block against the blocks within max_block_dist of it.
+  for (int by = 0; by < block_height; ++by) {
+    for (int bx = 0; bx < block_width; ++bx) {
+      const float own = block_max[by * block_width + bx];
+      const int x_min = std::max(0, bx - max_block_dist);
+      const int y_min = std::max(0, by - max_block_dist);
+      const int x_max = std::min(block_width, bx + 1 + max_block_dist);
+      const int y_max = std::min(block_height, by + 1 + max_block_dist);
+      float window_max = target_distance;
+      for (int y = y_min; y < y_max; ++y) {
+        for (int x = x_min; x < x_max; ++x) {
+          window_max = std::max(window_max, block_max[y * block_width + x]);
+        }
+      }
+      if (direction > 0) {
+        // Only blocks that are comfortably within the target get marked.
+        if (own <= target_distance && window_max <= 1.1 * target_distance) {
+          (*block_weight)[by * block_width + bx] = 1.0;
+        }
+        continue;
+      }
+      constexpr double kLocalMaxWeight = 0.5;
+      const double tolerated = (1 - kLocalMaxWeight) * target_distance +
+                               kLocalMaxWeight * window_max;
+      if (own <= tolerated) continue;
+      // d is the Chebyshev distance, in blocks, from the offending block.
+      for (int y = y_min; y < y_max; ++y) {
+        for (int x = x_min; x < x_max; ++x) {
+          const int d = std::max(std::abs(y - by), std::abs(x - bx));
+          float& weight = (*block_weight)[y * block_width + x];
+          weight = std::max<float>(weight, 1.0 / (d + 1.0));
+        }
+      }
+    }
+  }
+}
+
+double ButteraugliComparator::ScoreOutputSize(int size) const {
+  return ScoreJPEG(distance_, size, target_distance_);  // Last Compare() result.
+}
+
+
+}  // namespace guetzli
diff --git a/guetzli/butteraugli_comparator.h b/guetzli/butteraugli_comparator.h
new file mode 100755
index 00000000..d9ae0b0a
--- /dev/null
+++ b/guetzli/butteraugli_comparator.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_BUTTERAUGLI_COMPARATOR_H_
+#define GUETZLI_BUTTERAUGLI_COMPARATOR_H_
+
+#include <vector>
+
+#include "butteraugli/butteraugli.h"
+#include "guetzli/comparator.h"
+#include "guetzli/jpeg_data.h"
+#include "guetzli/output_image.h"
+#include "guetzli/stats.h"
+
+namespace guetzli {
+
+constexpr int kButteraugliStep = 3;  // Third constructor arg of comparator_.
+
+class ButteraugliComparator : public Comparator {  // Butteraugli-based scoring.
+ public:
+  ButteraugliComparator(const int width, const int height,
+                        const std::vector<uint8_t>& rgb,
+                        const float target_distance, ProcessStats* stats);
+
+  void Compare(const OutputImage& img) override;
+  // Scores only the 8x8 block at block coordinates (block_x, block_y).
+  double CompareBlock(const OutputImage& img,
+                      int block_x, int block_y) const override;
+  // Combines `size` with the distance from the last Compare() via ScoreJPEG().
+  double ScoreOutputSize(int size) const override;
+  // True when the last distance is at most target_mul * the target distance.
+  bool DistanceOK(double target_mul) const override {
+    return distance_ <= target_mul * target_distance_;
+  }
+  // Results stored by the last Compare(); distmap() returns a copy.
+  const std::vector<float> distmap() const override { return distmap_; }
+  float distmap_aggregate() const override { return distance_; }
+  // Returns the target distance given to the constructor.
+  float BlockErrorLimit() const override;
+  // Updates `block_weight` in place from the per-pixel `distmap`.
+  void ComputeBlockErrorAdjustmentWeights(
+      int direction, int max_block_dist, double target_mul, int factor_x,
+      int factor_y, const std::vector<float>& distmap,
+      std::vector<float>* block_weight) override;
+
+ private:
+  const int width_;
+  const int height_;
+  const float target_distance_;
+  std::vector<::butteraugli::ImageF> rgb_linear_pregamma_;
+  std::vector<std::vector<float>> mask_xyz_;  // Filled by ::butteraugli::Mask.
+  std::vector<std::vector<std::vector<float>>> per_block_pregamma_;
+  ::butteraugli::ButteraugliComparator comparator_;
+  float distance_;  // Score from the last Compare(); 0 before the first one.
+  std::vector<float> distmap_;  // Per-pixel map from the last Compare().
+  ProcessStats* stats_;
+};
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_BUTTERAUGLI_COMPARATOR_H_
diff --git a/guetzli/color_transform.h b/guetzli/color_transform.h
new file mode 100755
index 00000000..850b2d0b
--- /dev/null
+++ b/guetzli/color_transform.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_COLOR_TRANSFORM_H_
+#define GUETZLI_COLOR_TRANSFORM_H_
+
+namespace guetzli {
+
+static const int kCrToRedTable[256] = {  // round(1.402 * (cr - 128))
+  -179, -178, -177, -175, -174, -172, -171, -170, -168, -167, -165, -164,
+  -163, -161, -160, -158, -157, -156, -154, -153, -151, -150, -149, -147,
+  -146, -144, -143, -142, -140, -139, -137, -136, -135, -133, -132, -130,
+  -129, -128, -126, -125, -123, -122, -121, -119, -118, -116, -115, -114,
+  -112, -111, -109, -108, -107, -105, -104, -102, -101, -100,  -98,  -97,
+   -95,  -94,  -93,  -91,  -90,  -88,  -87,  -86,  -84,  -83,  -81,  -80,
+   -79,  -77,  -76,  -74,  -73,  -72,  -70,  -69,  -67,  -66,  -64,  -63,
+   -62,  -60,  -59,  -57,  -56,  -55,  -53,  -52,  -50,  -49,  -48,  -46,
+   -45,  -43,  -42,  -41,  -39,  -38,  -36,  -35,  -34,  -32,  -31,  -29,
+   -28,  -27,  -25,  -24,  -22,  -21,  -20,  -18,  -17,  -15,  -14,  -13,
+   -11,  -10,   -8,   -7,   -6,   -4,   -3,   -1,    0,    1,    3,    4,
+     6,    7,    8,   10,   11,   13,   14,   15,   17,   18,   20,   21,
+    22,   24,   25,   27,   28,   29,   31,   32,   34,   35,   36,   38,
+    39,   41,   42,   43,   45,   46,   48,   49,   50,   52,   53,   55,
+    56,   57,   59,   60,   62,   63,   64,   66,   67,   69,   70,   72,
+    73,   74,   76,   77,   79,   80,   81,   83,   84,   86,   87,   88,
+    90,   91,   93,   94,   95,   97,   98,  100,  101,  102,  104,  105,
+   107,  108,  109,  111,  112,  114,  115,  116,  118,  119,  121,  122,
+   123,  125,  126,  128,  129,  130,  132,  133,  135,  136,  137,  139,
+   140,  142,  143,  144,  146,  147,  149,  150,  151,  153,  154,  156,
+   157,  158,  160,  161,  163,  164,  165,  167,  168,  170,  171,  172,
+   174,  175,  177,  178
+};
+
+static const int kCbToBlueTable[256] = {  // round(1.772 * (cb - 128))
+  -227, -225, -223, -222, -220, -218, -216, -214, -213, -211, -209, -207,
+  -206, -204, -202, -200, -198, -197, -195, -193, -191, -190, -188, -186,
+  -184, -183, -181, -179, -177, -175, -174, -172, -170, -168, -167, -165,
+  -163, -161, -159, -158, -156, -154, -152, -151, -149, -147, -145, -144,
+  -142, -140, -138, -136, -135, -133, -131, -129, -128, -126, -124, -122,
+  -120, -119, -117, -115, -113, -112, -110, -108, -106, -105, -103, -101,
+   -99,  -97,  -96,  -94,  -92,  -90,  -89,  -87,  -85,  -83,  -82,  -80,
+   -78,  -76,  -74,  -73,  -71,  -69,  -67,  -66,  -64,  -62,  -60,  -58,
+   -57,  -55,  -53,  -51,  -50,  -48,  -46,  -44,  -43,  -41,  -39,  -37,
+   -35,  -34,  -32,  -30,  -28,  -27,  -25,  -23,  -21,  -19,  -18,  -16,
+   -14,  -12,  -11,   -9,   -7,   -5,   -4,   -2,    0,    2,    4,    5,
+     7,    9,   11,   12,   14,   16,   18,   19,   21,   23,   25,   27,
+    28,   30,   32,   34,   35,   37,   39,   41,   43,   44,   46,   48,
+    50,   51,   53,   55,   57,   58,   60,   62,   64,   66,   67,   69,
+    71,   73,   74,   76,   78,   80,   82,   83,   85,   87,   89,   90,
+    92,   94,   96,   97,   99,  101,  103,  105,  106,  108,  110,  112,
+   113,  115,  117,  119,  120,  122,  124,  126,  128,  129,  131,  133,
+   135,  136,  138,  140,  142,  144,  145,  147,  149,  151,  152,  154,
+   156,  158,  159,  161,  163,  165,  167,  168,  170,  172,  174,  175,
+   177,  179,  181,  183,  184,  186,  188,  190,  191,  193,  195,  197,
+   198,  200,  202,  204,  206,  207,  209,  211,  213,  214,  216,  218,
+   220,  222,  223,  225,
+};
+
+static const int kCrToGreenTable[256] = {  // -46802 * (cr - 128), i.e. << 16
+  5990656,  5943854,  5897052,  5850250,  5803448,  5756646,  5709844,  5663042,
+  5616240,  5569438,  5522636,  5475834,  5429032,  5382230,  5335428,  5288626,
+  5241824,  5195022,  5148220,  5101418,  5054616,  5007814,  4961012,  4914210,
+  4867408,  4820606,  4773804,  4727002,  4680200,  4633398,  4586596,  4539794,
+  4492992,  4446190,  4399388,  4352586,  4305784,  4258982,  4212180,  4165378,
+  4118576,  4071774,  4024972,  3978170,  3931368,  3884566,  3837764,  3790962,
+  3744160,  3697358,  3650556,  3603754,  3556952,  3510150,  3463348,  3416546,
+  3369744,  3322942,  3276140,  3229338,  3182536,  3135734,  3088932,  3042130,
+  2995328,  2948526,  2901724,  2854922,  2808120,  2761318,  2714516,  2667714,
+  2620912,  2574110,  2527308,  2480506,  2433704,  2386902,  2340100,  2293298,
+  2246496,  2199694,  2152892,  2106090,  2059288,  2012486,  1965684,  1918882,
+  1872080,  1825278,  1778476,  1731674,  1684872,  1638070,  1591268,  1544466,
+  1497664,  1450862,  1404060,  1357258,  1310456,  1263654,  1216852,  1170050,
+  1123248,  1076446,  1029644,   982842,   936040,   889238,   842436,   795634,
+   748832,   702030,   655228,   608426,   561624,   514822,   468020,   421218,
+   374416,   327614,   280812,   234010,   187208,   140406,    93604,    46802,
+        0,   -46802,   -93604,  -140406,  -187208,  -234010,  -280812,  -327614,
+  -374416,  -421218,  -468020,  -514822,  -561624,  -608426,  -655228,  -702030,
+  -748832,  -795634,  -842436,  -889238,  -936040,  -982842, -1029644, -1076446,
+ -1123248, -1170050, -1216852, -1263654, -1310456, -1357258, -1404060, -1450862,
+ -1497664, -1544466, -1591268, -1638070, -1684872, -1731674, -1778476, -1825278,
+ -1872080, -1918882, -1965684, -2012486, -2059288, -2106090, -2152892, -2199694,
+ -2246496, -2293298, -2340100, -2386902, -2433704, -2480506, -2527308, -2574110,
+ -2620912, -2667714, -2714516, -2761318, -2808120, -2854922, -2901724, -2948526,
+ -2995328, -3042130, -3088932, -3135734, -3182536, -3229338, -3276140, -3322942,
+ -3369744, -3416546, -3463348, -3510150, -3556952, -3603754, -3650556, -3697358,
+ -3744160, -3790962, -3837764, -3884566, -3931368, -3978170, -4024972, -4071774,
+ -4118576, -4165378, -4212180, -4258982, -4305784, -4352586, -4399388, -4446190,
+ -4492992, -4539794, -4586596, -4633398, -4680200, -4727002, -4773804, -4820606,
+ -4867408, -4914210, -4961012, -5007814, -5054616, -5101418, -5148220, -5195022,
+ -5241824, -5288626, -5335428, -5382230, -5429032, -5475834, -5522636, -5569438,
+ -5616240, -5663042, -5709844, -5756646, -5803448, -5850250, -5897052, -5943854,
+};
+
+static const int kCbToGreenTable[256] = {  // -22554 * (cb - 128) + (1 << 15)
+  2919680,  2897126,  2874572,  2852018,  2829464,  2806910,  2784356,  2761802,
+  2739248,  2716694,  2694140,  2671586,  2649032,  2626478,  2603924,  2581370,
+  2558816,  2536262,  2513708,  2491154,  2468600,  2446046,  2423492,  2400938,
+  2378384,  2355830,  2333276,  2310722,  2288168,  2265614,  2243060,  2220506,
+  2197952,  2175398,  2152844,  2130290,  2107736,  2085182,  2062628,  2040074,
+  2017520,  1994966,  1972412,  1949858,  1927304,  1904750,  1882196,  1859642,
+  1837088,  1814534,  1791980,  1769426,  1746872,  1724318,  1701764,  1679210,
+  1656656,  1634102,  1611548,  1588994,  1566440,  1543886,  1521332,  1498778,
+  1476224,  1453670,  1431116,  1408562,  1386008,  1363454,  1340900,  1318346,
+  1295792,  1273238,  1250684,  1228130,  1205576,  1183022,  1160468,  1137914,
+  1115360,  1092806,  1070252,  1047698,  1025144,  1002590,   980036,   957482,
+   934928,   912374,   889820,   867266,   844712,   822158,   799604,   777050,
+   754496,   731942,   709388,   686834,   664280,   641726,   619172,   596618,
+   574064,   551510,   528956,   506402,   483848,   461294,   438740,   416186,
+   393632,   371078,   348524,   325970,   303416,   280862,   258308,   235754,
+   213200,   190646,   168092,   145538,   122984,   100430,    77876,    55322,
+    32768,    10214,   -12340,   -34894,   -57448,   -80002,  -102556,  -125110,
+  -147664,  -170218,  -192772,  -215326,  -237880,  -260434,  -282988,  -305542,
+  -328096,  -350650,  -373204,  -395758,  -418312,  -440866,  -463420,  -485974,
+  -508528,  -531082,  -553636,  -576190,  -598744,  -621298,  -643852,  -666406,
+  -688960,  -711514,  -734068,  -756622,  -779176,  -801730,  -824284,  -846838,
+  -869392,  -891946,  -914500,  -937054,  -959608,  -982162, -1004716, -1027270,
+ -1049824, -1072378, -1094932, -1117486, -1140040, -1162594, -1185148, -1207702,
+ -1230256, -1252810, -1275364, -1297918, -1320472, -1343026, -1365580, -1388134,
+ -1410688, -1433242, -1455796, -1478350, -1500904, -1523458, -1546012, -1568566,
+ -1591120, -1613674, -1636228, -1658782, -1681336, -1703890, -1726444, -1748998,
+ -1771552, -1794106, -1816660, -1839214, -1861768, -1884322, -1906876, -1929430,
+ -1951984, -1974538, -1997092, -2019646, -2042200, -2064754, -2087308, -2109862,
+ -2132416, -2154970, -2177524, -2200078, -2222632, -2245186, -2267740, -2290294,
+ -2312848, -2335402, -2357956, -2380510, -2403064, -2425618, -2448172, -2470726,
+ -2493280, -2515834, -2538388, -2560942, -2583496, -2606050, -2628604, -2651158,
+ -2673712, -2696266, -2718820, -2741374, -2763928, -2786482, -2809036, -2831590,
+};
+
+static const uint8_t kRangeLimitLut[4 * 256] = {  // [i] = clamp(i - 384, 0, 255)
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
+  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,
+  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,
+  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,
+  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,
+  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,  79,
+  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,
+  96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+ 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+ 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+ 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+ 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207,
+ 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+ 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
+ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
+};
+
+static const uint8_t* const kRangeLimit = kRangeLimitLut + 384;  // [-384, 639]
+
+// In place: pixel[0..2] = (Y, Cb, Cr) on entry, (R, G, B) on return.
+inline void ColorTransformYCbCrToRGB(uint8_t* pixel) {
+  const int cb = pixel[1];
+  const int cr = pixel[2];
+  const uint8_t* const clamped = kRangeLimit + pixel[0];  // read Y before writes
+  pixel[0] = clamped[kCrToRedTable[cr]];
+  pixel[1] = clamped[(kCrToGreenTable[cr] + kCbToGreenTable[cb]) >> 16];
+  pixel[2] = clamped[kCbToBlueTable[cb]];
+}
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_COLOR_TRANSFORM_H_
diff --git a/guetzli/comparator.h b/guetzli/comparator.h
new file mode 100755
index 00000000..6c49caf0
--- /dev/null
+++ b/guetzli/comparator.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_COMPARATOR_H_
+#define GUETZLI_COMPARATOR_H_
+
+#include <vector>
+
+#include "guetzli/output_image.h"
+#include "guetzli/stats.h"
+
+namespace guetzli {
+
+// Represents a baseline image, a comparison metric and an image acceptance
+// criterion based on this metric.
+class Comparator {
+ public:
+  Comparator() {}
+  virtual ~Comparator() {}
+
+  // Compares img with the baseline image and saves the resulting distance map
+  // inside the object. The provided image must have the same dimensions as the
+  // baseline image.
+  virtual void Compare(const OutputImage& img) = 0;
+
+  // Compares an 8x8 block of the baseline image with the same block of img and
+  // returns the resulting per-block distance. The interpretation of the
+  // returned distance depends on the comparator used.
+  virtual double CompareBlock(const OutputImage& img,
+                              int block_x, int block_y) const = 0;
+
+  // Returns the combined score of the output image in the last Compare() call
+  // (or the baseline image, if Compare() was not called yet), based on output
+  // size and the similarity metric.
+  virtual double ScoreOutputSize(int size) const = 0;
+
+  // Returns true if the argument of the last Compare() call (or the baseline
+  // image, if Compare() was not called yet) meets the image acceptance
+  // criterion. The target_mul modifies the acceptance criterion used in this
+  // call in the following way:
+  //    = 1.0 : the original acceptance criterion is used,
+  //    < 1.0 : a stricter acceptance criterion is used,
+  //    > 1.0 : a less strict acceptance criterion is used.
+  virtual bool DistanceOK(double target_mul) const = 0;
+
+  // Returns the distance map between the baseline image and the image in the
+  // last Compare() call (or the baseline image, if Compare() was not called
+  // yet). The map has the same dimensions as the baseline image and is
+  // returned by value, i.e. it is copied on every call.
+  // The interpretation of the distance values depends on the comparator used.
+  virtual const std::vector<float> distmap() const = 0;
+
+  // Returns an aggregate distance or similarity value between the baseline
+  // image and the image in the last Compare() call (or the baseline image, if
+  // Compare() was not called yet).
+  // The interpretation of this aggregate value depends on the comparator used.
+  virtual float distmap_aggregate() const = 0;
+
+  // Returns a heuristic cutoff on block errors in the sense that we won't
+  // consider distortions where a block error is greater than this.
+  virtual float BlockErrorLimit() const = 0;
+  // Given the search direction (+1 for upwards and -1 for downwards) and the
+  // current distance map, fills in the *block_weight image with the relative
+  // block error adjustment weights.
+  // The target_mul param has the same semantics as in DistanceOK().
+  // Note that this is essentially a static function in the sense that it does
+  // not depend on the last Compare() call.
+  virtual void ComputeBlockErrorAdjustmentWeights(
+      int direction, int max_block_dist, double target_mul, int factor_x,
+      int factor_y, const std::vector<float>& distmap,
+      std::vector<float>* block_weight) = 0;
+};
+
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_COMPARATOR_H_
diff --git a/guetzli/dct_double.cc b/guetzli/dct_double.cc
new file mode 100755
index 00000000..037381d7
--- /dev/null
+++ b/guetzli/dct_double.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/dct_double.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace guetzli {
+
+namespace {
+
+// kDCTMatrix[8*u+x] = 0.5*alpha(u)*cos((2*x+1)*u*M_PI/16), where alpha(0) =
+// 1/sqrt(2) and alpha(u) = 1 for u > 0. Entries are given to 10 decimals.
+static const double kDCTMatrix[64] = {  // 8 rows (u), 8 entries (x) per row
+  0.3535533906,  0.3535533906,  0.3535533906,  0.3535533906,
+  0.3535533906,  0.3535533906,  0.3535533906,  0.3535533906,
+  0.4903926402,  0.4157348062,  0.2777851165,  0.0975451610,
+ -0.0975451610, -0.2777851165, -0.4157348062, -0.4903926402,
+  0.4619397663,  0.1913417162, -0.1913417162, -0.4619397663,
+ -0.4619397663, -0.1913417162,  0.1913417162,  0.4619397663,
+  0.4157348062, -0.0975451610, -0.4903926402, -0.2777851165,
+  0.2777851165,  0.4903926402,  0.0975451610, -0.4157348062,
+  0.3535533906, -0.3535533906, -0.3535533906,  0.3535533906,
+  0.3535533906, -0.3535533906, -0.3535533906,  0.3535533906,
+  0.2777851165, -0.4903926402,  0.0975451610,  0.4157348062,
+ -0.4157348062, -0.0975451610,  0.4903926402, -0.2777851165,
+  0.1913417162, -0.4619397663,  0.4619397663, -0.1913417162,
+ -0.1913417162,  0.4619397663, -0.4619397663,  0.1913417162,
+  0.0975451610, -0.2777851165,  0.4157348062, -0.4903926402,
+  0.4903926402, -0.4157348062,  0.2777851165, -0.0975451610,
+};
+
+// out[k * stride] = sum over n of kDCTMatrix[8 * k + n] * in[n * stride].
+void DCT1d(const double* in, int stride, double* out) {
+  for (int k = 0; k < 8; ++k) {
+    double& acc = out[k * stride];
+    acc = 0.0;
+    for (int n = 0; n < 8; ++n) acc += kDCTMatrix[8 * k + n] * in[n * stride];
+  }
+}
+
+// out[n * stride] = sum over k of kDCTMatrix[8 * k + n] * in[k * stride].
+void IDCT1d(const double* in, int stride, double* out) {
+  for (int n = 0; n < 8; ++n) {
+    double& acc = out[n * stride];
+    acc = 0.0;
+    for (int k = 0; k < 8; ++k) acc += kDCTMatrix[8 * k + n] * in[k * stride];
+  }
+}
+
+// Common signature of the strided 8-point transforms DCT1d and IDCT1d.
+using Transform1d = void (*)(const double* in, int stride, double* out);
+
+// Separable 2-D transform of an 8x8 block, written back into block: f first
+// maps every column of block into tmp (stride 8), then every row of tmp back
+// into block (stride 1).
+void TransformBlock(double block[64], Transform1d f) {
+  double tmp[64];
+  for (int col = 0; col < 8; ++col) f(block + col, 8, tmp + col);
+  for (int row = 0; row < 64; row += 8) f(tmp + row, 1, block + row);
+}
+
+}  // namespace
+
+void ComputeBlockDCTDouble(double block[64]) {
+  TransformBlock(block, DCT1d);  // forward pass: columns, then rows, in place
+}
+
+void ComputeBlockIDCTDouble(double block[64]) {
+  TransformBlock(block, IDCT1d);  // inverse pass: columns, then rows, in place
+}
+
+}  // namespace guetzli
diff --git a/guetzli/dct_double.h b/guetzli/dct_double.h
new file mode 100755
index 00000000..636b9f9c
--- /dev/null
+++ b/guetzli/dct_double.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_DCT_DOUBLE_H_
+#define GUETZLI_DCT_DOUBLE_H_
+
+namespace guetzli {
+
+// Performs in-place floating point 8x8 DCT on block[0..63].
+// Note that the DCT used here is the DCT-2 with the first term multiplied by
+// 1/sqrt(2) and the result scaled by 1/2.
+void ComputeBlockDCTDouble(double block[64]);
+
+// Performs in-place floating point 8x8 inverse DCT on block[0..63].
+void ComputeBlockIDCTDouble(double block[64]);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_DCT_DOUBLE_H_
diff --git a/guetzli/debug_print.cc b/guetzli/debug_print.cc
new file mode 100755
index 00000000..b4fbf8c1
--- /dev/null
+++ b/guetzli/debug_print.cc
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/debug_print.h"
+
+namespace guetzli {
+
+// Forwards s to every debug sink that is set on stats: it is appended to
+// *debug_output and written to debug_output_file.
+void PrintDebug(ProcessStats* stats, std::string s) {
+  if (stats->debug_output) stats->debug_output->append(s);
+  if (!stats->debug_output_file) return;
+  // Printed through "%s" so that '%' characters in s are not interpreted.
+  fprintf(stats->debug_output_file, "%s", s.c_str());
+}
+
+}  // namespace guetzli
diff --git a/guetzli/debug_print.h b/guetzli/debug_print.h
new file mode 100755
index 00000000..5ba0135d
--- /dev/null
+++ b/guetzli/debug_print.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_DEBUG_PRINT_H_
+#define GUETZLI_DEBUG_PRINT_H_
+
+#include "guetzli/stats.h"
+
+namespace guetzli {
+
+void PrintDebug(ProcessStats* stats, std::string s);
+
+}  // namespace guetzli
+
+#define GUETZLI_LOG(stats, ...)  /* printf-style debug logging */  \
+  do {                                                             \
+    char debug_string[1024];  /* longer messages are truncated */  \
+    snprintf(debug_string, sizeof(debug_string), __VA_ARGS__);     \
+    debug_string[sizeof(debug_string) - 1] = '\0';                 \
+    ::guetzli::PrintDebug(                      \
+         stats, std::string(debug_string));        \
+  } while (0)
+#define GUETZLI_LOG_QUANT(stats, q) /* logs (q)[0..2][0..63]; expands */ \
+  for (int y = 0; y < 8; ++y) {     /* to a bare for statement, so    */ \
+    for (int c = 0; c < 3; ++c) {   /* brace it inside if/else.       */ \
+      for (int x = 0; x < 8; ++x)                      \
+        GUETZLI_LOG(stats, " %2d", (q)[c][8 * y + x]); \
+      GUETZLI_LOG(stats, "   ");                       \
+    }                                                  \
+    GUETZLI_LOG(stats, "\n");                          \
+  }
+
+#endif  // GUETZLI_DEBUG_PRINT_H_
diff --git a/guetzli/entropy_encode.cc b/guetzli/entropy_encode.cc
new file mode 100755
index 00000000..632fd210
--- /dev/null
+++ b/guetzli/entropy_encode.cc
@@ -0,0 +1,147 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Entropy encoding (Huffman) utilities.
+
+#include "guetzli/entropy_encode.h"
+
+#include <assert.h>
+#include <algorithm>
+
+namespace guetzli {
+
+// Stores the depth of every leaf below node p0 into depth[]. Returns false as
+// soon as the walk would go deeper than max_depth (which must be <= 16).
+bool SetDepth(int p0, HuffmanTree *pool, uint8_t *depth, int max_depth) {
+  assert(max_depth <= 16);
+  int pending[17];  // pending[d]: right child still to visit at depth d, or -1
+  pending[0] = -1;
+  int d = 0;
+  for (int node = p0;;) {
+    // Walk down the left spine, remembering each right sibling on the way.
+    while (pool[node].index_left_ >= 0) {
+      if (++d > max_depth) return false;
+      pending[d] = pool[node].index_right_or_value_;
+      node = pool[node].index_left_;
+    }
+    depth[pool[node].index_right_or_value_] = static_cast<uint8_t>(d);
+    // Climb back to the deepest level that still has a right child queued.
+    while (d >= 0 && pending[d] == -1) --d;
+    if (d < 0) return true;
+    node = pending[d];
+    pending[d] = -1;
+  }
+}
+
+// Ordering used to sort the root nodes: lower total_count_ first; nodes with
+// equal counts are ordered by descending index_right_or_value_.
+static inline bool SortHuffmanTree(const HuffmanTree& v0,
+                                   const HuffmanTree& v1) {
+  return v0.total_count_ == v1.total_count_
+             ? v1.index_right_or_value_ < v0.index_right_or_value_
+             : v0.total_count_ < v1.total_count_;
+}
+
+// Builds a length-limited Huffman code for the histogram data[0..length).
+//
+// On return depth[s] holds the code length of every symbol s whose count is
+// non-zero; entries of symbols with a zero count are not written. tree is
+// scratch space of at least 2 * length + 1 nodes.
+//
+// No code may be longer than tree_limit bits. When the plain Huffman tree is
+// too deep, it is rebuilt with every non-zero count raised to at least
+// count_limit, and count_limit is doubled until the tree fits.
+//
+// If no symbol has a non-zero count (this includes length == 0), the function
+// returns without writing to depth.
+//
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t *data,
+                       const size_t length,
+                       const int tree_limit,
+                       HuffmanTree* tree,
+                       uint8_t *depth) {
+  // For block sizes below 64 kB, we never need to do a second iteration
+  // of this loop. Probably all of our block sizes will be smaller than
+  // that, so this loop is mostly of academic interest. If we actually
+  // would need this, we would be better off with the Katajainen algorithm.
+  for (uint32_t count_limit = 1; ; count_limit *= 2) {
+    size_t n = 0;
+    for (size_t i = length; i != 0;) {
+      --i;
+      if (data[i]) {
+        const uint32_t count = std::max<uint32_t>(data[i], count_limit);
+        tree[n++] = HuffmanTree(count, -1, static_cast<int16_t>(i));
+      }
+    }
+
+    if (n == 0) break;  // Empty histogram: "n - 1" below would wrap around.
+    if (n == 1) {
+      depth[tree[0].index_right_or_value_] = 1;      // Only one element.
+      break;
+    }
+
+    std::sort(tree, tree + n, SortHuffmanTree);
+
+    // The nodes are:
+    // [0, n): the sorted leaf nodes that we start with.
+    // [n]: we add a sentinel here.
+    // [n + 1, 2n): new parent nodes are added here, starting from
+    //              (n+1). These are naturally in ascending order.
+    // [2n]: we add a sentinel at the end as well.
+    // There will be (2n+1) elements at the end.
+    const HuffmanTree sentinel(~static_cast<uint32_t>(0), -1, -1);
+    tree[n] = sentinel;
+    tree[n + 1] = sentinel;
+
+    size_t i = 0;      // Points to the next leaf node.
+    size_t j = n + 1;  // Points to the next non-leaf node.
+    for (size_t k = n - 1; k != 0; --k) {
+      size_t left, right;
+      if (tree[i].total_count_ <= tree[j].total_count_) {
+        left = i;
+        ++i;
+      } else {
+        left = j;
+        ++j;
+      }
+      if (tree[i].total_count_ <= tree[j].total_count_) {
+        right = i;
+        ++i;
+      } else {
+        right = j;
+        ++j;
+      }
+
+      // The sentinel node becomes the parent node.
+      size_t j_end = 2 * n - k;
+      tree[j_end].total_count_ =
+          tree[left].total_count_ + tree[right].total_count_;
+      tree[j_end].index_left_ = static_cast<int16_t>(left);
+      tree[j_end].index_right_or_value_ = static_cast<int16_t>(right);
+
+      // Add back the last sentinel node.
+      tree[j_end + 1] = sentinel;
+    }
+    if (SetDepth(static_cast<int>(2 * n - 1), &tree[0], depth, tree_limit)) {
+      /* We need to pack the Huffman tree in tree_limit bits. If this was not
+         successful, add fake entities to the lowest values and retry. */
+      break;
+    }
+  }
+}
+
+}  // namespace guetzli
diff --git a/guetzli/entropy_encode.h b/guetzli/entropy_encode.h
new file mode 100755
index 00000000..1b6b84cc
--- /dev/null
+++ b/guetzli/entropy_encode.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Entropy encoding (Huffman) utilities.
+
+#ifndef GUETZLI_ENTROPY_ENCODE_H_
+#define GUETZLI_ENTROPY_ENCODE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+namespace guetzli {
+
+// A node of a Huffman tree.
+struct HuffmanTree {
+  // Leaves every field uninitialized; the caller is expected to assign them.
+  HuffmanTree() {}
+  HuffmanTree(uint32_t count, int16_t left, int16_t right)
+      : total_count_(count), index_left_(left), index_right_or_value_(right) {}
+
+  uint32_t total_count_;          // Summed population count below this node.
+  int16_t index_left_;            // Index of the left child node.
+  // Index of the right child node or, for a leaf, the symbol value.
+  int16_t index_right_or_value_;
+};
+
+// NOTE(review): as used by CreateHuffmanTree(), p is the root's index in pool
+// and the result is false when the codes do not fit in max_depth bits.
+bool SetDepth(int p, HuffmanTree *pool, uint8_t *depth, int max_depth);
+
+// Creates a Huffman tree for the given symbol histogram.
+//
+// (data, length) holds the population count of every symbol, and tree_limit
+// is the maximum bit depth allowed for the resulting Huffman codes.
+//
+// On return, depth[] holds the number of bits used to code each symbol.
+//
+// The tree itself is built in the tree[] array, which has to be at least
+// 2 * length + 1 elements long.
+// See http://en.wikipedia.org/wiki/Huffman_coding
+void CreateHuffmanTree(const uint32_t *data,
+                       const size_t length,
+                       const int tree_limit,
+                       HuffmanTree* tree,
+                       uint8_t *depth);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_ENTROPY_ENCODE_H_
diff --git a/guetzli/fast_log.h b/guetzli/fast_log.h
new file mode 100755
index 00000000..48677b79
--- /dev/null
+++ b/guetzli/fast_log.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_FAST_LOG_H_
+#define GUETZLI_FAST_LOG_H_
+
+#include <math.h>
+
+namespace guetzli {
+
+// Returns floor(log2(n)). Only meaningful for non-zero n.
+inline int Log2FloorNonZero(uint32_t n) {
+#ifndef __GNUC__
+  int halvings = 0;  // Portable fallback: count how often n can be halved.
+  for (n >>= 1; n != 0; n >>= 1) ++halvings;
+  return halvings;
+#else
+  return 31 ^ __builtin_clz(n);  // Bit index of the highest set bit.
+#endif
+}
+
+// Returns floor(log2(n)), or -1 when n is zero.
+inline int Log2Floor(uint32_t n) { return n ? Log2FloorNonZero(n) : -1; }
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_FAST_LOG_H_
diff --git a/guetzli/fdct.cc b/guetzli/fdct.cc
new file mode 100755
index 00000000..137893d9
--- /dev/null
+++ b/guetzli/fdct.cc
@@ -0,0 +1,242 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Integer implementation of the Discrete Cosine Transform (DCT)
+//
+// Note! DCT output is kept scaled by 16, to retain maximum 16bit precision
+
+#include "guetzli/fdct.h"
+
+namespace guetzli {
+
+namespace {
+
+///////////////////////////////////////////////////////////////////////////////
+// Cosine table: C(k) = cos(k.pi/16)/sqrt(2), k = 1..7 using 15 bits signed
+const coeff_t kTable04[7] = { 22725, 21407, 19266, 16384, 12873,  8867, 4520 };
+// rows #1 and #7 are pre-multiplied by 2.C(1) before the 2nd pass.
+// This multiply is merged in the table of constants used during 1st pass:
+const coeff_t kTable17[7] = { 31521, 29692, 26722, 22725, 17855, 12299, 6270 };
+// rows #2 and #6 are pre-multiplied by 2.C(2):
+const coeff_t kTable26[7] = { 29692, 27969, 25172, 21407, 16819, 11585, 5906 };
+// rows #3 and #5 are pre-multiplied by 2.C(3):
+const coeff_t kTable35[7] = { 26722, 25172, 22654, 19266, 15137, 10426, 5315 };
+
+///////////////////////////////////////////////////////////////////////////////
+// Constants (15bit precision) and C macros for IDCT vertical pass
+
+#define kTan1   (13036)   // = tan(pi/16)
+#define kTan2   (27146)   // = tan(2.pi/16) = sqrt(2) - 1.
+#define kTan3m1 (-21746)  // = tan(3.pi/16) - 1
+#define k2Sqrt2 (23170)   // = 1 / 2.sqrt(2)
+
+  // performs: {a,b} <- {a-b, a+b}, without saturation
+#define BUTTERFLY(a, b) do {   \
+  SUB((a), (b));               \
+  ADD((b), (b));               \
+  ADD((b), (a));               \
+} while (0)
+
+///////////////////////////////////////////////////////////////////////////////
+// Constants for DCT horizontal pass
+
+// Note about the CORRECT_LSB macro:
+// using 16bit fixed-point constants, we often compute products like:
+// p = (A*x + B*y + 32768) >> 16 by adding two sub-terms q = (A*x) >> 16
+// and r = (B*y) >> 16 together. Statistically, we have p = q + r + 1
+// in 3/4 of the cases. This can be easily seen from the relation:
+//   (a + b + 1) >> 1 = (a >> 1) + (b >> 1) + ((a|b)&1)
+// The approximation we are doing is replacing ((a|b)&1) by 1.
+// In practice, this is slightly more involved because the constants A and B
+// have also been rounded compared to their exact floating point value.
+// However, all in all the correction is quite small, and CORRECT_LSB can
+// be defined empty if needed.
+
+#define COLUMN_DCT8(in) do { \
+  LOAD(m0, (in)[0 * 8]);     \
+  LOAD(m2, (in)[2 * 8]);     \
+  LOAD(m7, (in)[7 * 8]);     \
+  LOAD(m5, (in)[5 * 8]);     \
+                             \
+  BUTTERFLY(m0, m7);         \
+  BUTTERFLY(m2, m5);         \
+                             \
+  LOAD(m3, (in)[3 * 8]);     \
+  LOAD(m4, (in)[4 * 8]);     \
+  BUTTERFLY(m3, m4);         \
+                             \
+  LOAD(m6, (in)[6 * 8]);     \
+  LOAD(m1, (in)[1 * 8]);     \
+  BUTTERFLY(m1, m6);         \
+  BUTTERFLY(m7, m4);         \
+  BUTTERFLY(m6, m5);         \
+                             \
+  /* RowIdct() needs 15bits fixed-point input, when the output from   */ \
+  /* ColumnIdct() would be 12bits. We are better doing the shift by 3 */ \
+  /* now instead of in RowIdct(), because we have some multiplies to  */ \
+  /* perform, that can take advantage of the extra 3bits precision.   */ \
+  LSHIFT(m4, 3);             \
+  LSHIFT(m5, 3);             \
+  BUTTERFLY(m4, m5);         \
+  STORE16((in)[0 * 8], m5);  \
+  STORE16((in)[4 * 8], m4);  \
+                             \
+  LSHIFT(m7, 3);             \
+  LSHIFT(m6, 3);             \
+  LSHIFT(m3, 3);             \
+  LSHIFT(m0, 3);             \
+                             \
+  LOAD_CST(m4, kTan2);       \
+  m5 = m4;                   \
+  MULT(m4, m7);              \
+  MULT(m5, m6);              \
+  SUB(m4, m6);               \
+  ADD(m5, m7);               \
+  STORE16((in)[2 * 8], m5);  \
+  STORE16((in)[6 * 8], m4);  \
+                             \
+  /* We should be multiplying m6 by C4 = 1/sqrt(2) here, but we only have */ \
+  /* the k2Sqrt2 = 1/(2.sqrt(2)) constant that fits into 15bits. So we    */ \
+  /* shift by 4 instead of 3 to compensate for the additional 1/2 factor. */ \
+  LOAD_CST(m6, k2Sqrt2);     \
+  LSHIFT(m2, 3 + 1);         \
+  LSHIFT(m1, 3 + 1);         \
+  BUTTERFLY(m1, m2);         \
+  MULT(m2, m6);              \
+  MULT(m1, m6);              \
+  BUTTERFLY(m3, m1);         \
+  BUTTERFLY(m0, m2);         \
+                             \
+  LOAD_CST(m4, kTan3m1);     \
+  LOAD_CST(m5, kTan1);       \
+  m7 = m3;                   \
+  m6 = m1;                   \
+  MULT(m3, m4);              \
+  MULT(m1, m5);              \
+                             \
+  ADD(m3, m7);               \
+  ADD(m1, m2);               \
+  CORRECT_LSB(m1);           \
+  CORRECT_LSB(m3);           \
+  MULT(m4, m0);              \
+  MULT(m5, m2);              \
+  ADD(m4, m0);               \
+  SUB(m0, m3);               \
+  ADD(m7, m4);               \
+  SUB(m5, m6);               \
+                             \
+  STORE16((in)[1 * 8], m1);  \
+  STORE16((in)[3 * 8], m0);  \
+  STORE16((in)[5 * 8], m7);  \
+  STORE16((in)[7 * 8], m5);  \
+} while (0)
+
+
+// These are the macros required by COLUMN_*
+#define LOAD_CST(dst, src) (dst) = (src)
+#define LOAD(dst, src) (dst) = (src)
+#define MULT(a, b)  (a) = (((a) * (b)) >> 16)
+#define ADD(a, b)   (a) = (a) + (b)
+#define SUB(a, b)   (a) = (a) - (b)
+#define LSHIFT(a, n) (a) = ((a) << (n))
+#define STORE16(a, b) (a) = (b)
+#define CORRECT_LSB(a) (a) += 1
+
+// DCT vertical pass
+
+inline void ColumnDct(coeff_t* in) {
+  for (int i = 0; i < 8; ++i) {  // One pass per column of the 8x8 block.
+    int m0, m1, m2, m3, m4, m5, m6, m7;  // Scratch values used by the macro.
+    COLUMN_DCT8(in + i);  // Transforms in[i], in[i + 8], ..., in place.
+  }
+}
+
+// DCT horizontal pass
+
+// We don't really need to round before descaling, since we
+// still have 4 bits of precision left as final scaled output.
+#define DESCALE(a)  static_cast<coeff_t>((a) >> 16)
+
+void RowDct(coeff_t* in, const coeff_t* table) {
+  // The Fourier transform is a unitary operator, so we're basically
+  // doing the transpose of RowIdct()
+  const int a0 = in[0] + in[7];  // a*: sums of the mirrored input pairs.
+  const int b0 = in[0] - in[7];  // b*: differences of the same pairs.
+  const int a1 = in[1] + in[6];
+  const int b1 = in[1] - in[6];
+  const int a2 = in[2] + in[5];
+  const int b2 = in[2] - in[5];
+  const int a3 = in[3] + in[4];
+  const int b3 = in[3] - in[4];
+
+  // Even part: outputs 0, 2, 4 and 6 are built from the sums only.
+  const int C2 = table[1];  // table[k - 1] is used as the constant C(k).
+  const int C4 = table[3];
+  const int C6 = table[5];
+  const int c0 = a0 + a3;
+  const int c1 = a0 - a3;
+  const int c2 = a1 + a2;
+  const int c3 = a1 - a2;
+
+  in[0] = DESCALE(C4 * (c0 + c2));  // Results overwrite the input row.
+  in[4] = DESCALE(C4 * (c0 - c2));
+  in[2] = DESCALE(C2 * c1 + C6 * c3);
+  in[6] = DESCALE(C6 * c1 - C2 * c3);
+
+  // Odd part: outputs 1, 3, 5 and 7 are built from the differences only.
+  const int C1 = table[0];
+  const int C3 = table[2];
+  const int C5 = table[4];
+  const int C7 = table[6];
+  in[1] = DESCALE(C1 * b0 + C3 * b1 + C5 * b2 + C7 * b3);
+  in[3] = DESCALE(C3 * b0 - C7 * b1 - C1 * b2 - C5 * b3);
+  in[5] = DESCALE(C5 * b0 - C1 * b1 + C7 * b2 + C3 * b3);
+  in[7] = DESCALE(C7 * b0 - C5 * b1 + C3 * b2 - C1 * b3);
+}
+#undef DESCALE
+#undef LOAD_CST
+#undef LOAD
+#undef MULT
+#undef ADD
+#undef SUB
+#undef LSHIFT
+#undef STORE16
+#undef CORRECT_LSB
+#undef kTan1
+#undef kTan2
+#undef kTan3m1
+#undef k2Sqrt2
+#undef BUTTERFLY
+#undef COLUMN_DCT8
+
+}  // namespace
+
+///////////////////////////////////////////////////////////////////////////////
+// visible FDCT callable functions
+
+void ComputeBlockDCT(coeff_t* coeffs) {
+  // Rows r and 8 - r share a table; rows 0 and 4 use the plain cosines.
+  static const coeff_t* const kRowTables[8] = {
+      kTable04, kTable17, kTable26, kTable35,
+      kTable04, kTable35, kTable26, kTable17,
+  };
+  ColumnDct(coeffs);  // Vertical pass over all 8 columns comes first.
+  for (int row = 0; row < 8; ++row) {
+    RowDct(coeffs + row * 8, kRowTables[row]);
+  }
+}
+
+}  // namespace guetzli
diff --git a/guetzli/fdct.h b/guetzli/fdct.h
new file mode 100755
index 00000000..76d9d395
--- /dev/null
+++ b/guetzli/fdct.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_FDCT_H_
+#define GUETZLI_FDCT_H_
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// Computes the DCT (Discrete Cosine Transform) of the 8x8 array in 'block',
+// scaled up by a factor of 16. The values in 'block' are laid out row-by-row
+// and the result is written to the same memory area.
+void ComputeBlockDCT(coeff_t* block);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_FDCT_H_
diff --git a/guetzli/gamma_correct.cc b/guetzli/gamma_correct.cc
new file mode 100755
index 00000000..82c17b03
--- /dev/null
+++ b/guetzli/gamma_correct.cc
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/gamma_correct.h"
+
+#include <cmath>
+
+namespace guetzli {
+
+const double* NewSrgb8ToLinearTable() {
+  // Entries below 11 use the linear segment of the sRGB transfer curve, the
+  // rest the power-law segment. The array is new[]-allocated for the caller.
+  double* const lut = new double[256];
+  for (int v = 0; v < 256; ++v) {
+    lut[v] = (v < 11) ? v / 12.92
+                      : 255.0 * std::pow(((v / 255.0) + 0.055) / 1.055, 2.4);
+  }
+  return lut;
+}
+
+const double* Srgb8ToLinearTable() {
+  // Built once, on the first call, and never freed here.
+  static const double* const table = NewSrgb8ToLinearTable();
+  return table;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/gamma_correct.h b/guetzli/gamma_correct.h
new file mode 100755
index 00000000..7dffef62
--- /dev/null
+++ b/guetzli/gamma_correct.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_GAMMA_CORRECT_H_
+#define GUETZLI_GAMMA_CORRECT_H_
+
+namespace guetzli {
+
+const double* Srgb8ToLinearTable();  // Shared 256-entry sRGB -> linear table.
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_GAMMA_CORRECT_H_
diff --git a/guetzli/guetzli.cc b/guetzli/guetzli.cc
new file mode 100755
index 00000000..48ca7000
--- /dev/null
+++ b/guetzli/guetzli.cc
@@ -0,0 +1,234 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include "gflags/gflags.h"
+#include "png.h"
+#include "guetzli/processor.h"
+#include "guetzli/quality.h"
+#include "guetzli/stats.h"
+
+
+DEFINE_bool(verbose, false,
+            "Print a verbose trace of all attempts to standard output.");
+DEFINE_double(quality, 95,
+              "Visual quality to aim for, expressed as a JPEG quality value.");
+
+namespace {
+
+inline uint8_t BlendOnBlack(const uint8_t val, const uint8_t alpha) {
+  return (val * alpha + 128) / 255;  // uint8_t operands promote to int.
+}
+
+bool ReadPNG(FILE* f, int* xsize, int* ysize,
+             std::vector<uint8_t>* rgb) {
+  // Decodes the PNG in f into packed 8-bit RGB rows in *rgb, blending alpha
+  // onto black. Returns false on libpng errors or unsupported channel counts.
+  png_structp png_ptr =
+      png_create_read_struct(PNG_LIBPNG_VER_STRING, nullptr, nullptr, nullptr);
+  if (!png_ptr) return false;
+  png_infop info_ptr = png_create_info_struct(png_ptr);
+  if (!info_ptr) {
+    png_destroy_read_struct(&png_ptr, nullptr, nullptr);
+    return false;
+  }
+  if (setjmp(png_jmpbuf(png_ptr)) != 0) {
+    // Ok we are here because of the setjmp.
+    png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+    return false;
+  }
+  rewind(f);
+  png_init_io(png_ptr, f);
+  // The png_transforms flags are as follows:
+  // packing == convert 1,2,4 bit images,
+  // strip == 16 -> 8 bits / channel, and
+  // expand == palettes -> rgb, grayscale -> 8 bit images, tRNS -> alpha.
+  const unsigned int png_transforms =
+      PNG_TRANSFORM_PACKING | PNG_TRANSFORM_EXPAND | PNG_TRANSFORM_STRIP_16;
+  png_read_png(png_ptr, info_ptr, png_transforms, nullptr);
+  png_bytep* row_pointers = png_get_rows(png_ptr, info_ptr);
+
+  *xsize = png_get_image_width(png_ptr, info_ptr);
+  *ysize = png_get_image_height(png_ptr, info_ptr);
+  // Sizes and offsets are computed in size_t: as int, 3 * width * height
+  // overflows for large images and the buffer would end up too small.
+  const size_t width = static_cast<size_t>(*xsize);
+  const size_t height = static_cast<size_t>(*ysize);
+  if (width != 0 && height > rgb->max_size() / 3 / width) {
+    png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+    return false;
+  }
+  rgb->resize(3 * width * height);
+  const int components = png_get_channels(png_ptr, info_ptr);
+  switch (components) {
+    case 1: {
+      // GRAYSCALE
+      for (size_t y = 0; y < height; ++y) {
+        const uint8_t* row_in = row_pointers[y];
+        uint8_t* row_out = rgb->data() + 3 * y * width;
+        for (size_t x = 0; x < width; ++x) {
+          const uint8_t gray = row_in[x];
+          row_out[3 * x + 0] = gray;
+          row_out[3 * x + 1] = gray;
+          row_out[3 * x + 2] = gray;
+        }
+      }
+      break;
+    }
+    case 2: {
+      // GRAYSCALE + ALPHA
+      for (size_t y = 0; y < height; ++y) {
+        const uint8_t* row_in = row_pointers[y];
+        uint8_t* row_out = rgb->data() + 3 * y * width;
+        for (size_t x = 0; x < width; ++x) {
+          const uint8_t gray = BlendOnBlack(row_in[2 * x], row_in[2 * x + 1]);
+          row_out[3 * x + 0] = gray;
+          row_out[3 * x + 1] = gray;
+          row_out[3 * x + 2] = gray;
+        }
+      }
+      break;
+    }
+    case 3: {
+      // RGB
+      for (size_t y = 0; y < height; ++y) {
+        const uint8_t* row_in = row_pointers[y];
+        uint8_t* row_out = rgb->data() + 3 * y * width;
+        memcpy(row_out, row_in, 3 * width);
+      }
+      break;
+    }
+    case 4: {
+      // RGBA
+      for (size_t y = 0; y < height; ++y) {
+        const uint8_t* row_in = row_pointers[y];
+        uint8_t* row_out = rgb->data() + 3 * y * width;
+        for (size_t x = 0; x < width; ++x) {
+          const uint8_t alpha = row_in[4 * x + 3];
+          row_out[3 * x + 0] = BlendOnBlack(row_in[4 * x + 0], alpha);
+          row_out[3 * x + 1] = BlendOnBlack(row_in[4 * x + 1], alpha);
+          row_out[3 * x + 2] = BlendOnBlack(row_in[4 * x + 2], alpha);
+        }
+      }
+      break;
+    }
+    default:
+      png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+      return false;
+  }
+  png_destroy_read_struct(&png_ptr, &info_ptr, nullptr);
+  return true;
+}
+
+std::string ReadFileOrDie(FILE* f) {
+  // Returns all of f (must be seekable); perror() + exit(1) on any failure.
+  if (fseek(f, 0, SEEK_END) != 0) {
+    perror("fseek");
+    exit(1);
+  }
+  const long size = ftell(f);  // ftell() returns long; off_t is POSIX-only.
+  if (size < 0) {
+    perror("ftell");
+    exit(1);
+  }
+  if (fseek(f, 0, SEEK_SET) != 0) {
+    perror("fseek");
+    exit(1);
+  }
+  std::string result(static_cast<size_t>(size), '\0');  // Read in place.
+  if (fread(&result[0], 1, result.size(), f) != result.size()) {
+    perror("fread");
+    exit(1);
+  }
+  return result;
+}
+
+void WriteFileOrDie(FILE* f, const std::string& contents) {
+  // Writes contents to f and closes f; on failure: perror() and exit(1).
+  const char* failed_call = nullptr;
+  if (fwrite(contents.data(), 1, contents.size(), f) != contents.size()) {
+    failed_call = "fwrite";
+  } else if (fclose(f) < 0) {
+    failed_call = "fclose";
+  }
+  if (failed_call != nullptr) { perror(failed_call); exit(1); }
+}
+
+}  // namespace
+
+int main(int argc, char** argv) {
+  gflags::SetUsageMessage("Guetzli JPEG compressor. Usage: \n"
+                          "guetzli [flags] input_filename output_filename");
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  if (argc != 3) {
+    gflags::ShowUsageWithFlags(argv[0]);
+    return 1;
+  }
+
+  FILE* fin = fopen(argv[1], "rb");
+  if (!fin) {
+    fprintf(stderr, "Can't open input file\n");
+    return 1;
+  }
+
+  std::string in_data = ReadFileOrDie(fin);
+  std::string out_data;
+
+  guetzli::Params params;
+  params.butteraugli_target =
+      guetzli::ButteraugliScoreForQuality(FLAGS_quality);
+  guetzli::ProcessStats stats;
+  if (FLAGS_verbose) {
+    stats.debug_output_file = stdout;
+  }
+
+  static const unsigned char kPNGMagicBytes[] = {
+      0x89, 'P', 'N', 'G', '\r', '\n', 0x1a, '\n',
+  };
+  if (in_data.size() >= 8 &&
+      memcmp(in_data.data(), kPNGMagicBytes, sizeof(kPNGMagicBytes)) == 0) {
+    int xsize, ysize;
+    std::vector<uint8_t> rgb;
+    if (!ReadPNG(fin, &xsize, &ysize, &rgb)) {
+      fprintf(stderr, "Error reading PNG data from input file\n");
+      return 1;
+    }
+    if (!guetzli::Process(params, &stats, rgb, xsize, ysize, &out_data)) {
+      fprintf(stderr, "Guetzli processing failed\n");
+      return 1;
+    }
+  } else {
+    if (!guetzli::Process(params, &stats, in_data, &out_data)) {
+      fprintf(stderr, "Guetzli processing failed\n");
+      return 1;
+    }
+  }
+  fclose(fin);
+
+  // Open (and thereby truncate) the output only once the result is ready, so
+  // that failures - or input == output - cannot destroy an existing file.
+  FILE* fout = fopen(argv[2], "wb");
+  if (!fout) {
+    fprintf(stderr, "Can't open output file for writing\n");
+    return 1;
+  }
+  WriteFileOrDie(fout, out_data);
+  return 0;
+}
diff --git a/guetzli/idct.cc b/guetzli/idct.cc
new file mode 100755
index 00000000..ee6bb5ea
--- /dev/null
+++ b/guetzli/idct.cc
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Integer implementation of the Inverse Discrete Cosine Transform (IDCT).
+#include "guetzli/idct.h"
+
+#include <math.h>
+#include <algorithm>
+
+namespace guetzli {
+
+// kIDCTMatrix[8*x+u] = alpha(u)*cos((2*x+1)*u*M_PI/16)*sqrt(2), with fixed 13
+// bit precision, where alpha(0) = 1/sqrt(2) and alpha(u) = 1 for u > 0.
+// Some coefficients are off by +-1 to mimic libjpeg's behaviour.
+static const int kIDCTMatrix[kDCTBlockSize] = {
+  8192,  11363,  10703,   9633,   8192,   6437,   4433,   2260,
+  8192,   9633,   4433,  -2259,  -8192, -11362, -10704,  -6436,
+  8192,   6437,  -4433, -11362,  -8192,   2261,  10704,   9633,
+  8192,   2260, -10703,  -6436,   8192,   9633,  -4433, -11363,
+  8192,  -2260, -10703,   6436,   8192,  -9633,  -4433,  11363,
+  8192,  -6437,  -4433,  11362,  -8192,  -2261,  10704,  -9633,
+  8192,  -9633,   4433,   2259,  -8192,  11362, -10704,   6436,
+  8192, -11363,  10703,  -9633,   8192,  -6437,   4433,  -2260,
+};
+
+// Computes out[x] = sum{kIDCTMatrix[8*x+u]*in[u*stride]; for u in [0..7]}
+inline void Compute1dIDCT(const coeff_t* in, const int stride, int out[8]) {
+  int tmp0, tmp1, tmp2, tmp3, tmp4;
+  // u = 0: the first matrix column is constant, so all outputs start equal.
+  tmp1 = kIDCTMatrix[0] * in[0];
+  out[0] = out[1] = out[2] = out[3] = out[4] = out[5] = out[6] = out[7] = tmp1;
+  // u = 1: odd column, so out[7 - x] receives the negated out[x] term.
+  tmp0 = in[stride];
+  tmp1 = kIDCTMatrix[ 1] * tmp0;
+  tmp2 = kIDCTMatrix[ 9] * tmp0;
+  tmp3 = kIDCTMatrix[17] * tmp0;
+  tmp4 = kIDCTMatrix[25] * tmp0;
+  out[0] += tmp1;
+  out[1] += tmp2;
+  out[2] += tmp3;
+  out[3] += tmp4;
+  out[4] -= tmp4;
+  out[5] -= tmp3;
+  out[6] -= tmp2;
+  out[7] -= tmp1;
+  // u = 2: even column, so out[7 - x] receives the same term as out[x].
+  tmp0 = in[2 * stride];
+  tmp1 = kIDCTMatrix[ 2] * tmp0;
+  tmp2 = kIDCTMatrix[10] * tmp0;
+  out[0] += tmp1;
+  out[1] += tmp2;
+  out[2] -= tmp2;
+  out[3] -= tmp1;
+  out[4] -= tmp1;
+  out[5] -= tmp2;
+  out[6] += tmp2;
+  out[7] += tmp1;
+  // u = 3: odd column, handled like u = 1.
+  tmp0 = in[3 * stride];
+  tmp1 = kIDCTMatrix[ 3] * tmp0;
+  tmp2 = kIDCTMatrix[11] * tmp0;
+  tmp3 = kIDCTMatrix[19] * tmp0;
+  tmp4 = kIDCTMatrix[27] * tmp0;
+  out[0] += tmp1;
+  out[1] += tmp2;
+  out[2] += tmp3;
+  out[3] += tmp4;
+  out[4] -= tmp4;
+  out[5] -= tmp3;
+  out[6] -= tmp2;
+  out[7] -= tmp1;
+  // u = 4: every entry is +-kIDCTMatrix[4], so a single product suffices.
+  tmp0 = in[4 * stride];
+  tmp1 = kIDCTMatrix[ 4] * tmp0;
+  out[0] += tmp1;
+  out[1] -= tmp1;
+  out[2] -= tmp1;
+  out[3] += tmp1;
+  out[4] += tmp1;
+  out[5] -= tmp1;
+  out[6] -= tmp1;
+  out[7] += tmp1;
+  // u = 5: odd column, handled like u = 1.
+  tmp0 = in[5 * stride];
+  tmp1 = kIDCTMatrix[ 5] * tmp0;
+  tmp2 = kIDCTMatrix[13] * tmp0;
+  tmp3 = kIDCTMatrix[21] * tmp0;
+  tmp4 = kIDCTMatrix[29] * tmp0;
+  out[0] += tmp1;
+  out[1] += tmp2;
+  out[2] += tmp3;
+  out[3] += tmp4;
+  out[4] -= tmp4;
+  out[5] -= tmp3;
+  out[6] -= tmp2;
+  out[7] -= tmp1;
+  // u = 6: even column, handled like u = 2.
+  tmp0 = in[6 * stride];
+  tmp1 = kIDCTMatrix[ 6] * tmp0;
+  tmp2 = kIDCTMatrix[14] * tmp0;
+  out[0] += tmp1;
+  out[1] += tmp2;
+  out[2] -= tmp2;
+  out[3] -= tmp1;
+  out[4] -= tmp1;
+  out[5] -= tmp2;
+  out[6] += tmp2;
+  out[7] += tmp1;
+  // u = 7: odd column, handled like u = 1.
+  tmp0 = in[7 * stride];
+  tmp1 = kIDCTMatrix[ 7] * tmp0;
+  tmp2 = kIDCTMatrix[15] * tmp0;
+  tmp3 = kIDCTMatrix[23] * tmp0;
+  tmp4 = kIDCTMatrix[31] * tmp0;
+  out[0] += tmp1;
+  out[1] += tmp2;
+  out[2] += tmp3;
+  out[3] += tmp4;
+  out[4] -= tmp4;
+  out[5] -= tmp3;
+  out[6] -= tmp2;
+  out[7] -= tmp1;
+}
+
+void ComputeBlockIDCT(const coeff_t* block, uint8_t* out) {
+  // Pass 1: 1-D IDCT down each column, rounded and scaled down by 11 bits.
+  const int kColShift = 11, kColBias = 1 << (kColShift - 1);
+  coeff_t columns[kDCTBlockSize];
+  for (int col = 0; col < 8; ++col) {
+    int acc[8] = { 0 };
+    Compute1dIDCT(block + col, 8, acc);
+    for (int row = 0; row < 8; ++row) {
+      columns[row * 8 + col] = (acc[row] + kColBias) >> kColShift;
+    }
+  }
+  // Pass 2: 1-D IDCT along each row. The bias, 257 << 17, is 128.5 in units
+  // of 1 << 18: one half for rounding plus the offset by 128.
+  const int kRowShift = 18, kRowBias = 257 << (kRowShift - 1);
+  for (int row = 0; row < 8; ++row) {
+    int acc[8] = { 0 };
+    Compute1dIDCT(columns + row * 8, 1, acc);
+    for (int col = 0; col < 8; ++col) {
+      const int sample = (acc[col] + kRowBias) >> kRowShift;
+      out[row * 8 + col] = std::max(0, std::min(255, sample));
+    }
+  }
+}
+
+}  // namespace guetzli
diff --git a/guetzli/idct.h b/guetzli/idct.h
new file mode 100755
index 00000000..3e6c436d
--- /dev/null
+++ b/guetzli/idct.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_IDCT_H_
+#define GUETZLI_IDCT_H_
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// Fills in 'result' with the inverse DCT of 'block'.
+// The arguments 'block' and 'result' point to 8x8 arrays that are arranged in
+// a row-by-row memory layout.
+void ComputeBlockIDCT(const coeff_t* block, uint8_t* result);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_IDCT_H_
diff --git a/guetzli/jpeg_bit_writer.h b/guetzli/jpeg_bit_writer.h
new file mode 100755
index 00000000..3a531376
--- /dev/null
+++ b/guetzli/jpeg_bit_writer.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_JPEG_BIT_WRITER_H_
+#define GUETZLI_JPEG_BIT_WRITER_H_
+
+#include <stdint.h>
+#include <memory>
+
+namespace guetzli {
+
+// Returns non-zero if and only if x has a zero byte, i.e. one of
+// x & 0xff, x & 0xff00, ..., x & 0xff00000000000000 is zero.
+inline uint64_t HasZeroByte(uint64_t x) {
+  return (x - 0x0101010101010101ULL) & ~x & 0x8080808080808080ULL;
+}
+
+// Handles the packing of bits into output bytes.
+// Bits are collected MSB-first in put_buffer and flushed to data[0..len);
+// bytes that do not fit are dropped and recorded in the overflow flag.
+struct BitWriter {
+  explicit BitWriter(size_t length) : len(length),
+                                      data(new uint8_t[len]),
+                                      pos(0),
+                                      put_buffer(0),
+                                      put_bits(64),
+                                      overflow(false) {}
+
+  // Appends the nbits low-order bits of 'bits' (no higher bits may be set).
+  // NOTE(review): nbits presumably never exceeds put_bits - confirm callers.
+  void WriteBits(int nbits, uint64_t bits) {
+    if (nbits == 0) return;  // Avoids an undefined 64-bit shift when empty.
+    put_bits -= nbits;
+    put_buffer |= (bits << put_bits);
+    if (put_bits <= 16) {
+      // At this point we are ready to emit the most significant 6 bytes of
+      // put_buffer to the output.
+      // The JPEG format requires that after every 0xff byte in the entropy
+      // coded section, there is a zero byte, therefore we first check if any of
+      // the 6 most significant bytes of put_buffer is 0xff.
+      if (HasZeroByte(~put_buffer | 0xffff)) {
+        // We have a 0xff byte somewhere, examine each byte and append a zero
+        // byte if necessary.
+        for (int shift = 56; shift >= 16; shift -= 8) {
+          EmitByte((put_buffer >> shift) & 0xff);
+        }
+      } else if (static_cast<size_t>(pos) + 6 <= len) {
+        // We don't have any 0xff bytes, output all 6 bytes without checking.
+        // Exactly 6 bytes are stored, so pos + 6 == len still fits.
+        for (int shift = 56; shift >= 16; shift -= 8) {
+          data[pos++] = (put_buffer >> shift) & 0xff;
+        }
+      } else {
+        overflow = true;
+      }
+      put_buffer <<= 48;
+      put_bits += 48;
+    }
+  }
+
+  // Writes the given byte to the output, writes an extra zero if byte is 0xff.
+  void EmitByte(int byte) {
+    if (static_cast<size_t>(pos) < len) {
+      data[pos++] = byte;
+    } else {
+      overflow = true;
+    }
+    if (byte == 0xff) {
+      EmitByte(0);
+    }
+  }
+
+  // Flushes all pending bits, padding the last partial byte with 1-bits.
+  void JumpToByteBoundary() {
+    while (put_bits <= 56) {
+      int c = (put_buffer >> 56) & 0xff;
+      EmitByte(c);
+      put_buffer <<= 8;
+      put_bits += 8;
+    }
+    if (put_bits < 64) {
+      int padmask = 0xff >> (64 - put_bits);
+      int c = ((put_buffer >> 56) & ~padmask) | padmask;
+      EmitByte(c);
+    }
+    put_buffer = 0;
+    put_bits = 64;
+  }
+
+  size_t len;                       // Capacity of data, in bytes.
+  std::unique_ptr<uint8_t[]> data;  // Output buffer.
+  int pos;                          // Number of bytes written so far.
+  uint64_t put_buffer;              // Pending bits, most significant first.
+  int put_bits;                     // Free bits left in put_buffer.
+  bool overflow;                    // Set once any write did not fit.
+};
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_BIT_WRITER_H_
diff --git a/guetzli/jpeg_data.cc b/guetzli/jpeg_data.cc
new file mode 100755
index 00000000..587251e4
--- /dev/null
+++ b/guetzli/jpeg_data.cc
@@ -0,0 +1,104 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/jpeg_data.h"
+
+#include <assert.h>
+#include <string.h>
+
+namespace guetzli {
+
+bool JPEGData::Is420() const {
+  // 4:2:0 layout: three components, the first sampled 2x2, the others 1x1.
+  if (components.size() != 3) return false;
+  if (max_h_samp_factor != 2 || max_v_samp_factor != 2) return false;
+  const JPEGComponent& c0 = components[0];
+  const JPEGComponent& c1 = components[1];
+  const JPEGComponent& c2 = components[2];
+  return c0.h_samp_factor == 2 && c0.v_samp_factor == 2 &&
+         c1.h_samp_factor == 1 && c1.v_samp_factor == 1 &&
+         c2.h_samp_factor == 1 && c2.v_samp_factor == 1;
+}
+
+bool JPEGData::Is444() const {
+  // 4:4:4 layout: three components, none of them subsampled (all 1x1).
+  if (components.size() != 3) return false;
+  if (max_h_samp_factor != 1 || max_v_samp_factor != 1) return false;
+  const JPEGComponent& c0 = components[0];
+  const JPEGComponent& c1 = components[1];
+  const JPEGComponent& c2 = components[2];
+  return c0.h_samp_factor == 1 && c0.v_samp_factor == 1 &&
+         c1.h_samp_factor == 1 && c1.v_samp_factor == 1 &&
+         c2.h_samp_factor == 1 && c2.v_samp_factor == 1;
+}
+
+void InitJPEGDataForYUV444(int w, int h, JPEGData* jpg) {
+  // With 1x1 sampling everywhere, the MCU grid equals the 8x8 block grid.
+  const int blocks_per_row = (w + 7) >> 3;
+  const int blocks_per_col = (h + 7) >> 3;
+  jpg->width = w;
+  jpg->height = h;
+  jpg->max_h_samp_factor = jpg->max_v_samp_factor = 1;
+  jpg->MCU_rows = blocks_per_col;
+  jpg->MCU_cols = blocks_per_row;
+  jpg->quant.resize(3);
+  jpg->components.resize(3);
+  int idx = 0;
+  for (JPEGComponent& comp : jpg->components) {
+    comp.id = comp.quant_idx = idx++;
+    comp.h_samp_factor = comp.v_samp_factor = 1;
+    comp.width_in_blocks = blocks_per_row;
+    comp.height_in_blocks = blocks_per_col;
+    comp.num_blocks = blocks_per_row * blocks_per_col;
+    comp.coeffs.resize(comp.num_blocks * kDCTBlockSize);
+  }
+}
+
+void SaveQuantTables(const int q[3][kDCTBlockSize], JPEGData* jpg) {
+  // Rebuilds jpg->quant from q, where q[i] is the table of component i.
+  // Identical tables are stored only once, and each component's quant_idx is
+  // set to the position of its table in jpg->quant.
+  // q has only three rows; never index past them, even in NDEBUG builds.
+  assert(jpg->components.size() <= 3);
+  const size_t kTableSize = kDCTBlockSize * sizeof(q[0][0]);
+  jpg->quant.clear();
+  for (size_t i = 0; i < jpg->components.size() && i < 3; ++i) {
+    JPEGComponent* comp = &jpg->components[i];
+    // Look for an already stored table with exactly these values.
+    size_t idx = 0;
+    while (idx < jpg->quant.size() &&
+           memcmp(&q[i][0], &jpg->quant[idx].values[0], kTableSize) != 0) {
+      ++idx;
+    }
+    if (idx == jpg->quant.size()) {
+      // Not stored yet: append a new table built from q[i].
+      JPEGQuantTable table;
+      memcpy(&table.values[0], &q[i][0], kTableSize);
+      table.precision = 0;
+      for (int k = 0; k < kDCTBlockSize; ++k) {
+        assert(table.values[k] >= 0);
+        assert(table.values[k] < (1 << 16));
+        // precision 1 flags a table with values that do not fit in 8 bits.
+        if (table.values[k] > 0xff) table.precision = 1;
+      }
+      table.index = static_cast<int>(idx);
+      jpg->quant.push_back(table);
+    }
+    comp->quant_idx = static_cast<int>(idx);
+  }
+}
+
+}  // namespace guetzli
diff --git a/guetzli/jpeg_data.h b/guetzli/jpeg_data.h
new file mode 100755
index 00000000..5623e51e
--- /dev/null
+++ b/guetzli/jpeg_data.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Data structures that represent the contents of a jpeg file.
+
+#ifndef GUETZLI_JPEG_DATA_H_
+#define GUETZLI_JPEG_DATA_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string>
+#include <vector>
+
+#include "guetzli/jpeg_error.h"
+
+namespace guetzli {
+
+static const int kDCTBlockSize = 64;  // coefficients per 8x8 block
+static const int kMaxComponents = 4;  // upper bound for the SOF component count
+static const int kMaxQuantTables = 4;
+static const int kMaxHuffmanTables = 4;
+static const int kJpegHuffmanMaxBitLength = 16;  // longest code length in bits
+static const int kJpegHuffmanAlphabetSize = 256;  // max symbols in an AC code
+static const int kJpegDCAlphabetSize = 12;  // max symbols in a DC code
+static const int kMaxDHTMarkers = 512;
+
+static const uint8_t kDefaultQuantMatrix[2][64] = {  // two 64-entry tables
+  { 16,  11,  10,  16,  24,  40,  51,  61,  // [0]: presumably luma; confirm
+    12,  12,  14,  19,  26,  58,  60,  55,
+    14,  13,  16,  24,  40,  57,  69,  56,
+    14,  17,  22,  29,  51,  87,  80,  62,
+    18,  22,  37,  56,  68, 109, 103,  77,
+    24,  35,  55,  64,  81, 104, 113,  92,
+    49,  64,  78,  87, 103, 121, 120, 101,
+    72,  92,  95,  98, 112, 100, 103,  99 },
+  { 17,  18,  24,  47,  99,  99,  99,  99,  // [1]: presumably chroma; confirm
+    18,  21,  26,  66,  99,  99,  99,  99,
+    24,  26,  56,  99,  99,  99,  99,  99,
+    47,  66,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99,
+    99,  99,  99,  99,  99,  99,  99,  99 }
+};
+
+const int kJPEGNaturalOrder[80] = {  // zig-zag position -> row-major index
+  0,   1,  8, 16,  9,  2,  3, 10,
+  17, 24, 32, 25, 18, 11,  4,  5,
+  12, 19, 26, 33, 40, 48, 41, 34,
+  27, 20, 13,  6,  7, 14, 21, 28,
+  35, 42, 49, 56, 57, 50, 43, 36,
+  29, 22, 15, 23, 30, 37, 44, 51,
+  58, 59, 52, 45, 38, 31, 39, 46,
+  53, 60, 61, 54, 47, 55, 62, 63,
+  // extra entries for safety in decoder: positions 64..79 all map to 63
+  63, 63, 63, 63, 63, 63, 63, 63,
+  63, 63, 63, 63, 63, 63, 63, 63
+};
+
+const int kJPEGZigZagOrder[64] = {  // row-major index -> zig-zag position
+  0,   1,  5,  6, 14, 15, 27, 28,
+  2,   4,  7, 13, 16, 26, 29, 42,
+  3,   8, 12, 17, 25, 30, 41, 43,
+  9,  11, 18, 24, 31, 40, 44, 53,
+  10, 19, 23, 32, 39, 45, 52, 54,
+  20, 22, 33, 38, 46, 51, 55, 60,
+  21, 34, 37, 47, 50, 56, 59, 61,
+  35, 36, 48, 49, 57, 58, 62, 63
+};
+
+// Quantization values for an 8x8 pixel block.
+struct JPEGQuantTable {
+  JPEGQuantTable() : values(kDCTBlockSize), precision(0),
+                     index(0), is_last(true) {}
+
+  std::vector<int> values;  // kDCTBlockSize entries
+  int precision;  // SaveQuantTables stores 1 if any value exceeds 0xff, else 0
+  // The index of this quantization table as it was parsed from the input JPEG.
+  // Each DQT marker segment contains an 'index' field, and we save this index
+  // here. Valid values are 0 to 3.
+  int index;
+  // Set to true if this table is the last one within its marker segment.
+  bool is_last;
+};
+
+// Huffman code and decoding lookup table used for DC and AC coefficients.
+struct JPEGHuffmanCode {
+  JPEGHuffmanCode() : counts(kJpegHuffmanMaxBitLength + 1),
+                      values(kJpegHuffmanAlphabetSize + 1),
+                      slot_id(0),
+                      is_last(true) {}
+
+  // Bit length histogram: counts[i] is the number of codes of length i.
+  std::vector<int> counts;
+  // Symbol values sorted by increasing bit lengths.
+  std::vector<int> values;  // one spare slot for the symbol ProcessDHT appends
+  // The index of the Huffman code in the current set of Huffman codes. For AC
+  // component Huffman codes, 0x10 is added to the index.
+  int slot_id;
+  // Set to true if this Huffman code is the last one within its marker segment.
+  bool is_last;
+};
+
+// Huffman table indexes used for one component of one scan.
+struct JPEGComponentScanInfo {
+  int comp_idx;    // position of the component in JPEGData::components
+  int dc_tbl_idx;  // DC Huffman table slot; ProcessSOS accepts 0..3
+  int ac_tbl_idx;  // AC Huffman table slot; ProcessSOS accepts 0..3
+};
+
+// Contains information that is used in one scan.
+struct JPEGScanInfo {
+  // Parameters used for progressive scans (named the same way as in the spec):
+  //   Ss : Start of spectral band in zig-zag sequence.
+  //   Se : End of spectral band in zig-zag sequence.
+  //   Ah : Successive approximation bit position, high.
+  //   Al : Successive approximation bit position, low.
+  int Ss;  // ProcessSOS accepts 0..63
+  int Se;  // ProcessSOS accepts Ss..63
+  int Ah;  // high nibble of the byte that also carries Al
+  int Al;  // low nibble of that byte
+  std::vector<JPEGComponentScanInfo> components;  // one per component in scan
+};
+
+typedef int16_t coeff_t;
+
+// Represents one component of a jpeg file.
+struct JPEGComponent {
+  // Gives every scalar member a defined value. num_blocks is included because
+  // ProcessSOF only assigns it when reading in JPEG_READ_ALL mode.
+  JPEGComponent() : id(0), h_samp_factor(1), v_samp_factor(1), quant_idx(0),
+                    width_in_blocks(0), height_in_blocks(0), num_blocks(0) {}
+
+  // One-byte id of the component.
+  int id;
+  // Horizontal and vertical sampling factors.
+  // In interleaved mode, each minimal coded unit (MCU) has
+  // h_samp_factor x v_samp_factor DCT blocks from this component.
+  int h_samp_factor;
+  int v_samp_factor;
+  // The index of the quantization table used for this component.
+  int quant_idx;
+  // The dimensions of the component measured in 8x8 blocks.
+  int width_in_blocks;
+  int height_in_blocks;
+  // Total number of blocks, i.e. width_in_blocks * height_in_blocks.
+  int num_blocks;
+  // The DCT coefficients of this component, laid out block-by-block, divided
+  // through the quantization matrix values.
+  // Sized to num_blocks * kDCTBlockSize entries by the code that fills it.
+  std::vector<coeff_t> coeffs;
+};
+
+// Represents a parsed jpeg file.
+struct JPEGData {
+  JPEGData() : width(0),
+               height(0),
+               version(0),
+               max_h_samp_factor(1),
+               max_v_samp_factor(1),
+               MCU_rows(0),
+               MCU_cols(0),
+               restart_interval(0),
+               original_jpg(NULL),
+               original_jpg_size(0),
+               error(JPEG_OK) {}
+
+  bool Is420() const;  // three components sampled 2x2, 1x1, 1x1
+  bool Is444() const;  // three components, all sampled 1x1
+
+  int width;   // image width in pixels; 0 until a SOF segment was parsed
+  int height;  // image height in pixels
+  int version;
+  int max_h_samp_factor;  // largest h_samp_factor over all components
+  int max_v_samp_factor;  // largest v_samp_factor over all components
+  int MCU_rows;  // image height in MCUs, rounded up
+  int MCU_cols;  // image width in MCUs, rounded up
+  int restart_interval;
+  std::vector<std::string> app_data;  // each entry starts with its marker byte
+  std::vector<std::string> com_data;
+  std::vector<JPEGQuantTable> quant;
+  std::vector<JPEGHuffmanCode> huffman_code;  // appended to by ProcessDHT
+  std::vector<JPEGComponent> components;
+  std::vector<JPEGScanInfo> scan_info;  // one entry appended per SOS segment
+  std::vector<uint8_t> marker_order;
+  std::vector<std::string> inter_marker_data;
+  std::string tail_data;
+  const uint8_t* original_jpg;
+  size_t original_jpg_size;
+  JPEGReadError error;  // JPEG_OK unless a reader reported a failure
+};
+
+void InitJPEGDataForYUV444(int w, int h, JPEGData* jpg);
+void SaveQuantTables(const int q[3][kDCTBlockSize], JPEGData* jpg);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_DATA_H_
diff --git a/guetzli/jpeg_data_decoder.cc b/guetzli/jpeg_data_decoder.cc
new file mode 100755
index 00000000..98f9f4cc
--- /dev/null
+++ b/guetzli/jpeg_data_decoder.cc
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/jpeg_data_decoder.h"
+
+#include "guetzli/output_image.h"
+
+namespace guetzli {
+
+// Mimic libjpeg's heuristics to guess jpeg color space.
+// Requires that the jpg has 3 components.
+bool HasYCbCrColorSpace(const JPEGData& jpg) {
+  // Transform byte (offset 14) of the last long-enough Adobe (0xee) segment,
+  // or -1 if no such segment was seen.
+  int adobe_transform = -1;
+  for (const std::string& segment : jpg.app_data) {
+    const uint8_t marker = static_cast<uint8_t>(segment[0]);
+    // An APP0 (0xe0) segment settles it: the image is treated as YCbCr.
+    if (marker == 0xe0) return true;
+    if (marker == 0xee && segment.size() >= 15) {
+      adobe_transform = static_cast<uint8_t>(segment[14]);
+    }
+  }
+  if (adobe_transform >= 0) return adobe_transform != 0;
+  // No marker hint: only the component ids 'R', 'G', 'B' indicate RGB.
+  const std::vector<JPEGComponent>& comps = jpg.components;
+  const bool ids_spell_rgb =
+      comps[0].id == 'R' && comps[1].id == 'G' && comps[2].id == 'B';
+  return !ids_spell_rgb;
+}
+
+std::vector<uint8_t> DecodeJpegToRGB(const JPEGData& jpg) {
+  const size_t num_comps = jpg.components.size();
+  // Three-component input must be YCbCr with 4:2:0 or 4:4:4 sampling.
+  const bool supported = num_comps == 1 || (num_comps == 3 &&
+      HasYCbCrColorSpace(jpg) && (jpg.Is420() || jpg.Is444()));
+  if (!supported) return std::vector<uint8_t>();
+  OutputImage img(jpg.width, jpg.height);
+  img.CopyFromJpegData(jpg);
+  return img.ToSRGB();
+}
+
+}  // namespace guetzli
diff --git a/guetzli/jpeg_data_decoder.h b/guetzli/jpeg_data_decoder.h
new file mode 100755
index 00000000..11679dcb
--- /dev/null
+++ b/guetzli/jpeg_data_decoder.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Library to decode jpeg coefficients into an RGB image.
+
+#ifndef GUETZLI_JPEG_DATA_DECODER_H_
+#define GUETZLI_JPEG_DATA_DECODER_H_
+
+#include <stdint.h>
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// Decodes the parsed jpeg coefficients into an RGB image.
+// There can be only either 1 or 3 image components, in either case, an RGB
+// output image will be generated.
+// Only YUV420 and YUV444 sampling factors are supported.
+// The vector is empty if the components or sampling factors are unsupported.
+std::vector<uint8_t> DecodeJpegToRGB(const JPEGData& jpg);
+
+// Mimic libjpeg's heuristics to guess jpeg color space.
+// Requires that the jpg has 3 components.
+bool HasYCbCrColorSpace(const JPEGData& jpg);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_DATA_DECODER_H_
diff --git a/guetzli/jpeg_data_encoder.cc b/guetzli/jpeg_data_encoder.cc
new file mode 100755
index 00000000..ce7f685a
--- /dev/null
+++ b/guetzli/jpeg_data_encoder.cc
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/jpeg_data_encoder.h"
+
+#include <string.h>
+
+#include "guetzli/fdct.h"
+
+namespace guetzli {
+
+namespace {
+
+static const int kIQuantBits = 16;  // scale (as a power of two) of iquant
+// Output of the DCT is upscaled by 16.
+static const int kDCTBits = kIQuantBits + 4;  // right shift used by Quantize
+static const int kBias = 0x80 << (kDCTBits - 8);  // 1 << (kDCTBits - 1)
+
+void Quantize(coeff_t* v, int iquant) {
+  *v = static_cast<coeff_t>((*v * iquant + kBias) >> kDCTBits);
+}
+
+// Single pixel rgb to 16-bit yuv conversion.
+// The returned yuv values are signed integers in the
+// range [-128, 127] inclusive.
+inline static void RGBToYUV16(const uint8_t* const rgb, coeff_t *out) {
+  // Fixed-point arithmetic with kFrac fractional bits; kHalf is one half.
+  const int kFrac = 16;
+  const int kHalf = 1 << (kFrac - 1);
+  const int r = rgb[0], g = rgb[1], b = rgb[2];
+  // The three results are stored at offsets 0, 64 and 128 of out.
+  out[0] = (19595 * r + 38469 * g + 7471 * b - (128 << 16) + kHalf) >> kFrac;
+  out[64] = (-11059 * r - 21709 * g + 32768 * b + kHalf - 1) >> kFrac;
+  out[128] = (32768 * r - 27439 * g - 5329 * b + kHalf - 1) >> kFrac;
+}
+
+}  // namespace
+
+void AddApp0Data(JPEGData* jpg) {
+  // Bytes appended as one app_data entry; the first byte is the marker 0xe0.
+  static const unsigned char kJfifSegment[] = {
+      0xe0, 0x00, 0x10,              // APP0
+      0x4a, 0x46, 0x49, 0x46, 0x00,  // 'JFIF'
+      0x01, 0x01,                    // v1.01
+      0x00, 0x00, 0x01, 0x00, 0x01,  // aspect ratio = 1:1
+      0x00, 0x00                     // thumbnail width/height
+  };
+  const char* bytes = reinterpret_cast<const char*>(kJfifSegment);
+  jpg->app_data.push_back(std::string(bytes, sizeof(kJfifSegment)));
+}
+
+bool EncodeRGBToJpeg(const std::vector<uint8_t>& rgb, int w, int h,
+                     const int* quant, JPEGData* jpg) {
+  // Fills *jpg with the YUV444 DCT coefficients of the interleaved 8-bit RGB
+  // image `rgb` (w x h pixels), quantized with the 3 * kDCTBlockSize values in
+  // `quant`. Returns false, leaving *jpg untouched, on invalid arguments.
+  if (w < 0 || w >= 1 << 16 || h < 0 || h >= 1 << 16) return false;
+  // Size check in size_t: 3 * w * h overflows int for large images.
+  if (rgb.size() != static_cast<size_t>(3) * w * h) return false;
+  // A zero entry would divide by zero below; negative ones are meaningless.
+  for (int i = 0; i < 3 * kDCTBlockSize; ++i) {
+    if (quant[i] <= 0) return false;
+  }
+  InitJPEGDataForYUV444(w, h, jpg);
+  AddApp0Data(jpg);
+
+  // Store the tables and precompute the scaled reciprocals used by Quantize.
+  int iquant[3 * kDCTBlockSize];
+  for (int i = 0; i < 3 * kDCTBlockSize; ++i) {
+    jpg->quant[i / kDCTBlockSize].values[i % kDCTBlockSize] = quant[i];
+    iquant[i] = ((1 << kIQuantBits) + 1) / quant[i];
+  }
+
+  // Compute YUV444 DCT coefficients.
+  size_t block_ix = 0;
+  for (int block_y = 0; block_y < jpg->MCU_rows; ++block_y) {
+    for (int block_x = 0; block_x < jpg->MCU_cols; ++block_x) {
+      coeff_t block[3 * kDCTBlockSize];
+      // RGB->YUV transform; coordinates are clamped to the image edge.
+      for (int iy = 0; iy < 8; ++iy) {
+        for (int ix = 0; ix < 8; ++ix) {
+          int y = std::min(h - 1, 8 * block_y + iy);
+          int x = std::min(w - 1, 8 * block_x + ix);
+          // size_t index: 3 * (y * w + x) overflows int for large images.
+          const size_t p = static_cast<size_t>(y) * w + x;
+          RGBToYUV16(&rgb[3 * p], &block[8 * iy + ix]);
+        }
+      }
+      // DCT
+      for (int i = 0; i < 3; ++i) ComputeBlockDCT(&block[i * kDCTBlockSize]);
+      // Quantization
+      for (int i = 0; i < 3 * 64; ++i) Quantize(&block[i], iquant[i]);
+      // Copy the resulting coefficients to *jpg.
+      for (int i = 0; i < 3; ++i) {
+        memcpy(&jpg->components[i].coeffs[block_ix * kDCTBlockSize],
+               &block[i * kDCTBlockSize], kDCTBlockSize * sizeof(block[0]));
+      }
+      ++block_ix;
+    }
+  }
+
+  return true;
+}
+
+bool EncodeRGBToJpeg(const std::vector<uint8_t>& rgb, int w, int h,
+                     JPEGData* jpg) {
+  static const int quant[3 * kDCTBlockSize] = {  // every quantizer is 1
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  };
+  return EncodeRGBToJpeg(rgb, w, h, quant, jpg);  // overload taking a table
+}
+
+}  // namespace guetzli
diff --git a/guetzli/jpeg_data_encoder.h b/guetzli/jpeg_data_encoder.h
new file mode 100755
index 00000000..06b33157
--- /dev/null
+++ b/guetzli/jpeg_data_encoder.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_JPEG_DATA_ENCODER_H_
+#define GUETZLI_JPEG_DATA_ENCODER_H_
+
+#include <stdint.h>
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+
+// Adds APP0 header data.
+void AddApp0Data(JPEGData* jpg);
+
+// Creates a JPEG from the rgb pixel data. Returns true on success.
+bool EncodeRGBToJpeg(const std::vector<uint8_t>& rgb, int w, int h,
+                     JPEGData* jpg);
+
+// Creates a JPEG from the rgb pixel data. Returns true on success. The given
+// quantization table must have 3 * kDCTBlockSize values.
+bool EncodeRGBToJpeg(const std::vector<uint8_t>& rgb, int w, int h,
+                     const int* quant, JPEGData* jpg);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_DATA_ENCODER_H_
diff --git a/guetzli/jpeg_data_reader.cc b/guetzli/jpeg_data_reader.cc
new file mode 100755
index 00000000..44f55ece
--- /dev/null
+++ b/guetzli/jpeg_data_reader.cc
@@ -0,0 +1,1075 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/jpeg_data_reader.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#include "guetzli/jpeg_huffman_decode.h"
+
+namespace guetzli {
+
+namespace {
+
+// Validation macros: on failure, log to stderr, set jpg->error, return false.
+
+#define VERIFY_LEN(n)                                                   \
+  if (*pos + (n) > len) {                                               \
+    fprintf(stderr, "Unexpected end of input: pos=%d need=%d len=%d\n", \
+            static_cast<int>(*pos), static_cast<int>(n),                \
+            static_cast<int>(len));                                     \
+    jpg->error = JPEG_UNEXPECTED_EOF;                                   \
+    return false;                                                       \
+  }
+
+#define VERIFY_INPUT(var, low, high, code)                              \
+  if (var < low || var > high) {                                        \
+    fprintf(stderr, "Invalid %s: %d\n", #var, static_cast<int>(var));   \
+    jpg->error = JPEG_INVALID_ ## code;                                 \
+        return false;                                                   \
+  }
+
+#define VERIFY_MARKER_END()                                             \
+  if (start_pos + marker_len != *pos) {                                 \
+    fprintf(stderr, "Invalid marker length: declared=%d actual=%d\n",   \
+            static_cast<int>(marker_len),                               \
+            static_cast<int>(*pos - start_pos));                        \
+    jpg->error = JPEG_WRONG_MARKER_SIZE;                                \
+    return false;                                                       \
+  }
+
+#define EXPECT_MARKER() \
+  if (pos + 2 > len || data[pos] != 0xff) {                             \
+    fprintf(stderr, "Marker byte (0xff) expected, found: %d "           \
+            "pos=%d len=%d\n",                                          \
+            (pos < len ? data[pos] : 0), static_cast<int>(pos),         \
+            static_cast<int>(len));                                     \
+    jpg->error = JPEG_MARKER_BYTE_NOT_FOUND;                            \
+    return false;                                                       \
+  }
+
+// Returns ceil(a/b).
+inline int DivCeil(int a, int b) {
+  return (a + (b - 1)) / b;
+}
+
+inline int ReadUint8(const uint8_t* data, size_t* pos) {
+  return static_cast<int>(*(data + (*pos)++));
+}
+
+inline int ReadUint16(const uint8_t* data, size_t* pos) {
+  const uint8_t* bytes = data + *pos;  // big-endian: high byte first
+  *pos += 2;
+  return (bytes[0] << 8) | bytes[1];
+}
+
+// Parses the Start of Frame (SOF) segment at data[*pos] into *jpg, advancing
+// *pos past it. On failure sets jpg->error and returns false.
+bool ProcessSOF(const uint8_t* data, const size_t len,
+                JpegReadMode mode, size_t* pos, JPEGData* jpg) {
+  if (jpg->width != 0) {  // an earlier SOF already set the dimensions
+    fprintf(stderr, "Duplicate SOF marker.\n");
+    jpg->error = JPEG_DUPLICATE_SOF;
+    return false;
+  }
+  const size_t start_pos = *pos;
+  VERIFY_LEN(8);  // the five fields read below take 2 + 1 + 2 + 2 + 1 bytes
+  size_t marker_len = ReadUint16(data, pos);
+  int precision = ReadUint8(data, pos);
+  int height = ReadUint16(data, pos);
+  int width = ReadUint16(data, pos);
+  int num_components = ReadUint8(data, pos);
+  VERIFY_INPUT(precision, 8, 8, PRECISION);  // only a precision of 8 passes
+  VERIFY_INPUT(height, 1, 65535, HEIGHT);
+  VERIFY_INPUT(width, 1, 65535, WIDTH);
+  VERIFY_INPUT(num_components, 1, kMaxComponents, NUMCOMP);
+  VERIFY_LEN(3 * num_components);  // three bytes are read per component
+  jpg->height = height;
+  jpg->width = width;
+  jpg->components.resize(num_components);
+
+  // Read sampling factors and quant table index for each component.
+  std::vector<bool> ids_seen(256, false);
+  for (int i = 0; i < jpg->components.size(); ++i) {
+    const int id = ReadUint8(data, pos);
+    if (ids_seen[id]) {   // (cf. section B.2.2, syntax of Ci)
+      fprintf(stderr, "Duplicate ID %d in SOF.\n", id);
+      jpg->error = JPEG_DUPLICATE_COMPONENT_ID;
+      return false;
+    }
+    ids_seen[id] = true;
+    jpg->components[i].id = id;
+    int factor = ReadUint8(data, pos);
+    int h_samp_factor = factor >> 4;   // high nibble
+    int v_samp_factor = factor & 0xf;  // low nibble
+    VERIFY_INPUT(h_samp_factor, 1, 15, SAMP_FACTOR);
+    VERIFY_INPUT(v_samp_factor, 1, 15, SAMP_FACTOR);
+    jpg->components[i].h_samp_factor = h_samp_factor;
+    jpg->components[i].v_samp_factor = v_samp_factor;
+    jpg->components[i].quant_idx = ReadUint8(data, pos);
+    jpg->max_h_samp_factor = std::max(jpg->max_h_samp_factor, h_samp_factor);
+    jpg->max_v_samp_factor = std::max(jpg->max_v_samp_factor, v_samp_factor);
+  }
+
+  // We have checked above that none of the sampling factors are 0, so the max
+  // sampling factors can not be 0.
+  jpg->MCU_rows = DivCeil(jpg->height, jpg->max_v_samp_factor * 8);
+  jpg->MCU_cols = DivCeil(jpg->width, jpg->max_h_samp_factor * 8);
+  // Compute the block dimensions for each component.
+  if (mode == JPEG_READ_ALL) {  // block counts and coeffs are set only here
+    for (int i = 0; i < jpg->components.size(); ++i) {
+      JPEGComponent* c = &jpg->components[i];
+      if (jpg->max_h_samp_factor % c->h_samp_factor != 0 ||
+          jpg->max_v_samp_factor % c->v_samp_factor != 0) {
+        fprintf(stderr, "Non-integral subsampling ratios.\n");
+        jpg->error = JPEG_INVALID_SAMPLING_FACTORS;
+        return false;
+      }
+      c->width_in_blocks = jpg->MCU_cols * c->h_samp_factor;
+      c->height_in_blocks = jpg->MCU_rows * c->v_samp_factor;
+      const uint64_t num_blocks =
+          static_cast<uint64_t>(c->width_in_blocks) * c->height_in_blocks;
+      if (num_blocks > (1ull << 21)) {
+        // Refuse to allocate more than 1 GB of memory for the coefficients,
+        // that is 2M blocks x 64 coeffs x 2 bytes per coeff x max 4 components.
+        // TODO(user) Add this limit to a GuetzliParams struct.
+        fprintf(stderr, "Image too large.\n");
+        jpg->error = JPEG_IMAGE_TOO_LARGE;
+        return false;
+      }
+      c->num_blocks = static_cast<int>(num_blocks);
+      c->coeffs.resize(c->num_blocks * kDCTBlockSize);
+    }
+  }
+  VERIFY_MARKER_END();  // marker_len must equal the bytes consumed
+  return true;
+}
+
+// Parses the Start of Scan (SOS) segment at data[*pos] and appends the result
+// to jpg->scan_info. On failure sets jpg->error and returns false.
+bool ProcessSOS(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  VERIFY_LEN(3);  // 2-byte segment length plus the component count
+  size_t marker_len = ReadUint16(data, pos);
+  int comps_in_scan = ReadUint8(data, pos);
+  VERIFY_INPUT(comps_in_scan, 1, jpg->components.size(), COMPS_IN_SCAN);
+
+  JPEGScanInfo scan_info;
+  scan_info.components.resize(comps_in_scan);
+  VERIFY_LEN(2 * comps_in_scan);  // two bytes are read per scan component
+  std::vector<bool> ids_seen(256, false);
+  for (int i = 0; i < comps_in_scan; ++i) {
+    int id = ReadUint8(data, pos);
+    if (ids_seen[id]) {   // (cf. section B.2.3, regarding CSj)
+      fprintf(stderr, "Duplicate ID %d in SOS.\n", id);
+      jpg->error = JPEG_DUPLICATE_COMPONENT_ID;
+      return false;
+    }
+    ids_seen[id] = true;
+    bool found_index = false;
+    for (int j = 0; j < jpg->components.size(); ++j) {  // map id to an index
+      if (jpg->components[j].id == id) {
+        scan_info.components[i].comp_idx = j;
+        found_index = true;
+      }
+    }
+    if (!found_index) {
+      fprintf(stderr, "SOS marker: Could not find component with id %d\n", id);
+      jpg->error = JPEG_COMPONENT_NOT_FOUND;
+      return false;
+    }
+    int c = ReadUint8(data, pos);
+    int dc_tbl_idx = c >> 4;   // high nibble
+    int ac_tbl_idx = c & 0xf;  // low nibble
+    VERIFY_INPUT(dc_tbl_idx, 0, 3, HUFFMAN_INDEX);
+    VERIFY_INPUT(ac_tbl_idx, 0, 3, HUFFMAN_INDEX);
+    scan_info.components[i].dc_tbl_idx = dc_tbl_idx;
+    scan_info.components[i].ac_tbl_idx = ac_tbl_idx;
+  }
+  VERIFY_LEN(3);  // Ss, Se and the byte that packs Ah and Al
+  scan_info.Ss = ReadUint8(data, pos);
+  scan_info.Se = ReadUint8(data, pos);
+  VERIFY_INPUT(scan_info.Ss, 0, 63, START_OF_SCAN);
+  VERIFY_INPUT(scan_info.Se, scan_info.Ss, 63, END_OF_SCAN);
+  int c = ReadUint8(data, pos);
+  scan_info.Ah = c >> 4;
+  scan_info.Al = c & 0xf;
+  // Check that all the Huffman tables needed for this scan are defined.
+  for (int i = 0; i < comps_in_scan; ++i) {
+    bool found_dc_table = false;
+    bool found_ac_table = false;
+    for (int j = 0; j < jpg->huffman_code.size(); ++j) {
+      int slot_id = jpg->huffman_code[j].slot_id;
+      if (slot_id == scan_info.components[i].dc_tbl_idx) {
+        found_dc_table = true;
+      } else if (slot_id == scan_info.components[i].ac_tbl_idx + 16) {
+        found_ac_table = true;
+      }
+    }
+    if (scan_info.Ss == 0 && !found_dc_table) {  // DC table needed if Ss == 0
+      fprintf(stderr, "SOS marker: Could not find DC Huffman table with index "
+              "%d\n", scan_info.components[i].dc_tbl_idx);
+      jpg->error = JPEG_HUFFMAN_TABLE_NOT_FOUND;
+      return false;
+    }
+    if (scan_info.Se > 0 && !found_ac_table) {  // AC table needed if Se > 0
+      fprintf(stderr, "SOS marker: Could not find AC Huffman table with index "
+              "%d\n", scan_info.components[i].ac_tbl_idx);
+      jpg->error = JPEG_HUFFMAN_TABLE_NOT_FOUND;
+      return false;
+    }
+  }
+  jpg->scan_info.push_back(scan_info);
+  VERIFY_MARKER_END();  // marker_len must equal the bytes consumed
+  return true;
+}
+
+// Reads the Define Huffman Table (DHT) marker segment and fills in *jpg with
+// the parsed data. Builds the Huffman decoding table in either dc_huff_lut or
+// ac_huff_lut, depending on the type and slot_id of Huffman code being read.
+bool ProcessDHT(const uint8_t* data, const size_t len,
+                JpegReadMode mode,
+                std::vector<HuffmanTableEntry>* dc_huff_lut,
+                std::vector<HuffmanTableEntry>* ac_huff_lut,
+                size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  if (marker_len == 2) {  // the length field covers only itself
+    fprintf(stderr, "DHT marker: no Huffman table found\n");
+    jpg->error = JPEG_EMPTY_DHT;
+    return false;
+  }
+  while (*pos < start_pos + marker_len) {  // one iteration per Huffman code
+    VERIFY_LEN(1 + kJpegHuffmanMaxBitLength);  // slot id + one count per length
+    JPEGHuffmanCode huff;
+    huff.slot_id = ReadUint8(data, pos);
+    int huffman_index = huff.slot_id;
+    int is_ac_table = (huff.slot_id & 0x10) != 0;
+    HuffmanTableEntry* huff_lut;
+    if (is_ac_table) {
+      huffman_index -= 0x10;
+      VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX);
+      huff_lut = &(*ac_huff_lut)[huffman_index * kJpegHuffmanLutSize];
+    } else {
+      VERIFY_INPUT(huffman_index, 0, 3, HUFFMAN_INDEX);
+      huff_lut = &(*dc_huff_lut)[huffman_index * kJpegHuffmanLutSize];
+    }
+    huff.counts[0] = 0;
+    int total_count = 0;
+    int space = 1 << kJpegHuffmanMaxBitLength;  // code space not yet assigned
+    int max_depth = 1;  // longest bit length that has a nonzero count
+    for (int i = 1; i <= kJpegHuffmanMaxBitLength; ++i) {
+      int count = ReadUint8(data, pos);
+      if (count != 0) {
+        max_depth = i;
+      }
+      huff.counts[i] = count;
+      total_count += count;
+      space -= count * (1 << (kJpegHuffmanMaxBitLength - i));
+    }
+    if (is_ac_table) {
+      VERIFY_INPUT(total_count, 0, kJpegHuffmanAlphabetSize, HUFFMAN_CODE);
+    } else {
+      VERIFY_INPUT(total_count, 0, kJpegDCAlphabetSize, HUFFMAN_CODE);
+    }
+    VERIFY_LEN(total_count);  // one byte per symbol value follows
+    std::vector<bool> values_seen(256, false);
+    for (int i = 0; i < total_count; ++i) {
+      uint8_t value = ReadUint8(data, pos);
+      if (!is_ac_table) {
+        VERIFY_INPUT(value, 0, kJpegDCAlphabetSize - 1, HUFFMAN_CODE);
+      }
+      if (values_seen[value]) {
+        fprintf(stderr, "Duplicate Huffman code value %d\n", value);
+        jpg->error = JPEG_INVALID_HUFFMAN_CODE;
+        return false;
+      }
+      values_seen[value] = true;
+      huff.values[i] = value;
+    }
+    // Add an invalid symbol that will have the all 1 code.
+    ++huff.counts[max_depth];
+    huff.values[total_count] = kJpegHuffmanAlphabetSize;
+    space -= (1 << (kJpegHuffmanMaxBitLength - max_depth));
+    if (space < 0) {  // the lengths claim more code space than exists
+      fprintf(stderr, "Invalid Huffman code lengths.\n");
+      jpg->error = JPEG_INVALID_HUFFMAN_CODE;
+      return false;
+    } else if (space > 0 && huff_lut[0].value != 0xffff) {
+      // Re-initialize the values to an invalid symbol so that we can recognize
+      // it when reading the bit stream using a Huffman code with space > 0.
+      for (int i = 0; i < kJpegHuffmanLutSize; ++i) {
+        huff_lut[i].bits = 0;
+        huff_lut[i].value = 0xffff;
+      }
+    }
+    huff.is_last = (*pos == start_pos + marker_len);
+    if (mode == JPEG_READ_ALL &&  // the lookup table is built only in this mode
+        !BuildJpegHuffmanTable(&huff.counts[0], &huff.values[0], huff_lut)) {
+      fprintf(stderr, "Failed to build Huffman table.\n");
+      jpg->error = JPEG_INVALID_HUFFMAN_CODE;
+      return false;
+    }
+    jpg->huffman_code.push_back(huff);
+  }
+  VERIFY_MARKER_END();  // marker_len must equal the bytes consumed
+  return true;
+}
+
+// Parses a Define Quantization Table (DQT) marker segment and appends every
+// table it contains to jpg->quant. On entry *pos points at the segment's
+// 16-bit length field. Returns false with jpg->error set when the segment
+// holds no table.
+bool ProcessDQT(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const size_t start_pos = *pos;
+  VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  if (marker_len == 2) {
+    // Only the length field is present.
+    fprintf(stderr, "DQT marker: no quantization table found\n");
+    jpg->error = JPEG_EMPTY_DQT;
+    return false;
+  }
+  while (*pos < start_pos + marker_len && jpg->quant.size() < kMaxQuantTables) {
+    VERIFY_LEN(1);
+    // High nibble: precision flag, low nibble: destination index.
+    const int table_header = ReadUint8(data, pos);
+    int quant_table_precision = table_header >> 4;
+    int quant_table_index = table_header & 0xf;
+    VERIFY_INPUT(quant_table_index, 0, 3, QUANT_TBL_INDEX);
+    VERIFY_LEN((quant_table_precision ? 2 : 1) * kDCTBlockSize);
+    JPEGQuantTable table;
+    table.precision = quant_table_precision;
+    table.index = quant_table_index;
+    for (int i = 0; i < kDCTBlockSize; ++i) {
+      // A non-zero precision flag means two bytes per entry, else one byte.
+      int quant_val;
+      if (quant_table_precision) {
+        quant_val = ReadUint16(data, pos);
+      } else {
+        quant_val = ReadUint8(data, pos);
+      }
+      VERIFY_INPUT(quant_val, 1, 65535, QUANT_VAL);
+      // Entries are stored re-ordered through kJPEGNaturalOrder.
+      table.values[kJPEGNaturalOrder[i]] = quant_val;
+    }
+    table.is_last = (start_pos + marker_len == *pos);
+    jpg->quant.push_back(table);
+  }
+  VERIFY_MARKER_END();
+  return true;
+}
+
+// Parses the DRI marker segment and stores its restart interval in *jpg.
+// A second DRI segment, seen after a non-zero interval was already stored, is
+// rejected with JPEG_DUPLICATE_DRI.
+bool ProcessDRI(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  const bool interval_already_set = jpg->restart_interval > 0;
+  if (interval_already_set) {
+    fprintf(stderr, "Duplicate DRI marker.\n");
+    jpg->error = JPEG_DUPLICATE_DRI;
+    return false;
+  }
+  const size_t start_pos = *pos;
+  // Two bytes of segment length followed by two bytes of interval.
+  VERIFY_LEN(4);
+  size_t marker_len = ReadUint16(data, pos);
+  const int interval = ReadUint16(data, pos);
+  jpg->restart_interval = interval;
+  VERIFY_MARKER_END();
+  return true;
+}
+
+// Copies one APPn marker segment into jpg->app_data. The stored string starts
+// at the marker type byte (the byte just before the length field) and runs to
+// the end of the segment, i.e. marker_len + 1 bytes.
+bool ProcessAPP(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN);
+  VERIFY_LEN(marker_len - 2);
+  // *pos is now just past the length field: step back over it (2 bytes) and
+  // over the marker type byte (1 byte).
+  const char* segment_begin = reinterpret_cast<const char*>(&data[*pos - 3]);
+  jpg->app_data.push_back(std::string(segment_begin, marker_len + 1));
+  *pos += marker_len - 2;
+  return true;
+}
+
+// Copies one COM marker segment into jpg->com_data. The stored string begins
+// at the two-byte length field and covers the whole segment (marker_len
+// bytes).
+bool ProcessCOM(const uint8_t* data, const size_t len, size_t* pos,
+                JPEGData* jpg) {
+  VERIFY_LEN(2);
+  size_t marker_len = ReadUint16(data, pos);
+  VERIFY_INPUT(marker_len, 2, 65535, MARKER_LEN);
+  VERIFY_LEN(marker_len - 2);
+  // *pos is just past the length field; step back so it is included.
+  const char* segment_begin = reinterpret_cast<const char*>(&data[*pos - 2]);
+  jpg->com_data.push_back(std::string(segment_begin, marker_len));
+  *pos += marker_len - 2;
+  return true;
+}
+
+// Bit-level reader over the entropy coded data segment. Bytes are shifted
+// into the 64-bit window val_; bits_left_ counts how many of its low bits are
+// still unread.
+struct BitReaderState {
+  BitReaderState(const uint8_t* data, const size_t len, size_t pos)
+      : data_(data), len_(len) {
+    Reset(pos);
+  }
+
+  // Restarts reading at byte offset pos with an empty window, then pre-fills
+  // the window.
+  void Reset(size_t pos) {
+    pos_ = pos;
+    val_ = 0;
+    bits_left_ = 0;
+    next_marker_pos_ = len_ - 2;
+    FillBitWindow();
+  }
+
+  // Returns the next data byte, stepping over the 0x00 of a 0xff/0x00 escape
+  // pair. Once the position of the next marker has been reached, zero bytes
+  // are returned while pos_ keeps advancing.
+  uint8_t GetNextByte() {
+    if (pos_ >= next_marker_pos_) {
+      ++pos_;
+      return 0;
+    }
+    const uint8_t byte = data_[pos_++];
+    if (byte != 0xff) {
+      return byte;
+    }
+    if (data_[pos_] == 0) {
+      // Escaped 0xff: skip the stuffed zero byte.
+      ++pos_;
+    } else {
+      // 0xff followed by a non-zero byte is the start of the next marker
+      // segment; remember where that 0xff sits.
+      next_marker_pos_ = pos_ - 1;
+    }
+    return byte;
+  }
+
+  // Tops the window up to more than 56 bits, but only when 16 or fewer bits
+  // are left in it.
+  void FillBitWindow() {
+    if (bits_left_ > 16) {
+      return;
+    }
+    do {
+      val_ = (val_ << 8) | static_cast<uint64_t>(GetNextByte());
+      bits_left_ += 8;
+    } while (bits_left_ <= 56);
+  }
+
+  // Removes the next nbits bits from the window and returns them.
+  int ReadBits(int nbits) {
+    FillBitWindow();
+    const uint64_t mask = (1ULL << nbits) - 1;
+    const uint64_t bits = (val_ >> (bits_left_ - nbits)) & mask;
+    bits_left_ -= nbits;
+    return bits;
+  }
+
+  // Sets *pos to the next stream position where parsing should continue.
+  // Returns false if the stream ended too early.
+  bool FinishStream(size_t* pos) {
+    // Hand back every whole byte that is still buffered in the window.
+    for (int remaining = bits_left_ >> 3; remaining > 0; --remaining) {
+      --pos_;
+      // A returned 0 byte that was the second half of a 0xff/0x00 escape
+      // sequence drags the 0xff back with it.
+      const bool was_escape = pos_ < next_marker_pos_ &&
+                              data_[pos_] == 0 && data_[pos_ - 1] == 0xff;
+      if (was_escape) {
+        --pos_;
+      }
+    }
+    if (pos_ > next_marker_pos_) {
+      // Data ran out before the scan was complete.
+      fprintf(stderr, "Unexpected end of scan.\n");
+      return false;
+    }
+    *pos = pos_;
+    return true;
+  }
+
+  const uint8_t* data_;
+  const size_t len_;
+  size_t pos_;
+  uint64_t val_;
+  int bits_left_;
+  size_t next_marker_pos_;
+};
+
+// Decodes and returns the next Huffman-coded symbol from *br, using a lookup
+// table whose first level is indexed by the next 8 bits and whose optional
+// second level is indexed by the bits that follow.
+int ReadSymbol(const HuffmanTableEntry* table, BitReaderState* br) {
+  br->FillBitWindow();
+  const int first_index = (br->val_ >> (br->bits_left_ - 8)) & 0xff;
+  const HuffmanTableEntry* entry = table + first_index;
+  const int extra_bits = entry->bits - 8;
+  if (extra_bits > 0) {
+    // Codes longer than 8 bits: the first-level entry holds the offset of a
+    // second-level table, indexed by the next extra_bits bits.
+    br->bits_left_ -= 8;
+    const int second_index =
+        (br->val_ >> (br->bits_left_ - extra_bits)) & ((1 << extra_bits) - 1);
+    entry += entry->value;
+    entry += second_index;
+  }
+  br->bits_left_ -= entry->bits;
+  return entry->value;
+}
+
+// Returns the DC diff or AC value for extra bits value x and prefix code s.
+// See Tables F.1 and F.2 of the spec.
+// Values x below 2^(s-1) denote negative numbers and map to x - 2^s + 1; all
+// other values are returned unchanged. Callers must pass s >= 1.
+int HuffExtend(int x, int s) {
+  // Written as a subtraction of (1 << s): left-shifting the negative constant
+  // -1 is undefined behavior before C++20.
+  return (x < (1 << (s - 1)) ? x - (1 << s) + 1 : x);
+}
+
+// Decodes one 8x8 block of DCT coefficients from the bit stream.
+//
+// dc_huff / ac_huff: Huffman lookup tables used for the DC and AC symbols.
+// Ss, Se: first and last coefficient index (in scan order) coded in this scan.
+// Al: every decoded value is scaled by 2^Al before it is stored.
+// eobrun: in/out end-of-band run counter; while it is positive the block's AC
+//   band is skipped and the counter is decremented.
+// last_dc_coeff: in/out DC predictor (unscaled) of the block's component.
+// coeffs: the block's 64 coefficients; decoded values are written through
+//   kJPEGNaturalOrder, except the DC value which goes to coeffs[0].
+// Returns false with jpg->error set when an invalid symbol, an out-of-band
+// index or a non-representable coefficient is met.
+bool DecodeDCTBlock(const HuffmanTableEntry* dc_huff,
+                    const HuffmanTableEntry* ac_huff,
+                    int Ss, int Se, int Al,
+                    int* eobrun,
+                    BitReaderState* br,
+                    JPEGData* jpg,
+                    coeff_t* last_dc_coeff,
+                    coeff_t* coeffs) {
+  int s;
+  int r;
+  // An end-of-band run longer than one block is only accepted in scans that
+  // do not contain the DC coefficient.
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    s = ReadSymbol(dc_huff, br);
+    if (s >= kJpegDCAlphabetSize) {
+      fprintf(stderr, "Invalid Huffman symbol %d for DC coefficient.\n", s);
+      jpg->error = JPEG_INVALID_SYMBOL;
+      return false;
+    }
+    if (s > 0) {
+      r = br->ReadBits(s);
+      s = HuffExtend(r, s);
+    }
+    s += *last_dc_coeff;
+    // s may be negative here; multiply instead of shifting, because a left
+    // shift of a negative value is undefined behavior before C++20.
+    const int dc_coeff = s * (1 << Al);
+    coeffs[0] = dc_coeff;
+    // Reading the value back detects one that does not fit into coeff_t.
+    if (dc_coeff != coeffs[0]) {
+      fprintf(stderr, "Invalid DC coefficient %d\n", dc_coeff);
+      jpg->error = JPEG_NON_REPRESENTABLE_DC_COEFF;
+      return false;
+    }
+    *last_dc_coeff = s;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    // Nothing but the DC coefficient was coded in this scan.
+    return true;
+  }
+  if (*eobrun > 0) {
+    // This block is covered by a pending end-of-band run.
+    --(*eobrun);
+    return true;
+  }
+  for (int k = Ss; k <= Se; k++) {
+    s = ReadSymbol(ac_huff, br);
+    if (s >= kJpegHuffmanAlphabetSize) {
+      fprintf(stderr, "Invalid Huffman symbol %d for AC coefficient %d\n",
+              s, k);
+      jpg->error = JPEG_INVALID_SYMBOL;
+      return false;
+    }
+    // High nibble: zero run length, low nibble: number of extra bits.
+    r = s >> 4;
+    s &= 15;
+    if (s > 0) {
+      k += r;
+      if (k > Se) {
+        fprintf(stderr, "Out-of-band coefficient %d band was %d-%d\n",
+                k, Ss, Se);
+        jpg->error = JPEG_OUT_OF_BAND_COEFF;
+        return false;
+      }
+      if (s + Al >= kJpegDCAlphabetSize) {
+        fprintf(stderr, "Out of range AC coefficient value: s=%d Al=%d k=%d\n",
+                s, Al, k);
+        jpg->error = JPEG_NON_REPRESENTABLE_AC_COEFF;
+        return false;
+      }
+      r = br->ReadBits(s);
+      s = HuffExtend(r, s);
+      // Same as above: s may be negative, so scale by multiplication.
+      coeffs[kJPEGNaturalOrder[k]] = s * (1 << Al);
+    } else if (r == 15) {
+      // Run of 16 zeros: 15 here plus the loop increment.
+      k += 15;
+    } else {
+      // End of band; the run covers 2^r blocks plus r extra bits.
+      *eobrun = 1 << r;
+      if (r > 0) {
+        if (!eobrun_allowed) {
+          fprintf(stderr, "End-of-block run crossing DC coeff.\n");
+          jpg->error = JPEG_EOB_RUN_TOO_LONG;
+          return false;
+        }
+        *eobrun += br->ReadBits(r);
+      }
+      break;
+    }
+  }
+  // Account for the current block.
+  --(*eobrun);
+  return true;
+}
+
+// Applies one refinement scan to an 8x8 block whose coefficients were already
+// partly decoded by earlier scans.
+//
+// ac_huff: Huffman lookup table used for the AC symbols.
+// Ss, Se: first and last coefficient index (in scan order) coded in this scan.
+// Al: position of the bit that this scan adds to each coefficient.
+// eobrun: in/out end-of-band run counter.
+// coeffs: the block's 64 coefficients, updated in place.
+// Returns false with jpg->error set on an invalid symbol, an out-of-band
+// index or a zero run that is not followed by a coefficient.
+bool RefineDCTBlock(const HuffmanTableEntry* ac_huff,
+                    int Ss, int Se, int Al,
+                    int* eobrun,
+                    BitReaderState* br,
+                    JPEGData* jpg,
+                    coeff_t* coeffs) {
+  bool eobrun_allowed = Ss > 0;
+  if (Ss == 0) {
+    // DC refinement: one raw bit is OR-ed in at bit position Al.
+    int s = br->ReadBits(1);
+    coeff_t dc_coeff = coeffs[0];
+    dc_coeff |= s << Al;
+    coeffs[0] = dc_coeff;
+    ++Ss;
+  }
+  if (Ss > Se) {
+    return true;
+  }
+  // +2^Al and -2^Al. The negative one is derived by negation: left-shifting
+  // -1 is undefined behavior before C++20.
+  int p1 = 1 << Al;
+  int m1 = -p1;
+  int k = Ss;
+  int r;
+  int s;
+  bool in_zero_run = false;
+  if (*eobrun <= 0) {
+    for (; k <= Se; k++) {
+      s = ReadSymbol(ac_huff, br);
+      if (s >= kJpegHuffmanAlphabetSize) {
+        fprintf(stderr, "Invalid Huffman symbol %d for AC coefficient %d\n",
+                s, k);
+        jpg->error = JPEG_INVALID_SYMBOL;
+        return false;
+      }
+      // High nibble: zero run length, low nibble: size of the new value.
+      r = s >> 4;
+      s &= 15;
+      if (s) {
+        // A newly non-zero coefficient can only have size 1.
+        if (s != 1) {
+          fprintf(stderr, "Invalid Huffman symbol %d for AC coefficient %d\n",
+                  s, k);
+          jpg->error = JPEG_INVALID_SYMBOL;
+          return false;
+        }
+        s = br->ReadBits(1) ? p1 : m1;
+        in_zero_run = false;
+      } else {
+        if (r != 15) {
+          // End of band; the run covers 2^r blocks plus r extra bits.
+          *eobrun = 1 << r;
+          if (r > 0) {
+            if (!eobrun_allowed) {
+              fprintf(stderr, "End-of-block run crossing DC coeff.\n");
+              jpg->error = JPEG_EOB_RUN_TOO_LONG;
+              return false;
+            }
+            *eobrun += br->ReadBits(r);
+          }
+          break;
+        }
+        in_zero_run = true;
+      }
+      // Walk forward over r still-zero coefficients. Every already non-zero
+      // coefficient passed on the way consumes one correction bit.
+      do {
+        coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+        if (thiscoef != 0) {
+          if (br->ReadBits(1)) {
+            // Apply the correction only if this bit is not set yet.
+            if ((thiscoef & p1) == 0) {
+              if (thiscoef >= 0) {
+                thiscoef += p1;
+              } else {
+                thiscoef += m1;
+              }
+            }
+          }
+          coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+        } else {
+          if (--r < 0) {
+            break;
+          }
+        }
+        k++;
+      } while (k <= Se);
+      if (s) {
+        if (k > Se) {
+          fprintf(stderr, "Out-of-band coefficient %d band was %d-%d\n",
+                  k, Ss, Se);
+          jpg->error = JPEG_OUT_OF_BAND_COEFF;
+          return false;
+        }
+        coeffs[kJPEGNaturalOrder[k]] = s;
+      }
+    }
+  }
+  // The band must not end on a run of 16 zeros.
+  if (in_zero_run) {
+    fprintf(stderr, "Extra zero run before end-of-block.\n");
+    jpg->error = JPEG_EXTRA_ZERO_RUN;
+    return false;
+  }
+  if (*eobrun > 0) {
+    // Inside an end-of-band run only correction bits for the already non-zero
+    // coefficients of the rest of the band are present.
+    for (; k <= Se; k++) {
+      coeff_t thiscoef = coeffs[kJPEGNaturalOrder[k]];
+      if (thiscoef != 0) {
+        if (br->ReadBits(1)) {
+          if ((thiscoef & p1) == 0) {
+            if (thiscoef >= 0) {
+              thiscoef += p1;
+            } else {
+              thiscoef += m1;
+            }
+          }
+        }
+        coeffs[kJPEGNaturalOrder[k]] = thiscoef;
+      }
+    }
+  }
+  // Account for the current block.
+  --(*eobrun);
+  return true;
+}
+
+// Closes the current restart interval: finishes the bit stream, checks that
+// the expected RSTn marker (0xd0 + *next_restart_marker) follows, restarts the
+// bit reader right after that marker and advances *next_restart_marker
+// modulo 8. Returns false with jpg->error set otherwise.
+bool ProcessRestart(const uint8_t* data, const size_t len,
+                    int* next_restart_marker, BitReaderState* br,
+                    JPEGData* jpg) {
+  size_t pos = 0;
+  const bool stream_ok = br->FinishStream(&pos);
+  if (!stream_ok) {
+    jpg->error = JPEG_INVALID_SCAN;
+    return false;
+  }
+  const int wanted = 0xd0 + *next_restart_marker;
+  EXPECT_MARKER();
+  const int found = data[pos + 1];
+  if (found != wanted) {
+    fprintf(stderr, "Did not find expected restart marker %d actual=%d\n",
+            wanted, found);
+    jpg->error = JPEG_WRONG_RESTART_MARKER;
+    return false;
+  }
+  // Continue reading bits right behind the two marker bytes.
+  br->Reset(pos + 2);
+  *next_restart_marker = (*next_restart_marker + 1) & 0x7;
+  return true;
+}
+
+// Parses one scan: the SOS header (through ProcessSOS) followed by the
+// entropy coded data, whose coefficients are decoded into the coeffs arrays
+// of the components taking part in the scan.
+//
+// dc_huff_lut / ac_huff_lut: lookup tables of all Huffman table slots; the
+//   slot of each component is selected through its dc_tbl_idx / ac_tbl_idx.
+// scan_progression: per component and coefficient, the bit mask of what
+//   earlier scans already coded; used to reject overlapping or mis-ordered
+//   scans and updated for this scan.
+// is_progressive: when false, Ss/Se/Ah/Al of the scan header are ignored and
+//   the full band 0..63 is decoded without successive approximation.
+// pos: in/out read offset; on success it is left where the bit reader
+//   finished the stream.
+// Returns false with jpg->error set on any error.
+bool ProcessScan(const uint8_t* data, const size_t len,
+                 const std::vector<HuffmanTableEntry>& dc_huff_lut,
+                 const std::vector<HuffmanTableEntry>& ac_huff_lut,
+                 uint16_t scan_progression[kMaxComponents][kDCTBlockSize],
+                 bool is_progressive,
+                 size_t* pos,
+                 JPEGData* jpg) {
+  if (!ProcessSOS(data, len, pos, jpg)) {
+    return false;
+  }
+  JPEGScanInfo* scan_info = &jpg->scan_info.back();
+  bool is_interleaved = (scan_info->components.size() > 1);
+  int MCUs_per_row;
+  int MCU_rows;
+  if (is_interleaved) {
+    MCUs_per_row = jpg->MCU_cols;
+    MCU_rows = jpg->MCU_rows;
+  } else {
+    // A single-component scan has one block per MCU, so the MCU grid is the
+    // component's own block grid.
+    const JPEGComponent& c = jpg->components[scan_info->components[0].comp_idx];
+    MCUs_per_row =
+        DivCeil(jpg->width * c.h_samp_factor, 8 * jpg->max_h_samp_factor);
+    MCU_rows =
+        DivCeil(jpg->height * c.v_samp_factor, 8 * jpg->max_v_samp_factor);
+  }
+  coeff_t last_dc_coeff[kMaxComponents] = {0};
+  BitReaderState br(data, len, *pos);
+  int restarts_to_go = jpg->restart_interval;
+  int next_restart_marker = 0;
+  int eobrun = -1;
+  const int Al = is_progressive ? scan_info->Al : 0;
+  const int Ah = is_progressive ? scan_info->Ah : 0;
+  const int Ss = is_progressive ? scan_info->Ss : 0;
+  const int Se = is_progressive ? scan_info->Se : 63;
+  // Bits coded by this scan: bit Al and above for a first scan (Ah == 0),
+  // bit Al alone for a refinement scan.
+  const uint16_t scan_bitmask = Ah == 0 ? (0xffff << Al) : (1u << Al);
+  // Bits below Al; if any of them was coded already, a finer scan came first.
+  const uint16_t refinement_bitmask = (1 << Al) - 1;
+  for (int i = 0; i < scan_info->components.size(); ++i) {
+    int comp_idx = scan_info->components[i].comp_idx;
+    for (int k = Ss; k <= Se; ++k) {
+      if (scan_progression[comp_idx][k] & scan_bitmask) {
+        // Report the mask of the component that was actually tested.
+        fprintf(stderr, "Overlapping scans: component=%d k=%d prev_mask=%d "
+                "cur_mask=%d\n", comp_idx, k, scan_progression[comp_idx][k],
+                scan_bitmask);
+        jpg->error = JPEG_OVERLAPPING_SCANS;
+        return false;
+      }
+      if (scan_progression[comp_idx][k] & refinement_bitmask) {
+        fprintf(stderr, "Invalid scan order, a more refined scan was already "
+                "done: component=%d k=%d prev_mask=%d cur_mask=%d\n", comp_idx,
+                k, scan_progression[comp_idx][k], scan_bitmask);
+        jpg->error = JPEG_INVALID_SCAN_ORDER;
+        return false;
+      }
+      scan_progression[comp_idx][k] |= scan_bitmask;
+    }
+  }
+  if (Al > 10) {
+    fprintf(stderr, "Scan parameter Al=%d is not supported in guetzli.\n", Al);
+    jpg->error = JPEG_NON_REPRESENTABLE_AC_COEFF;
+    return false;
+  }
+  for (int mcu_y = 0; mcu_y < MCU_rows; ++mcu_y) {
+    for (int mcu_x = 0; mcu_x < MCUs_per_row; ++mcu_x) {
+      // Handle the restart intervals.
+      if (jpg->restart_interval > 0) {
+        if (restarts_to_go == 0) {
+          if (ProcessRestart(data, len,
+                             &next_restart_marker, &br, jpg)) {
+            restarts_to_go = jpg->restart_interval;
+            // DC prediction starts over in every restart interval.
+            memset(last_dc_coeff, 0, sizeof(last_dc_coeff));
+            // An end-of-band run must not extend across a restart marker.
+            if (eobrun > 0) {
+              fprintf(stderr, "End-of-block run too long.\n");
+              jpg->error = JPEG_EOB_RUN_TOO_LONG;
+              return false;
+            }
+            eobrun = -1;   // fresh start
+          } else {
+            return false;
+          }
+        }
+        --restarts_to_go;
+      }
+      // Decode one MCU.
+      for (int i = 0; i < scan_info->components.size(); ++i) {
+        JPEGComponentScanInfo* si = &scan_info->components[i];
+        JPEGComponent* c = &jpg->components[si->comp_idx];
+        const HuffmanTableEntry* dc_lut =
+            &dc_huff_lut[si->dc_tbl_idx * kJpegHuffmanLutSize];
+        const HuffmanTableEntry* ac_lut =
+            &ac_huff_lut[si->ac_tbl_idx * kJpegHuffmanLutSize];
+        // An interleaved MCU holds v_samp_factor x h_samp_factor blocks of
+        // the component, a non-interleaved one exactly one block.
+        int nblocks_y = is_interleaved ? c->v_samp_factor : 1;
+        int nblocks_x = is_interleaved ? c->h_samp_factor : 1;
+        for (int iy = 0; iy < nblocks_y; ++iy) {
+          for (int ix = 0; ix < nblocks_x; ++ix) {
+            int block_y = mcu_y * nblocks_y + iy;
+            int block_x = mcu_x * nblocks_x + ix;
+            int block_idx = block_y * c->width_in_blocks + block_x;
+            coeff_t* coeffs = &c->coeffs[block_idx * kDCTBlockSize];
+            if (Ah == 0) {
+              if (!DecodeDCTBlock(dc_lut, ac_lut, Ss, Se, Al, &eobrun, &br, jpg,
+                                  &last_dc_coeff[si->comp_idx], coeffs)) {
+                return false;
+              }
+            } else {
+              if (!RefineDCTBlock(ac_lut, Ss, Se, Al,
+                                  &eobrun, &br, jpg, coeffs)) {
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  // A run that would cover blocks beyond the last one of the scan is invalid.
+  if (eobrun > 0) {
+    fprintf(stderr, "End-of-block run too long.\n");
+    jpg->error = JPEG_EOB_RUN_TOO_LONG;
+    return false;
+  }
+  if (!br.FinishStream(pos)) {
+    jpg->error = JPEG_INVALID_SCAN;
+    return false;
+  }
+  if (*pos > len) {
+    fprintf(stderr, "Unexpected end of file during scan. pos=%d len=%d\n",
+            static_cast<int>(*pos), static_cast<int>(len));
+    jpg->error = JPEG_UNEXPECTED_EOF;
+    return false;
+  }
+  return true;
+}
+
+// Rewrites the quant_idx of every component: the table index read from the
+// stream is replaced by the position, inside jpg->quant, of the first table
+// carrying that index. Fails with JPEG_QUANT_TABLE_NOT_FOUND when a component
+// refers to a table that is not present.
+bool FixupIndexes(JPEGData* jpg) {
+  for (int comp = 0; comp < jpg->components.size(); ++comp) {
+    JPEGComponent* c = &jpg->components[comp];
+    // Linear search for the first table with a matching index.
+    int slot = 0;
+    while (slot < jpg->quant.size() &&
+           jpg->quant[slot].index != c->quant_idx) {
+      ++slot;
+    }
+    if (!(slot < jpg->quant.size())) {
+      fprintf(stderr, "Quantization table with index %d not found\n",
+              c->quant_idx);
+      jpg->error = JPEG_QUANT_TABLE_NOT_FOUND;
+      return false;
+    }
+    c->quant_idx = slot;
+  }
+  return true;
+}
+
+// Returns how many bytes, starting at data[pos], have to be skipped to reach
+// the next 0xff byte that is followed by a valid marker code. If there is no
+// such pair, the count runs up to the last byte of the input.
+size_t FindNextMarker(const uint8_t* data, const size_t len, size_t pos) {
+  // kIsValidMarker[i] == 1 means (0xc0 + i) is a valid marker.
+  static const uint8_t kIsValidMarker[] = {
+    1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
+  };
+  const size_t first = pos;
+  for (; pos + 1 < len; ++pos) {
+    const uint8_t code = data[pos + 1];
+    const bool at_marker =
+        data[pos] == 0xff && code >= 0xc0 && kIsValidMarker[code - 0xc0];
+    if (at_marker) {
+      break;
+    }
+  }
+  return pos - first;
+}
+
+}  // namespace
+
+// Parses the jpeg stream in data[0 ... len) marker by marker and stores the
+// result in *jpg. The stream must start with an SOI marker; parsing runs until
+// the EOI marker or, in JPEG_READ_HEADER mode, until an SOF segment has been
+// processed. Scans are only decoded in JPEG_READ_ALL mode, and APP/COM
+// segments are not stored in JPEG_READ_TABLES mode.
+// Returns false if a marker segment cannot be processed, if an unsupported
+// marker is met, or if no SOF marker was found.
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
+              JPEGData* jpg) {
+  size_t pos = 0;
+  // Check SOI marker.
+  EXPECT_MARKER();
+  int marker = data[pos + 1];
+  pos += 2;
+  if (marker != 0xd8) {
+    fprintf(stderr, "Did not find expected SOI marker, actual=%d\n", marker);
+    jpg->error = JPEG_SOI_NOT_FOUND;
+    return false;
+  }
+  // One lookup table of kJpegHuffmanLutSize entries per Huffman table slot,
+  // kept separately for DC and AC codes.
+  int lut_size = kMaxHuffmanTables * kJpegHuffmanLutSize;
+  std::vector<HuffmanTableEntry> dc_huff_lut(lut_size);
+  std::vector<HuffmanTableEntry> ac_huff_lut(lut_size);
+  bool found_sof = false;
+  // Per component and coefficient: bit mask of what the scans read so far
+  // have coded (maintained by ProcessScan).
+  uint16_t scan_progression[kMaxComponents][kDCTBlockSize] = { { 0 } };
+
+  bool is_progressive = false;   // default
+  do {
+    // Read next marker.
+    size_t num_skipped = FindNextMarker(data, len, pos);
+    if (num_skipped > 0) {
+      // Add a fake marker to indicate arbitrary in-between-markers data.
+      jpg->marker_order.push_back(0xff);
+      jpg->inter_marker_data.push_back(
+          std::string(reinterpret_cast<const char*>(&data[pos]),
+                                      num_skipped));
+      pos += num_skipped;
+    }
+    EXPECT_MARKER();
+    marker = data[pos + 1];
+    pos += 2;
+    bool ok = true;
+    switch (marker) {
+      // SOF0, SOF1 and SOF2 frame headers; only SOF2 (0xc2) is progressive.
+      case 0xc0:
+      case 0xc1:
+      case 0xc2:
+        is_progressive = (marker == 0xc2);
+        ok = ProcessSOF(data, len, mode, &pos, jpg);
+        found_sof = true;
+        break;
+      // DHT: Huffman table definitions.
+      case 0xc4:
+        ok = ProcessDHT(data, len, mode, &dc_huff_lut, &ac_huff_lut, &pos, jpg);
+        break;
+      case 0xd0:
+      case 0xd1:
+      case 0xd2:
+      case 0xd3:
+      case 0xd4:
+      case 0xd5:
+      case 0xd6:
+      case 0xd7:
+        // RST markers do not have any data.
+        break;
+      case 0xd9:
+        // Found end marker.
+        break;
+      // SOS: start of scan. In the other modes the scan is not parsed here, so
+      // pos stays right behind the marker.
+      case 0xda:
+        if (mode == JPEG_READ_ALL) {
+          ok = ProcessScan(data, len, dc_huff_lut, ac_huff_lut,
+                           scan_progression, is_progressive, &pos, jpg);
+        }
+        break;
+      // DQT: quantization table definitions.
+      case 0xdb:
+        ok = ProcessDQT(data, len, &pos, jpg);
+        break;
+      // DRI: restart interval definition.
+      case 0xdd:
+        ok = ProcessDRI(data, len, &pos, jpg);
+        break;
+      // APP0 ... APP15 application segments.
+      case 0xe0:
+      case 0xe1:
+      case 0xe2:
+      case 0xe3:
+      case 0xe4:
+      case 0xe5:
+      case 0xe6:
+      case 0xe7:
+      case 0xe8:
+      case 0xe9:
+      case 0xea:
+      case 0xeb:
+      case 0xec:
+      case 0xed:
+      case 0xee:
+      case 0xef:
+        if (mode != JPEG_READ_TABLES) {
+          ok = ProcessAPP(data, len, &pos, jpg);
+        }
+        break;
+      // COM: comment segment.
+      case 0xfe:
+        if (mode != JPEG_READ_TABLES) {
+          ok = ProcessCOM(data, len, &pos, jpg);
+        }
+        break;
+      default:
+        fprintf(stderr, "Unsupported marker: %d pos=%d len=%d\n",
+                marker, static_cast<int>(pos), static_cast<int>(len));
+        jpg->error = JPEG_UNSUPPORTED_MARKER;
+        ok = false;
+        break;
+    }
+    if (!ok) {
+      return false;
+    }
+    jpg->marker_order.push_back(marker);
+    // Header-only mode is done as soon as a frame header has been processed.
+    if (mode == JPEG_READ_HEADER && found_sof) {
+      break;
+    }
+  } while (marker != 0xd9);
+
+  if (!found_sof) {
+    fprintf(stderr, "Missing SOF marker.\n");
+    jpg->error = JPEG_SOF_NOT_FOUND;
+    return false;
+  }
+
+  // Supplemental checks.
+  if (mode == JPEG_READ_ALL) {
+    // Keep whatever follows the EOI marker.
+    if (pos < len) {
+      jpg->tail_data.assign(reinterpret_cast<const char*>(&data[pos]),
+                            len - pos);
+    }
+    if (!FixupIndexes(jpg)) {
+      return false;
+    }
+    if (jpg->huffman_code.size() == 0) {
+      // Section B.2.4.2: "If a table has never been defined for a particular
+      // destination, then when this destination is specified in a scan header,
+      // the results are unpredictable."
+      fprintf(stderr, "Need at least one Huffman code table.\n");
+      jpg->error = JPEG_HUFFMAN_TABLE_ERROR;
+      return false;
+    }
+    if (jpg->huffman_code.size() >= kMaxDHTMarkers) {
+      fprintf(stderr, "Too many Huffman tables.\n");
+      jpg->error = JPEG_HUFFMAN_TABLE_ERROR;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Convenience overload: parses a jpeg stream held in a std::string by
+// forwarding its bytes and size to the pointer/length variant.
+bool ReadJpeg(const std::string& data, JpegReadMode mode,
+              JPEGData* jpg) {
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(data.data());
+  const size_t num_bytes = data.size();
+  return ReadJpeg(bytes, num_bytes, mode, jpg);
+}
+
+}  // namespace guetzli
diff --git a/guetzli/jpeg_data_reader.h b/guetzli/jpeg_data_reader.h
new file mode 100755
index 00000000..32142cf0
--- /dev/null
+++ b/guetzli/jpeg_data_reader.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Functions for reading a jpeg byte stream into a JPEGData object.
+
+#ifndef GUETZLI_JPEG_DATA_READER_H_
+#define GUETZLI_JPEG_DATA_READER_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <string>
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+enum JpegReadMode {
+  JPEG_READ_HEADER,   // only basic headers; stops once SOF has been read
+  JPEG_READ_TABLES,   // headers and tables (quant, Huffman, ...), no scan data
+  JPEG_READ_ALL,      // everything, including the entropy coded scans
+};
+
+// Parses the jpeg stream contained in data[0 ... len) and fills in *jpg with
+// the parsed information.
+// If mode is JPEG_READ_HEADER, parsing stops after the first SOF segment.
+// Returns false if the data is not valid jpeg, or if it contains an unsupported
+// jpeg feature.
+bool ReadJpeg(const uint8_t* data, const size_t len, JpegReadMode mode,
+              JPEGData* jpg);
+// Same as above, for a jpeg stream held in a std::string.
+bool ReadJpeg(const std::string& data, JpegReadMode mode,
+              JPEGData* jpg);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_DATA_READER_H_
diff --git a/guetzli/jpeg_data_writer.cc b/guetzli/jpeg_data_writer.cc
new file mode 100755
index 00000000..5f07f9ec
--- /dev/null
+++ b/guetzli/jpeg_data_writer.cc
@@ -0,0 +1,567 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/jpeg_data_writer.h"
+
+#include <assert.h>
+#include <cstdlib>
+
+#include "guetzli/entropy_encode.h"
+#include "guetzli/fast_log.h"
+#include "guetzli/jpeg_bit_writer.h"
+
+namespace guetzli {
+
+namespace {
+
+static const int kJpegPrecision = 8;  // precision byte written by EncodeSOF()
+
+// Hands buf[0, len) to the out callback, split into pieces of at most 2^30
+// bytes each. Returns false as soon as one piece is not fully written.
+inline bool JPEGWrite(JPEGOutput out, const uint8_t* buf, size_t len) {
+  static const size_t kMaxChunk = 1u << 30;
+  const uint8_t* cursor = buf;
+  size_t remaining = len;
+  for (; remaining > kMaxChunk; remaining -= kMaxChunk) {
+    if (!out.Write(cursor, kMaxChunk)) return false;
+    cursor += kMaxChunk;
+  }
+  return out.Write(cursor, remaining);
+}
+
+// String overload: forwards the bytes held by s to the pointer/length variant.
+inline bool JPEGWrite(JPEGOutput out, const std::string& s) {
+  const char* const chars = s.data();
+  return JPEGWrite(out, reinterpret_cast<const uint8_t*>(chars), s.size());
+}
+
+// Writes a fixed JFIF APP0 if strip_metadata, else jpg's app_data and com_data.
+bool EncodeMetadata(const JPEGData& jpg, bool strip_metadata, JPEGOutput out) {
+  if (strip_metadata) {
+    const uint8_t kApp0Data[] = {
+      0xff, 0xe0, 0x00, 0x10,        // APP0
+      0x4a, 0x46, 0x49, 0x46, 0x00,  // 'JFIF'
+      0x01, 0x01,                    // v1.01
+      0x00, 0x00, 0x01, 0x00, 0x01,  // aspect ratio = 1:1
+      0x00, 0x00                     // thumbnail width/height
+    };
+    return JPEGWrite(out, kApp0Data, sizeof(kApp0Data));
+  }
+  const uint8_t kMarkerPrefix[1] = { 0xff };     // precedes each app_data entry
+  const uint8_t kComMarker[2] = { 0xff, 0xfe };  // precedes each com_data entry
+  for (size_t n = 0; n < jpg.app_data.size(); ++n) {
+    if (!JPEGWrite(out, kMarkerPrefix, sizeof(kMarkerPrefix))) return false;
+    if (!JPEGWrite(out, jpg.app_data[n])) return false;
+  }
+  for (size_t n = 0; n < jpg.com_data.size(); ++n) {
+    if (!JPEGWrite(out, kComMarker, sizeof(kComMarker))) return false;
+    if (!JPEGWrite(out, jpg.com_data[n])) return false;
+  }
+  return true;
+}
+
+// Serializes every table in quant into a single DQT marker segment and writes
+// it to out. Values are emitted in the order given by kJPEGNaturalOrder; tables
+// with a non-zero precision field use two bytes per value (high byte first).
+bool EncodeDQT(const std::vector<JPEGQuantTable>& quant, JPEGOutput out) {
+  int marker_len = 2;  // the two length bytes are part of the segment length
+  for (const JPEGQuantTable& q : quant) {
+    marker_len += 1 + (q.precision ? 2 : 1) * kDCTBlockSize;
+  }
+  std::vector<uint8_t> bytes;
+  bytes.reserve(marker_len + 2);
+  bytes.push_back(0xff);
+  bytes.push_back(0xdb);
+  bytes.push_back(marker_len >> 8);
+  bytes.push_back(marker_len & 0xff);
+  for (const JPEGQuantTable& q : quant) {
+    bytes.push_back((q.precision << 4) + q.index);
+    for (int k = 0; k < kDCTBlockSize; ++k) {
+      const int val = q.values[kJPEGNaturalOrder[k]];
+      if (q.precision) bytes.push_back(val >> 8);
+      bytes.push_back(val & 0xff);
+    }
+  }
+  return JPEGWrite(out, &bytes[0], bytes.size());
+}
+
+// Writes the SOF segment (marker 0xff 0xc0): kJpegPrecision, image height and
+// width, then id, sampling factors and quant table index of each component.
+// Returns false, writing nothing, if a quant_idx is not an index of jpg.quant.
+bool EncodeSOF(const JPEGData& jpg, JPEGOutput out) {
+  const size_t ncomps = jpg.components.size();
+  const size_t marker_len = 8 + 3 * ncomps;
+  std::vector<uint8_t> bytes;
+  bytes.reserve(marker_len + 2);
+  bytes.push_back(0xff);
+  bytes.push_back(0xc0);
+  bytes.push_back(marker_len >> 8);
+  bytes.push_back(marker_len & 0xff);
+  bytes.push_back(kJpegPrecision);
+  bytes.push_back(jpg.height >> 8);
+  bytes.push_back(jpg.height & 0xff);
+  bytes.push_back(jpg.width >> 8);
+  bytes.push_back(jpg.width & 0xff);
+  bytes.push_back(ncomps);
+  for (const JPEGComponent& comp : jpg.components) {
+    bytes.push_back(comp.id);
+    bytes.push_back((comp.h_samp_factor << 4) | (comp.v_samp_factor));
+    const int quant_idx = comp.quant_idx;
+    if (quant_idx >= jpg.quant.size()) return false;
+    bytes.push_back(jpg.quant[quant_idx].index);
+  }
+  return JPEGWrite(out, &bytes[0], bytes.size());
+}
+
+// Builds a JPEG-style huffman code from the given bit depths: counts[d] is
+// incremented once per symbol of depth d > 0 (the caller passes in zeroed
+// counts), and values receives those symbols ordered by depth, then by index.
+void BuildHuffmanCode(uint8_t* depth, int* counts, int* values) {
+  for (int sym = 0; sym < JpegHistogram::kSize; ++sym) {
+    if (depth[sym] != 0) counts[depth[sym]] += 1;
+  }
+  // next_slot[d] is where the next symbol of depth d goes inside values.
+  int next_slot[kJpegHuffmanMaxBitLength + 1] = { 0 };
+  for (int d = 1; d <= kJpegHuffmanMaxBitLength; ++d) {
+    next_slot[d] = next_slot[d - 1] + counts[d - 1];
+  }
+  for (int sym = 0; sym < JpegHistogram::kSize; ++sym) {
+    const int d = depth[sym];
+    if (d != 0) values[next_slot[d]++] = sym;
+  }
+}
+
+// Assigns codes, in order of increasing length, to the symbols listed in
+// values (counts[l] of them have length l) and stores length and code of each
+// symbol in *table. The very last code is skipped; it is the fake symbol's.
+void BuildHuffmanCodeTable(const int* counts, const int* values,
+                           HuffmanCodeTable* table) {
+  // Up to JpegHistogram::kSize lengths can arrive here, not just 256.
+  int huffcode[JpegHistogram::kSize];
+  int huffsize[JpegHistogram::kSize];
+  int p = 0;
+  for (int l = 1; l <= kJpegHuffmanMaxBitLength; ++l) {
+    int i = counts[l];
+    while (i--) huffsize[p++] = l;
+  }
+  if (p == 0) return;
+  huffsize[p - 1] = 0;  // drops the last code; the 0 also ends the scan below
+  int lastp = p - 1;
+  int code = 0;
+  int si = huffsize[0];
+  p = 0;
+  while (huffsize[p]) {
+    while ((huffsize[p]) == si) {
+      huffcode[p++] = code;
+      code++;
+    }
+    code <<= 1;
+    si++;
+  }
+  for (p = 0; p < lastp; p++) {
+    int i = values[p];
+    table->depth[i] = huffsize[p];
+    table->code[i] = huffcode[p];
+  }
+}
+
+}  // namespace
+
+// Adds to ac_histogram the AC symbols that a sequential jpeg encoder emits for
+// this block: 0xf0 for every 16 zeros in front of a non-zero coefficient,
+// (run << 4) + nbits for the coefficient itself, and 0 if the block ends in
+// zeros. JpegHistogram::Add() counts every symbol twice so that a fake symbol
+// with count 1 can be added at the end as the last (least frequent) symbol
+// with the all 1 code.
+void UpdateACHistogramForDCTBlock(const coeff_t* coeffs,
+                                  JpegHistogram* ac_histogram) {
+  int zero_run = 0;
+  for (int k = 1; k < 64; ++k) {
+    const coeff_t value = coeffs[kJPEGNaturalOrder[k]];
+    if (value != 0) {
+      for (; zero_run > 15; zero_run -= 16) {
+        ac_histogram->Add(0xf0);
+      }
+      const int nbits = Log2FloorNonZero(std::abs(value)) + 1;
+      ac_histogram->Add((zero_run << 4) + nbits);
+      zero_run = 0;
+    } else {
+      ++zero_run;
+    }
+  }
+  if (zero_run > 0) {
+    ac_histogram->Add(0);
+  }
+}
+
+// Estimated size in bits of the Huffman table header for histo: a fixed 17
+// bytes plus one byte per symbol in use (the trailing fake symbol is ignored).
+size_t HistogramHeaderCost(const JpegHistogram& histo) {
+  size_t used_symbols = 0;
+  for (int sym = 0; sym < JpegHistogram::kSize - 1; ++sym) {
+    if (histo.counts[sym] != 0) ++used_symbols;
+  }
+  return 17 * 8 + 8 * used_symbols;
+}
+
+// Estimated number of bits needed to code the symbols counted in histo when
+// symbol i has code length depths[i] and is followed by (i & 0xf) extra bits.
+size_t HistogramEntropyCost(const JpegHistogram& histo,
+                            const uint8_t depths[256]) {
+  size_t total = 0;
+  for (int sym = 0; sym < JpegHistogram::kSize - 1; ++sym) {
+    const uint32_t occurrences = histo.counts[sym] / 2;  // Add() counts twice
+    total += occurrences * (depths[sym] + (sym & 0xf));
+  }
+  const size_t escapes = (total * 3 + 512) >> 10;  // escape rate of 0.75/256
+  return total + escapes;
+}
+
+// Fills histo[i] with the DC symbols of component i. Blocks are visited MCU by
+// MCU; the symbol of a block is the bit length of the absolute difference
+// between its DC coefficient and that of the previously visited block.
+void BuildDCHistograms(const JPEGData& jpg, JpegHistogram* histo) {
+  for (size_t comp = 0; comp < jpg.components.size(); ++comp) {
+    const JPEGComponent& c = jpg.components[comp];
+    coeff_t prev_dc = 0;
+    for (int mcu_y = 0; mcu_y < jpg.MCU_rows; ++mcu_y) {
+      for (int mcu_x = 0; mcu_x < jpg.MCU_cols; ++mcu_x) {
+        for (int iy = 0; iy < c.v_samp_factor; ++iy) {
+          const int block_y = mcu_y * c.v_samp_factor + iy;
+          for (int ix = 0; ix < c.h_samp_factor; ++ix) {
+            const int block_x = mcu_x * c.h_samp_factor + ix;
+            const int block_idx = block_y * c.width_in_blocks + block_x;
+            const coeff_t dc = c.coeffs[block_idx << 6];
+            histo[comp].Add(Log2Floor(std::abs(dc - prev_dc)) + 1);
+            prev_dc = dc;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Adds the AC symbols of every block of component i to histo[i].
+void BuildACHistograms(const JPEGData& jpg, JpegHistogram* histo) {
+  for (size_t comp = 0; comp < jpg.components.size(); ++comp) {
+    const JPEGComponent& c = jpg.components[comp];
+    for (int off = 0; off < c.coeffs.size(); off += kDCTBlockSize) {
+      UpdateACHistogramForDCTBlock(&c.coeffs[off], &histo[comp]);
+    }
+  }
+}
+
+// Size of everything except the Huffman codes and the entropy coded data, for
+// a stream produced by WriteJpeg() with the same strip_metadata value.
+size_t JpegHeaderSize(const JPEGData& jpg, bool strip_metadata) {
+  size_t num_bytes = 2;  // SOI
+  if (strip_metadata) {
+    num_bytes += 18;  // APP0
+  } else {
+    for (size_t i = 0; i < jpg.app_data.size(); ++i) {
+      num_bytes += 1 + jpg.app_data[i].size();
+    }
+    for (size_t i = 0; i < jpg.com_data.size(); ++i) {
+      num_bytes += 2 + jpg.com_data[i].size();
+    }
+    // WriteJpeg() emits tail_data only when the metadata is kept.
+    num_bytes += jpg.tail_data.size();
+  }
+  num_bytes += 4;  // DQT marker and length
+  for (size_t i = 0; i < jpg.quant.size(); ++i) {
+    num_bytes += 1 + (jpg.quant[i].precision ? 2 : 1) * kDCTBlockSize;
+  }
+  num_bytes += 10 + 3 * jpg.components.size();  // SOF
+  num_bytes += 4;  // DHT (w/o actual Huffman code data)
+  num_bytes += 8 + 2 * jpg.components.size();  // SOS
+  num_bytes += 2;  // EOI
+  return num_bytes;
+}
+
+// Computes code lengths (via CreateHuffmanTree) for each of the *num
+// histograms in histo and then merges histograms from the back: as long as
+// the last histogram and the one before it are estimated to cost more
+// separately than combined, the last one is folded into its predecessor and
+// *num shrinks by one. On return histo_indexes[i] is the index of the
+// histogram that input histogram i ended up in, and depth holds
+// JpegHistogram::kSize code lengths for each remaining histogram.
+// Returns the estimated cost of the remaining histograms (header plus entropy
+// coded symbols), rounded up to whole bytes.
+size_t ClusterHistograms(JpegHistogram* histo, size_t* num,
+                         int* histo_indexes, uint8_t* depth) {
+  memset(depth, 0, *num * JpegHistogram::kSize);
+  size_t costs[kMaxComponents];
+  for (size_t i = 0; i < *num; ++i) {
+    uint8_t* const depth_i = depth + i * JpegHistogram::kSize;
+    std::vector<HuffmanTree> tree(2 * JpegHistogram::kSize + 1);
+    CreateHuffmanTree(histo[i].counts, JpegHistogram::kSize,
+                      kJpegHuffmanMaxBitLength, &tree[0], depth_i);
+    costs[i] = HistogramHeaderCost(histo[i]) +
+               HistogramEntropyCost(histo[i], depth_i);
+    histo_indexes[i] = i;
+  }
+  const size_t orig_num = *num;
+  for (; *num > 1; --(*num)) {
+    const size_t src = *num - 1;  // histogram that would disappear
+    const size_t dst = *num - 2;  // histogram that would absorb it
+    JpegHistogram merged(histo[src]);
+    merged.AddHistogram(histo[dst]);
+    std::vector<HuffmanTree> tree(2 * JpegHistogram::kSize + 1);
+    uint8_t merged_depth[JpegHistogram::kSize] = { 0 };
+    CreateHuffmanTree(merged.counts, JpegHistogram::kSize,
+                      kJpegHuffmanMaxBitLength, &tree[0], merged_depth);
+    const size_t merged_cost = HistogramHeaderCost(merged) +
+                               HistogramEntropyCost(merged, merged_depth);
+    if (merged_cost >= costs[src] + costs[dst]) break;  // merging does not pay
+    histo[dst] = merged;
+    histo[src] = JpegHistogram();
+    costs[dst] = merged_cost;
+    memcpy(depth + dst * JpegHistogram::kSize, merged_depth,
+           sizeof(merged_depth));
+    for (size_t i = 0; i < orig_num; ++i) {
+      if (histo_indexes[i] == src) histo_indexes[i] = dst;
+    }
+  }
+  size_t total_cost = 0;
+  for (size_t i = 0; i < *num; ++i) total_cost += costs[i];
+  return (total_cost + 7) / 8;
+}
+
+size_t EstimateJpegDataSize(const int num_components,
+                            const std::vector<JpegHistogram>& histograms) {
+  // histograms holds num_components DC histograms, then as many AC histograms.
+  assert(histograms.size() == 2 * num_components);
+  std::vector<JpegHistogram> work(histograms);  // clustering edits its input
+  size_t dc_left = num_components, ac_left = num_components;
+  int indexes[kMaxComponents];
+  uint8_t depth[kMaxComponents * JpegHistogram::kSize];
+  const size_t dc_bytes = ClusterHistograms(&work[0], &dc_left, indexes, depth);
+  return dc_bytes +
+         ClusterHistograms(&work[num_components], &ac_left, indexes, depth);
+}
+
+namespace {
+
+// Writes DHT and SOS marker segments to out and fills in DC/AC Huffman tables
+// for each component of the image. Returns false if writing to out fails.
+bool BuildAndEncodeHuffmanCodes(const JPEGData& jpg, JPEGOutput out,
+                                std::vector<HuffmanCodeTable>* dc_huff_tables,
+                                std::vector<HuffmanCodeTable>* ac_huff_tables) {
+  const int ncomps = jpg.components.size();
+  dc_huff_tables->resize(ncomps);
+  ac_huff_tables->resize(ncomps);
+
+  // Build separate DC histograms for each component.
+  std::vector<JpegHistogram> histograms(ncomps);
+  BuildDCHistograms(jpg, &histograms[0]);
+
+  // Cluster DC histograms; num_dc_histo becomes the number of DC tables left.
+  size_t num_dc_histo = ncomps;
+  int dc_histo_indexes[kMaxComponents];
+  std::vector<uint8_t> depths(ncomps * JpegHistogram::kSize);
+  ClusterHistograms(&histograms[0], &num_dc_histo, dc_histo_indexes,
+                    &depths[0]);
+
+  // Build separate AC histograms for each component, right behind the DC ones.
+  histograms.resize(num_dc_histo + ncomps);
+  depths.resize((num_dc_histo + ncomps) * JpegHistogram::kSize);
+  BuildACHistograms(jpg, &histograms[num_dc_histo]);
+
+  // Cluster AC histograms; num_ac_histo becomes the number of AC tables left.
+  size_t num_ac_histo = ncomps;
+  int ac_histo_indexes[kMaxComponents];
+  ClusterHistograms(&histograms[num_dc_histo], &num_ac_histo, ac_histo_indexes,
+                    &depths[num_dc_histo * JpegHistogram::kSize]);
+
+  // Compute DHT and SOS marker data sizes and start emitting DHT marker.
+  int num_histo = num_dc_histo + num_ac_histo;
+  histograms.resize(num_histo);  // drops AC histograms emptied by clustering
+  int total_count = 0;
+  for (int i = 0; i < histograms.size(); ++i) {
+    total_count += histograms[i].NumSymbols();  // fake symbol is not counted
+  }
+  const size_t dht_marker_len =
+      2 + num_histo * (kJpegHuffmanMaxBitLength + 1) + total_count;
+  const size_t sos_marker_len = 6 + 2 * ncomps;
+  std::vector<uint8_t> data(dht_marker_len + sos_marker_len + 4);
+  size_t pos = 0;
+  data[pos++] = 0xff;
+  data[pos++] = 0xc4;
+  data[pos++] = dht_marker_len >> 8;
+  data[pos++] = dht_marker_len & 0xff;
+
+  // Compute the Huffman code for each histogram and append its DHT entry.
+  for (size_t i = 0; i < num_histo; ++i) {
+    const bool is_dc = i < num_dc_histo;
+    const int idx = is_dc ? i : i - num_dc_histo;  // index among DC or AC tables
+    int counts[kJpegHuffmanMaxBitLength + 1] = { 0 };
+    int values[JpegHistogram::kSize] = { 0 };
+    BuildHuffmanCode(&depths[i * JpegHistogram::kSize], counts, values);
+    HuffmanCodeTable table;
+    for (int j = 0; j < 256; ++j) table.depth[j] = 255;  // default if unset below
+    BuildHuffmanCodeTable(counts, values, &table);
+    for (int c = 0; c < ncomps; ++c) {  // hand the table to components using it
+      if (is_dc) {
+        if (dc_histo_indexes[c] == idx) (*dc_huff_tables)[c] = table;
+      } else {
+        if (ac_histo_indexes[c] == idx) (*ac_huff_tables)[c] = table;
+      }
+    }
+    int max_length = kJpegHuffmanMaxBitLength;
+    while (max_length > 0 && counts[max_length] == 0) --max_length;
+    --counts[max_length];  // keep the fake symbol (last, longest code) out
+    int total_count = 0;   // symbols to emit for this table (shadows the outer)
+    for (int j = 0; j <= max_length; ++j) total_count += counts[j];
+    data[pos++] = is_dc ? i : i - num_dc_histo + 0x10;  // 0x10 flags an AC table
+    for (size_t j = 1; j <= kJpegHuffmanMaxBitLength; ++j) {
+      data[pos++] = counts[j];
+    }
+    for (size_t j = 0; j < total_count; ++j) {
+      data[pos++] = values[j];
+    }
+  }
+
+  // Emit SOS marker data.
+  data[pos++] = 0xff;
+  data[pos++] = 0xda;
+  data[pos++] = sos_marker_len >> 8;
+  data[pos++] = sos_marker_len & 0xff;
+  data[pos++] = ncomps;
+  for (int i = 0; i < ncomps; ++i) {
+    data[pos++] = jpg.components[i].id;
+    data[pos++] = (dc_histo_indexes[i] << 4) | ac_histo_indexes[i];
+  }
+  data[pos++] = 0;   // start of spectral selection
+  data[pos++] = 63;  // end of spectral selection
+  data[pos++] = 0;   // successive approximation bit positions
+  assert(pos == data.size());
+  return JPEGWrite(out, &data[0], data.size());
+}
+
+// Writes one block of 64 coefficients to bw: first the DC difference against
+// *last_dc_coeff (which is then updated), then the run-length coded AC
+// coefficients in the order given by kJPEGNaturalOrder. The extra bits of a
+// negative value are the low nbits bits of (value - 1).
+void EncodeDCTBlockSequential(const coeff_t* coeffs,
+                              const HuffmanCodeTable& dc_huff,
+                              const HuffmanCodeTable& ac_huff,
+                              coeff_t* last_dc_coeff,
+                              BitWriter* bw) {
+  const coeff_t dc = coeffs[0];
+  coeff_t magnitude = dc - *last_dc_coeff;
+  *last_dc_coeff = dc;
+  coeff_t extra = magnitude;
+  if (magnitude < 0) {
+    magnitude = -magnitude;
+    extra--;
+  }
+  const int dc_nbits = Log2Floor(magnitude) + 1;
+  bw->WriteBits(dc_huff.depth[dc_nbits], dc_huff.code[dc_nbits]);
+  if (dc_nbits > 0) bw->WriteBits(dc_nbits, extra & ((1 << dc_nbits) - 1));
+  int zero_run = 0;
+  for (int k = 1; k < 64; ++k) {
+    magnitude = coeffs[kJPEGNaturalOrder[k]];
+    if (magnitude == 0) {
+      ++zero_run;
+      continue;
+    }
+    if (magnitude < 0) {
+      magnitude = -magnitude;
+      extra = ~magnitude;
+    } else {
+      extra = magnitude;
+    }
+    for (; zero_run > 15; zero_run -= 16) {
+      bw->WriteBits(ac_huff.depth[0xf0], ac_huff.code[0xf0]);
+    }
+    const int nbits = Log2FloorNonZero(magnitude) + 1;
+    const int symbol = (zero_run << 4) + nbits;
+    bw->WriteBits(ac_huff.depth[symbol], ac_huff.code[symbol]);
+    bw->WriteBits(nbits, extra & ((1 << nbits) - 1));
+    zero_run = 0;
+  }
+  if (zero_run > 0) {
+    bw->WriteBits(ac_huff.depth[0], ac_huff.code[0]);
+  }
+}
+
+// Entropy-codes the MCUs of jpg row by row, coding the blocks of every
+// component within each MCU, and passes the bytes to out once bw.pos exceeds
+// 1 << 16 after an MCU. Returns false on a failed write or on bw.overflow.
+bool EncodeScan(const JPEGData& jpg,
+                const std::vector<HuffmanCodeTable>& dc_huff_table,
+                const std::vector<HuffmanCodeTable>& ac_huff_table,
+                JPEGOutput out) {
+  coeff_t prev_dc[kMaxComponents] = { 0 };
+  BitWriter bw(1 << 17);
+  for (int mcu_y = 0; mcu_y < jpg.MCU_rows; ++mcu_y) {
+    for (int mcu_x = 0; mcu_x < jpg.MCU_cols; ++mcu_x) {
+      // Encode one MCU.
+      for (size_t comp = 0; comp < jpg.components.size(); ++comp) {
+        const JPEGComponent& c = jpg.components[comp];
+        const int nblocks_y = c.v_samp_factor;
+        const int nblocks_x = c.h_samp_factor;
+        for (int iy = 0; iy < nblocks_y; ++iy) {
+          for (int ix = 0; ix < nblocks_x; ++ix) {
+            const int block_y = mcu_y * nblocks_y + iy;
+            const int block_x = mcu_x * nblocks_x + ix;
+            const int block_idx = block_y * c.width_in_blocks + block_x;
+            EncodeDCTBlockSequential(&c.coeffs[block_idx << 6],
+                                     dc_huff_table[comp], ac_huff_table[comp],
+                                     &prev_dc[comp], &bw);
+          }
+        }
+      }
+      if (bw.pos <= (1 << 16)) continue;
+      if (!JPEGWrite(out, bw.data.get(), bw.pos)) return false;
+      bw.pos = 0;
+    }
+  }
+  bw.JumpToByteBoundary();
+  return !bw.overflow && JPEGWrite(out, bw.data.get(), bw.pos);
+}
+
+}  // namespace
+
+// Writes jpg as a complete jpeg stream to out; stops at the first failing step.
+bool WriteJpeg(const JPEGData& jpg, bool strip_metadata, JPEGOutput out) {
+  static const uint8_t kSOIMarker[2] = { 0xff, 0xd8 };
+  static const uint8_t kEOIMarker[2] = { 0xff, 0xd9 };
+  std::vector<HuffmanCodeTable> dc_codes, ac_codes;
+  if (!JPEGWrite(out, kSOIMarker, sizeof(kSOIMarker))) return false;
+  if (!EncodeMetadata(jpg, strip_metadata, out)) return false;
+  if (!EncodeDQT(jpg.quant, out)) return false;
+  if (!EncodeSOF(jpg, out)) return false;
+  if (!BuildAndEncodeHuffmanCodes(jpg, out, &dc_codes, &ac_codes)) return false;
+  if (!EncodeScan(jpg, dc_codes, ac_codes, out)) return false;
+  if (!JPEGWrite(out, kEOIMarker, sizeof(kEOIMarker))) return false;
+  return strip_metadata || JPEGWrite(out, jpg.tail_data);  // tail only if kept
+}
+
+int NullOut(void* data, const uint8_t* buf, size_t count) {
+  return static_cast<int>(count);  // discards the bytes, reports all as written
+}
+
+// Computes the per-component DC/AC Huffman code tables for jpg; the DHT/SOS
+// bytes produced along the way are swallowed by NullOut.
+void BuildSequentialHuffmanCodes(
+    const JPEGData& jpg, std::vector<HuffmanCodeTable>* dc_huffman_code_tables,
+    std::vector<HuffmanCodeTable>* ac_huffman_code_tables) {
+  BuildAndEncodeHuffmanCodes(jpg, JPEGOutput(NullOut, nullptr),
+                             dc_huffman_code_tables, ac_huffman_code_tables);
+}
+
+}  // namespace guetzli
diff --git a/guetzli/jpeg_data_writer.h b/guetzli/jpeg_data_writer.h
new file mode 100755
index 00000000..941118b4
--- /dev/null
+++ b/guetzli/jpeg_data_writer.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Functions for writing a JPEGData object into a jpeg byte stream.
+
+#ifndef GUETZLI_JPEG_DATA_WRITER_H_
+#define GUETZLI_JPEG_DATA_WRITER_H_
+
+#include <stdint.h>
+#include <string.h>
+#include <vector>
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// Function pointer type used to write the len bytes at buf to an output; data
+// is the pointer given to JPEGOutput. Returns the number of bytes written or
+// -1 on error.
+typedef int (*JPEGOutputHook)(void* data, const uint8_t* buf, size_t len);
+
+// Output hook bundled with the opaque pointer that is handed back to it.
+struct JPEGOutput {
+  JPEGOutput(JPEGOutputHook hook, void* hook_data) : cb(hook), data(hook_data) {}
+  bool Write(const uint8_t* buf, size_t len) const {  // len == 0: cb not called
+    return len == 0 || cb(data, buf, len) == len;
+  }
+ private:
+  JPEGOutputHook cb;
+  void* data;
+};
+
+bool WriteJpeg(const JPEGData& jpg, bool strip_metadata, JPEGOutput out);  // false on failure
+
+struct HuffmanCodeTable {  // Huffman code of each symbol, indexed by symbol
+  uint8_t depth[256];      // code length in bits
+  int code[256];           // code bits
+};
+
+void BuildSequentialHuffmanCodes(  // fills one DC and one AC table per component
+    const JPEGData& jpg, std::vector<HuffmanCodeTable>* dc_huffman_code_tables,
+    std::vector<HuffmanCodeTable>* ac_huffman_code_tables);
+
+// Symbol counts used to build a Huffman code. Add() counts every symbol twice
+// while the extra last slot always holds 1, which keeps that fake symbol the
+// least frequent one.
+struct JpegHistogram {
+  static const int kSize = kJpegHuffmanAlphabetSize + 1;
+
+  JpegHistogram() { Clear(); }
+  // Zeroes all real symbols and restores the count of the fake one.
+  void Clear() {
+    memset(counts, 0, sizeof(counts));
+    counts[kSize - 1] = 1;
+  }
+  void Add(int symbol) { counts[symbol] += 2; }
+  void Add(int symbol, int weight) { counts[symbol] += 2 * weight; }
+  // Adds the real-symbol counts of other; the fake symbol stays at 1.
+  void AddHistogram(const JpegHistogram& other) {
+    for (int sym = 0; sym < kSize - 1; ++sym) counts[sym] += other.counts[sym];
+    counts[kSize - 1] = 1;
+  }
+  // Number of real symbols that occur at least once.
+  int NumSymbols() const {
+    int used = 0;
+    for (int sym = 0; sym < kSize - 1; ++sym) {
+      if (counts[sym] != 0) ++used;
+    }
+    return used;
+  }
+
+  uint32_t counts[kSize];
+};
+
+void BuildDCHistograms(const JPEGData& jpg, JpegHistogram* histo);
+void BuildACHistograms(const JPEGData& jpg, JpegHistogram* histo);
+size_t JpegHeaderSize(const JPEGData& jpg, bool strip_metadata);  // in bytes
+size_t EstimateJpegDataSize(const int num_components,
+                            const std::vector<JpegHistogram>& histograms);
+
+size_t HistogramEntropyCost(const JpegHistogram& histo,
+                            const uint8_t depths[256]);  // in bits
+size_t HistogramHeaderCost(const JpegHistogram& histo);  // in bits
+
+void UpdateACHistogramForDCTBlock(const coeff_t* coeffs,
+                                  JpegHistogram* ac_histogram);
+
+size_t ClusterHistograms(JpegHistogram* histo, size_t* num, int* histo_indexes,
+                         uint8_t* depths);  // returns the estimated bytes
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_DATA_WRITER_H_
diff --git a/guetzli/jpeg_error.h b/guetzli/jpeg_error.h
new file mode 100755
index 00000000..94cb9be9
--- /dev/null
+++ b/guetzli/jpeg_error.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Definition of error codes for parsing jpeg files.
+
+#ifndef GUETZLI_JPEG_ERROR_H_
+#define GUETZLI_JPEG_ERROR_H_
+
+namespace guetzli {
+
+enum JPEGReadError {  // error codes for parsing jpeg files
+  JPEG_OK = 0,
+  JPEG_SOI_NOT_FOUND,
+  JPEG_SOF_NOT_FOUND,
+  JPEG_UNEXPECTED_EOF,
+  JPEG_MARKER_BYTE_NOT_FOUND,
+  JPEG_UNSUPPORTED_MARKER,
+  JPEG_WRONG_MARKER_SIZE,
+  JPEG_INVALID_PRECISION,
+  JPEG_INVALID_WIDTH,
+  JPEG_INVALID_HEIGHT,
+  JPEG_INVALID_NUMCOMP,
+  JPEG_INVALID_SAMP_FACTOR,
+  JPEG_INVALID_START_OF_SCAN,
+  JPEG_INVALID_END_OF_SCAN,
+  JPEG_INVALID_SCAN_BIT_POSITION,
+  JPEG_INVALID_COMPS_IN_SCAN,
+  JPEG_INVALID_HUFFMAN_INDEX,
+  JPEG_INVALID_QUANT_TBL_INDEX,
+  JPEG_INVALID_QUANT_VAL,
+  JPEG_INVALID_MARKER_LEN,
+  JPEG_INVALID_SAMPLING_FACTORS,
+  JPEG_INVALID_HUFFMAN_CODE,
+  JPEG_INVALID_SYMBOL,
+  JPEG_NON_REPRESENTABLE_DC_COEFF,
+  JPEG_NON_REPRESENTABLE_AC_COEFF,
+  JPEG_INVALID_SCAN,
+  JPEG_OVERLAPPING_SCANS,
+  JPEG_INVALID_SCAN_ORDER,
+  JPEG_EXTRA_ZERO_RUN,
+  JPEG_DUPLICATE_DRI,
+  JPEG_DUPLICATE_SOF,
+  JPEG_WRONG_RESTART_MARKER,
+  JPEG_DUPLICATE_COMPONENT_ID,
+  JPEG_COMPONENT_NOT_FOUND,
+  JPEG_HUFFMAN_TABLE_NOT_FOUND,
+  JPEG_HUFFMAN_TABLE_ERROR,
+  JPEG_QUANT_TABLE_NOT_FOUND,
+  JPEG_EMPTY_DHT,
+  JPEG_EMPTY_DQT,
+  JPEG_OUT_OF_BAND_COEFF,
+  JPEG_EOB_RUN_TOO_LONG,
+  JPEG_IMAGE_TOO_LARGE,
+};
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_ERROR_H_
diff --git a/guetzli/jpeg_huffman_decode.cc b/guetzli/jpeg_huffman_decode.cc
new file mode 100755
index 00000000..e5378a65
--- /dev/null
+++ b/guetzli/jpeg_huffman_decode.cc
@@ -0,0 +1,122 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/jpeg_huffman_decode.h"
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// Returns the index width, in bits, of the next 2nd level table. count is the
+// histogram of bit lengths for the remaining symbols and len is the code
+// length of the next processed symbol.
+static inline int NextTableBitSize(const int* count, int len) {
+  // `left` is the number of table slots not yet claimed at the current length;
+  // the table stops growing once the symbols of that length use them all up.
+  int left = 1 << (len - kJpegHuffmanRootTableBits);
+  for (; len < kJpegHuffmanMaxBitLength; ++len, left <<= 1) {
+    left -= count[len];
+    if (left <= 0) break;
+  }
+  return len - kJpegHuffmanRootTableBits;
+}
+
+// Builds in lut a two-level lookup table for the Huffman code described by
+// count_in (count_in[n] = number of codes of bit length n) and symbols (the
+// symbols in order of increasing code length).
+// The first 1 << kJpegHuffmanRootTableBits entries of lut form the root
+// table; codes longer than kJpegHuffmanRootTableBits go through 2nd level
+// tables stored right behind it.
+// Returns the total number of lut entries used.
+int BuildJpegHuffmanTable(const int* count_in, const int* symbols,
+                          HuffmanTableEntry* lut) {
+  // Make a local copy of the input bit length histogram; it is consumed below.
+  int count[kJpegHuffmanMaxBitLength + 1] = { 0 };
+  int total_count = 0;
+  for (int len = 1; len <= kJpegHuffmanMaxBitLength; ++len) {
+    count[len] = count_in[len];
+    total_count += count[len];
+  }
+
+  const int root_size = 1 << kJpegHuffmanRootTableBits;
+  HuffmanTableEntry code;  // entry that gets replicated into the table
+
+  // Special case code with only one value: every root entry maps to that
+  // symbol with a bit count of 0.
+  if (total_count == 1) {
+    code.bits = 0;
+    code.value = symbols[0];
+    for (int key = 0; key < root_size; ++key) {
+      lut[key] = code;
+    }
+    return root_size;
+  }
+
+  // Fill in root table. A code of length len takes up
+  // 1 << (kJpegHuffmanRootTableBits - len) consecutive root entries.
+  int key = 0;  // next free root table entry
+  int idx = 0;  // next entry of symbols to place
+  for (int len = 1; len <= kJpegHuffmanRootTableBits; ++len) {
+    for (; count[len] > 0; --count[len]) {
+      code.bits = len;
+      code.value = symbols[idx++];
+      const int reps = 1 << (kJpegHuffmanRootTableBits - len);
+      for (int r = 0; r < reps; ++r) {
+        lut[key++] = code;
+      }
+    }
+  }
+
+  // Fill in 2nd level tables and add pointers to root table.
+  HuffmanTableEntry* table = lut + root_size;  // current 2nd level table
+  int table_bits = kJpegHuffmanRootTableBits;
+  int table_size = 0;
+  int total_size = root_size;
+  int low = 0;  // next free entry of the current 2nd level table
+  for (int len = kJpegHuffmanRootTableBits + 1;
+       len <= kJpegHuffmanMaxBitLength; ++len) {
+    for (; count[len] > 0; --count[len]) {
+      // Start a new sub-table if the previous one is full.
+      if (low >= table_size) {
+        table += table_size;
+        table_bits = NextTableBitSize(count, len);
+        table_size = 1 << table_bits;
+        total_size += table_size;
+        low = 0;
+        // The root entry stores the combined index width and the offset from
+        // itself to the start of the new sub-table.
+        lut[key].bits = table_bits + kJpegHuffmanRootTableBits;
+        lut[key].value = (table - lut) - key;
+        ++key;
+      }
+      code.bits = len - kJpegHuffmanRootTableBits;
+      code.value = symbols[idx++];
+      const int reps = 1 << (table_bits - code.bits);
+      for (int r = 0; r < reps; ++r) {
+        table[low++] = code;
+      }
+    }
+  }
+
+  return total_size;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/jpeg_huffman_decode.h b/guetzli/jpeg_huffman_decode.h
new file mode 100755
index 00000000..0deefa8f
--- /dev/null
+++ b/guetzli/jpeg_huffman_decode.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Utility function for building a Huffman lookup table for the jpeg decoder.
+
+#ifndef GUETZLI_JPEG_HUFFMAN_DECODE_H_
+#define GUETZLI_JPEG_HUFFMAN_DECODE_H_
+
+#include <inttypes.h>
+
+namespace guetzli {
+
+static const int kJpegHuffmanRootTableBits = 8;  // index width of the root table
+// Maximum huffman lookup table size.
+// According to zlib/examples/enough.c, 758 entries are always enough for
+// an alphabet of 257 symbols (256 + 1 special symbol for the all 1s code) and
+// max bit length 16 if the root table has 8 bits.
+static const int kJpegHuffmanLutSize = 758;
+
+struct HuffmanTableEntry {
+  // The value starts out as an invalid symbol (0xffff) so that it can be
+  // recognized when reading the bit stream using a Huffman code with space > 0.
+  HuffmanTableEntry() { bits = 0; value = 0xffff; }
+
+  uint8_t bits;     // number of bits used for this symbol
+  uint16_t value;   // symbol value or table offset
+};
+
+// Builds a jpeg-style Huffman lookup table in lut from the given symbols.
+// The symbols are in order of increasing bit lengths. The number of symbols
+// with bit length n is given in counts[n] for each n >= 1.
+// Returns the number of lut entries that make up the lookup table.
+int BuildJpegHuffmanTable(const int* counts, const int* symbols,
+                          HuffmanTableEntry* lut);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_JPEG_HUFFMAN_DECODE_H_
diff --git a/guetzli/order.inc b/guetzli/order.inc
new file mode 100755
index 00000000..f65fdfc3
--- /dev/null
+++ b/guetzli/order.inc
@@ -0,0 +1,391 @@
+// Automatically generated by guetzli/order:update_c_code
+
+// 192 coefficients, listed as three consecutive runs of 64 values.
+// NOTE(review): presumably one 8x8 table per color component -- confirm
+// against the generator.
+static const float csf[192] = {
+  // Entries 0..63.
+  0.0, 1.71014, 0.298711, 0.233709,
+  0.223126, 0.207072, 0.192775, 0.161201,
+  2.05807, 0.222927, 0.203406, 0.188465,
+  0.184668, 0.169993, 0.159142, 0.130155,
+  0.430518, 0.204939, 0.206655, 0.192231,
+  0.182941, 0.169455, 0.157599, 0.127153,
+  0.234757, 0.191098, 0.192698, 0.17425,
+  0.166503, 0.142154, 0.126182, 0.104196,
+  0.226117, 0.185373, 0.183825, 0.166643,
+  0.159414, 0.12636, 0.108696, 0.0911974,
+  0.207463, 0.171517, 0.170124, 0.141582,
+  0.126213, 0.103627, 0.0882436, 0.0751848,
+  0.196436, 0.161947, 0.159271, 0.126938,
+  0.109125, 0.0878027, 0.0749842, 0.0633859,
+  0.165232, 0.132905, 0.128679, 0.105766,
+  0.0906087, 0.0751544, 0.0641187, 0.0529921,
+  // Entries 64..127.
+  0.0, 0.147235, 0.11264, 0.0757892,
+  0.0493929, 0.0280663, 0.0075012, -0.000945567,
+  0.149251, 0.0964806, 0.0786224, 0.05206,
+  0.0292758, 0.00353094, -0.00277912, -0.00404481,
+  0.115551, 0.0793142, 0.0623735, 0.0405019,
+  0.0152656, -0.00145742, -0.00370369, -0.00375106,
+  0.0791547, 0.0537506, 0.0413634, 0.0193486,
+  0.000609066, -0.00510923, -0.0046452, -0.00385187,
+  0.0544534, 0.0334066, 0.0153899, 0.000539088,
+  -0.00356085, -0.00535661, -0.00429145, -0.00343131,
+  0.0356439, 0.00865645, 0.00165229, -0.00425931,
+  -0.00507324, -0.00459083, -0.003703, -0.00310327,
+  0.0121926, -0.0009259, -0.00330991, -0.00499378,
+  -0.00437381, -0.00377427, -0.00311731, -0.00255125,
+  -0.000320593, -0.00426043, -0.00416549, -0.00419364,
+  -0.00365418, -0.00317499, -0.00255932, -0.00217917,
+  // Entries 128..191.
+  0.0, 0.143471, 0.124336, 0.0947465,
+  0.0814066, 0.0686776, 0.0588122, 0.0374415,
+  0.146315, 0.105334, 0.0949415, 0.0784241,
+  0.0689064, 0.0588304, 0.0495961, 0.0202342,
+  0.123818, 0.0952654, 0.0860556, 0.0724158,
+  0.0628307, 0.0529965, 0.0353941, 0.00815821,
+  0.097054, 0.080422, 0.0731085, 0.0636154,
+  0.055606, 0.0384127, 0.0142879, 0.00105195,
+  0.0849312, 0.071115, 0.0631183, 0.0552972,
+  0.0369221, 0.00798314, 0.000716374, -0.00200948,
+  0.0722298, 0.0599559, 0.054841, 0.0387529,
+  0.0107262, 0.000355315, -0.00244803, -0.00335222,
+  0.0635335, 0.0514196, 0.0406309, 0.0125833,
+  0.00151305, -0.00140269, -0.00362547, -0.00337649,
+  0.0472024, 0.0198725, 0.0113437, 0.00266305,
+  -0.00137183, -0.00354158, -0.00341292, -0.00290074
+};
+
+// All 192 bias values are zero: the first element is given explicitly and the
+// remaining elements are zero-initialized.
+static const float bias[192] = {
+  0.0
+};
diff --git a/guetzli/output_image.cc b/guetzli/output_image.cc
new file mode 100755
index 00000000..e04022f2
--- /dev/null
+++ b/guetzli/output_image.cc
@@ -0,0 +1,449 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/output_image.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <algorithm>
+#include <cmath>
+
+#include "guetzli/idct.h"
+#include "guetzli/color_transform.h"
+#include "guetzli/dct_double.h"
+#include "guetzli/gamma_correct.h"
+#include "guetzli/preprocess_downsample.h"
+#include "guetzli/quantize.h"
+
+namespace guetzli {
+
+// Creates a component of w x h pixels that starts out without subsampling
+// (both sampling factors are 1).
+OutputImageComponent::OutputImageComponent(int w, int h)
+    : width_(w),
+      height_(h) {
+  const int kNoSubsampling = 1;
+  Reset(kNoSubsampling, kNoSubsampling);
+}
+
+// Switches the component to the given sampling factors and clears its
+// contents: all coefficients become zero, all pixels become 128 (stored with
+// 4 fractional bits) and the quantization table becomes all 1s.
+void OutputImageComponent::Reset(int factor_x, int factor_y) {
+  factor_x_ = factor_x;
+  factor_y_ = factor_y;
+
+  // One coefficient block covers (8 * factor) pixels in each direction; round
+  // up so that partially covered blocks at the right/bottom edge are counted.
+  const int block_width = 8 * factor_x_;
+  const int block_height = 8 * factor_y_;
+  width_in_blocks_ = (width_ + block_width - 1) / block_width;
+  height_in_blocks_ = (height_ + block_height - 1) / block_height;
+  num_blocks_ = width_in_blocks_ * height_in_blocks_;
+
+  const int num_coeffs = num_blocks_ * kDCTBlockSize;
+  const int num_pixels = width_ * height_;
+  coeffs_ = std::vector<coeff_t>(num_coeffs);
+  pixels_ = std::vector<uint16_t>(num_pixels, 128 << 4);
+  for (int k = 0; k < kDCTBlockSize; ++k) {
+    quant_[k] = 1;
+  }
+}
+
+// Returns true if none of the component's DCT coefficients is non-zero.
+bool OutputImageComponent::IsAllZero() const {
+  const int total = num_blocks_ * kDCTBlockSize;
+  int k = 0;
+  // Skip over the leading run of zero coefficients.
+  while (k < total && coeffs_[k] == 0) {
+    ++k;
+  }
+  return k >= total;
+}
+
+// Copies the kDCTBlockSize coefficients of block (block_x, block_y) into
+// block[]. Blocks are stored row by row, width_in_blocks_ blocks per row.
+void OutputImageComponent::GetCoeffBlock(int block_x, int block_y,
+                                         coeff_t block[kDCTBlockSize]) const {
+  assert(block_x < width_in_blocks_);
+  assert(block_y < height_in_blocks_);
+  const int block_index = block_y * width_in_blocks_ + block_x;
+  const coeff_t* stored = &coeffs_[block_index * kDCTBlockSize];
+  memcpy(block, stored, kDCTBlockSize * sizeof(*stored));
+}
+
+// Renders the window [xmin, xmin + xsize) x [ymin, ymin + ysize) of this
+// component as 8-bit samples. Samples are written row by row, and out advances
+// by stride after each sample. Window positions to the right of the image
+// repeat the previously written sample; window rows below the image repeat
+// the previously written row.
+void OutputImageComponent::ToPixels(int xmin, int ymin, int xsize, int ysize,
+                                    uint8_t* out, int stride) const {
+  assert(xmin >= 0);
+  assert(ymin >= 0);
+  assert(xmin < width_);
+  assert(ymin < height_);
+  const int window_xend = xmin + xsize;
+  const int window_yend = ymin + ysize;
+  const int image_xend = std::min(window_xend, width_);
+  const int image_yend = std::min(window_yend, height_);
+
+  int y = ymin;
+  while (y < image_yend) {
+    int x = xmin;
+    int px = y * width_ + xmin;
+    // Part of the row inside the image: drop the 4 fractional bits, rounding
+    // with an offset of 8 on even columns and 7 on odd columns.
+    while (x < image_xend) {
+      *out = static_cast<uint8_t>((pixels_[px] + 8 - (x & 1)) >> 4);
+      out += stride;
+      ++x;
+      ++px;
+    }
+    // Part of the row right of the image: repeat the previous sample.
+    while (x < window_xend) {
+      *out = out[-stride];
+      out += stride;
+      ++x;
+    }
+    ++y;
+  }
+
+  // Rows below the image: repeat the row that was written just before.
+  const int row_back = -stride * xsize;
+  while (y < window_yend) {
+    for (int x = 0; x < xsize; ++x) {
+      *out = out[row_back];
+      out += stride;
+    }
+    ++y;
+  }
+}
+
+// Writes the floating-point pixel view of the component: every block is run
+// through the double-precision inverse DCT and 128 is added to each sample.
+// The sample at (x, y) is stored in out[(y * width + x) * stride]; block
+// samples that fall outside the image are dropped.
+// REQUIRES: the component is not subsampled.
+void OutputImageComponent::ToFloatPixels(float* out, int stride) const {
+  assert(factor_x_ == 1);
+  assert(factor_y_ == 1);
+  for (int block_y = 0; block_y < height_in_blocks_; ++block_y) {
+    const int y0 = block_y * 8;
+    for (int block_x = 0; block_x < width_in_blocks_; ++block_x) {
+      const int x0 = block_x * 8;
+      coeff_t coeffs[kDCTBlockSize];
+      GetCoeffBlock(block_x, block_y, coeffs);
+      double samples[kDCTBlockSize];
+      for (int k = 0; k < kDCTBlockSize; ++k) {
+        samples[k] = coeffs[k];
+      }
+      ComputeBlockIDCTDouble(samples);
+      // Only visit the part of the block that lies inside the image.
+      for (int iy = 0; iy < 8 && y0 + iy < height_; ++iy) {
+        const int row_start = (y0 + iy) * width_;
+        for (int ix = 0; ix < 8 && x0 + ix < width_; ++ix) {
+          out[(row_start + x0 + ix) * stride] = samples[8 * iy + ix] + 128.0;
+        }
+      }
+    }
+  }
+}
+
+// Stores block[] as the coefficients of block (block_x, block_y) and brings
+// the cached pixels up to date with the inverse DCT of the new coefficients.
+void OutputImageComponent::SetCoeffBlock(int block_x, int block_y,
+                                         const coeff_t block[kDCTBlockSize]) {
+  assert(block_x < width_in_blocks_);
+  assert(block_y < height_in_blocks_);
+  const int block_index = block_y * width_in_blocks_ + block_x;
+  coeff_t* stored = &coeffs_[block_index * kDCTBlockSize];
+  memcpy(stored, block, kDCTBlockSize * sizeof(*stored));
+
+  uint8_t samples[kDCTBlockSize];
+  ComputeBlockIDCT(stored, samples);
+  UpdatePixelsForBlock(block_x, block_y, samples);
+}
+
+// Refreshes the cached pixels_ after block (block_x, block_y) has changed.
+// idct[] holds the 8x8 inverse-DCT output of that block. Pixels are stored
+// with 4 fractional bits (sample << 4).
+//   - Without subsampling the 64 samples are copied to their pixel positions.
+//   - With 2x2 subsampling the block is upsampled onto the 16x16 pixel area it
+//     covers plus a one-pixel border around it (clipped to the image).
+//   - Any other sampling ratio prints an error message and exits the process.
+void OutputImageComponent::UpdatePixelsForBlock(
+    int block_x, int block_y, const uint8_t idct[kDCTBlockSize]) {
+  if (factor_x_ == 1 && factor_y_ == 1) {
+    for (int iy = 0; iy < 8; ++iy) {
+      for (int ix = 0; ix < 8; ++ix) {
+        int x = 8 * block_x + ix;
+        int y = 8 * block_y + iy;
+        // Samples of edge blocks that lie outside the image are dropped.
+        if (x >= width_ || y >= height_) continue;
+        int p = y * width_ + x;
+        pixels_[p] = idct[8 * iy + ix] << 4;
+      }
+    }
+  } else if (factor_x_ == 2 && factor_y_ == 2) {
+    // Fill in the 10x10 pixel area in the subsampled image that will be the
+    // basis of the upsampling. This area is enough to hold the 3x3 kernel of
+    // the fancy upsampler around each pixel.
+    static const int kSubsampledEdgeSize = 10;
+    uint16_t subsampled[kSubsampledEdgeSize * kSubsampledEdgeSize];
+    for (int j = 0; j < kSubsampledEdgeSize; ++j) {
+      // The order we fill in the rows is:
+      //   8 rows intersecting the block, row below, row above
+      // (j == 8 is the row below, j == 9 the row above, stored as row 0).
+      const int y0 = block_y * 16 + (j < 9 ? j * 2 : -2);
+      for (int i = 0; i < kSubsampledEdgeSize; ++i) {
+        // The order we fill in each row is:
+        //   8 pixels within the block, right edge, left edge
+        // (i == 8 is the right edge, i == 9 the left edge, stored as
+        // column 0).
+        const int ix = ((j < 9 ? (j + 1) * kSubsampledEdgeSize : 0) +
+                        (i < 9 ? i + 1 : 0));
+        const int x0 = block_x * 16 + (i < 9 ? i * 2 : -2);
+        // Border positions outside the image replicate the neighboring entry
+        // that lies towards the inside of the 10x10 area.
+        if (x0 < 0) {
+          subsampled[ix] = subsampled[ix + 1];
+        } else if (y0 < 0) {
+          subsampled[ix] = subsampled[ix + kSubsampledEdgeSize];
+        } else if (x0 >= width_) {
+          subsampled[ix] = subsampled[ix - 1];
+        } else if (y0 >= height_) {
+          subsampled[ix] = subsampled[ix - kSubsampledEdgeSize];
+        } else if (i < 8 && j < 8) {
+          // Inside the block: take the new sample, with 4 fractional bits.
+          subsampled[ix] = idct[j * 8 + i] << 4;
+        } else {
+          // Reconstruct the subsampled pixels around the edge of the current
+          // block by computing the inverse of the fancy upsampler.
+          const int y1 = std::max(y0 - 1, 0);
+          const int x1 = std::max(x0 - 1, 0);
+          subsampled[ix] = (pixels_[y0 * width_ + x0] * 9 +
+                            pixels_[y1 * width_ + x1] +
+                            pixels_[y0 * width_ + x1] * -3 +
+                            pixels_[y1 * width_ + x0] * -3) >> 2;
+        }
+      }
+    }
+
+    // Determine area to update.
+    int xmin = std::max(block_x * 16 - 1, 0);
+    int xmax = std::min(block_x * 16 + 16, width_ - 1);
+    int ymin = std::max(block_y * 16 - 1, 0);
+    int ymax = std::min(block_y * 16 + 16, height_ - 1);
+
+    // Apply the fancy upsampler on the subsampled block.
+    for (int y = ymin; y <= ymax; ++y) {
+      // y0 is the offset of the subsampled row containing pixel row y (the
+      // +1 skips the border row); dy steps to the row above for even y and to
+      // the row below for odd y.
+      const int y0 = ((y & ~1) / 2 - block_y * 8 + 1) * kSubsampledEdgeSize;
+      const int dy = ((y & 1) * 2 - 1) * kSubsampledEdgeSize;
+      uint16_t* rowptr = &pixels_[y * width_];
+      for (int x = xmin; x <= xmax; ++x) {
+        // Same scheme horizontally: dx is -1 for even x and +1 for odd x.
+        const int x0 = (x & ~1) / 2 - block_x * 8 + 1;
+        const int dx = (x & 1) * 2 - 1;
+        const int ix = x0 + y0;
+        // Weights 9, 3, 3, 1 sum to 16, hence the shift by 4.
+        rowptr[x] = (subsampled[ix] * 9 + subsampled[ix + dy] * 3 +
+                     subsampled[ix + dx] * 3 + subsampled[ix + dx + dy]) >> 4;
+      }
+    }
+  } else {
+    printf("Sampling ratio not supported: factor_x = %d factor_y = %d\n",
+           factor_x_, factor_y_);
+    exit(1);
+  }
+}
+
+// Resets this component to the given sampling factors and loads it from comp:
+// every coefficient of comp is multiplied by the matching entry of quant[]
+// before it is stored, and quant[] is remembered as this component's
+// quantization table. comp may have more blocks per row/column than this
+// component; the extra blocks are ignored.
+void OutputImageComponent::CopyFromJpegComponent(const JPEGComponent& comp,
+                                                 int factor_x, int factor_y,
+                                                 const int* quant) {
+  Reset(factor_x, factor_y);
+  assert(width_in_blocks_ <= comp.width_in_blocks);
+  assert(height_in_blocks_ <= comp.height_in_blocks);
+  const size_t src_row_size = comp.width_in_blocks * kDCTBlockSize;
+  coeff_t dequantized[kDCTBlockSize];
+  for (int by = 0; by < height_in_blocks_; ++by) {
+    for (int bx = 0; bx < width_in_blocks_; ++bx) {
+      const coeff_t* src =
+          &comp.coeffs[by * src_row_size + bx * kDCTBlockSize];
+      for (int k = 0; k < kDCTBlockSize; ++k) {
+        dequantized[k] = src[k] * quant[k];
+      }
+      SetCoeffBlock(bx, by, dequantized);
+    }
+  }
+  memcpy(quant_, quant, sizeof(quant_));
+}
+
+// Runs QuantizeBlock() with table q over every block of the component, stores
+// back the blocks for which it returns true, and records q as the component's
+// quantization table.
+void OutputImageComponent::ApplyGlobalQuantization(const int q[kDCTBlockSize]) {
+  coeff_t scratch[kDCTBlockSize];
+  for (int by = 0; by < height_in_blocks_; ++by) {
+    for (int bx = 0; bx < width_in_blocks_; ++bx) {
+      GetCoeffBlock(bx, by, scratch);
+      if (!QuantizeBlock(scratch, q)) continue;
+      SetCoeffBlock(bx, by, scratch);
+    }
+  }
+  memcpy(quant_, q, sizeof(quant_));
+}
+
+// Creates an image of w x h pixels with three components, each of them a
+// copy of a freshly constructed w x h OutputImageComponent.
+OutputImage::OutputImage(int w, int h)
+    : width_(w),
+      height_(h),
+      components_(3, OutputImageComponent(w, h)) {}
+
+// Loads every component of jpg into the output component with the same index.
+// The subsampling factor of a component is the ratio between the maximum
+// sampling factor of jpg and the component's own sampling factor; its
+// coefficients are dequantized with the quantization table it refers to.
+// REQUIRES: jpg has at most as many components as this image (3).
+void OutputImage::CopyFromJpegData(const JPEGData& jpg) {
+  // components_ has a fixed number of entries; never index past its end.
+  assert(jpg.components.size() <= components_.size());
+  for (size_t i = 0; i < jpg.components.size(); ++i) {
+    const JPEGComponent& comp = jpg.components[i];
+    assert(jpg.max_h_samp_factor % comp.h_samp_factor == 0);
+    assert(jpg.max_v_samp_factor % comp.v_samp_factor == 0);
+    int factor_x = jpg.max_h_samp_factor / comp.h_samp_factor;
+    int factor_y = jpg.max_v_samp_factor / comp.v_samp_factor;
+    assert(comp.quant_idx < jpg.quant.size());
+    components_[i].CopyFromJpegComponent(comp, factor_x, factor_y,
+                                         &jpg.quant[comp.quant_idx].values[0]);
+  }
+}
+
+namespace {
+
+// Resets *comp to the given sampling factors and fills it with the forward
+// DCT of pixels[], after averaging every factor_x x factor_y group of pixels
+// into one sample. pixels[] holds comp->width() * comp->height() values, one
+// row after the other; source coordinates beyond the right/bottom edge are
+// clamped to the last column/row.
+void SetDownsampledCoefficients(const std::vector<float>& pixels,
+                                int factor_x, int factor_y,
+                                OutputImageComponent* comp) {
+  assert(pixels.size() == comp->width() * comp->height());
+  comp->Reset(factor_x, factor_y);
+  for (int block_y = 0; block_y < comp->height_in_blocks(); ++block_y) {
+    for (int block_x = 0; block_x < comp->width_in_blocks(); ++block_x) {
+      // Upper-left corner of the block in full-resolution pixel coordinates.
+      const int x0 = 8 * block_x * factor_x;
+      const int y0 = 8 * block_y * factor_y;
+      assert(x0 < comp->width());
+      assert(y0 < comp->height());
+      double samples[kDCTBlockSize];
+      for (int k = 0; k < kDCTBlockSize; ++k) {
+        const int sx = x0 + (k % 8) * factor_x;
+        const int sy = y0 + (k / 8) * factor_y;
+        float sum = 0.0;
+        for (int j = 0; j < factor_y; ++j) {
+          for (int i = 0; i < factor_x; ++i) {
+            const int x = std::min(sx + i, comp->width() - 1);
+            const int y = std::min(sy + j, comp->height() - 1);
+            sum += pixels[y * comp->width() + x];
+          }
+        }
+        sum /= factor_x * factor_y;
+        samples[k] = sum;
+      }
+      ComputeBlockDCTDouble(samples);
+      // NOTE(review): presumably removes the 128 level shift from the DC
+      // term (128 * 8 = 1024) -- confirm against ComputeBlockDCTDouble().
+      samples[0] -= 1024.0;
+      coeff_t rounded[kDCTBlockSize];
+      for (int k = 0; k < kDCTBlockSize; ++k) {
+        rounded[k] = static_cast<coeff_t>(std::round(samples[k]));
+      }
+      comp->SetCoeffBlock(block_x, block_y, rounded);
+    }
+  }
+}
+
+}  // namespace
+
+// Downsamples the second and third component according to cfg. Does nothing
+// when both of them are all zero. With use_silver_screen and 2x2 factors for
+// both components, all three components are recomputed from the image's sRGB
+// rendering via RGBToYUV420(); otherwise the two components are preprocessed
+// with PreProcessChannel() and each one whose factors are not 1x1 is averaged
+// down and transformed again.
+void OutputImage::Downsample(const DownsampleConfig& cfg) {
+  const bool is_grayscale =
+      components_[1].IsAllZero() && components_[2].IsAllZero();
+  if (is_grayscale) {
+    // If the image is already grayscale, nothing to do.
+    return;
+  }
+
+  const bool wants_yuv420 = cfg.u_factor_x == 2 && cfg.u_factor_y == 2 &&
+                            cfg.v_factor_x == 2 && cfg.v_factor_y == 2;
+  if (cfg.use_silver_screen && wants_yuv420) {
+    std::vector<uint8_t> rgb = ToSRGB();
+    std::vector<std::vector<float> > yuv = RGBToYUV420(rgb, width_, height_);
+    SetDownsampledCoefficients(yuv[0], 1, 1, &components_[0]);
+    SetDownsampledCoefficients(yuv[1], 2, 2, &components_[1]);
+    SetDownsampledCoefficients(yuv[2], 2, 2, &components_[2]);
+    return;
+  }
+
+  // Get the floating-point precision YUV array represented by the set of
+  // DCT coefficients.
+  std::vector<std::vector<float> > yuv(3, std::vector<float>(width_ * height_));
+  for (int c = 0; c < 3; ++c) {
+    components_[c].ToFloatPixels(&yuv[c][0], 1);
+  }
+
+  yuv = PreProcessChannel(width_, height_, 2, 1.3, 0.5,
+                          cfg.u_sharpen, cfg.u_blur, yuv);
+  yuv = PreProcessChannel(width_, height_, 1, 1.3, 0.5,
+                          cfg.v_sharpen, cfg.v_blur, yuv);
+
+  // Do the actual downsampling (averaging) and forward-DCT.
+  const bool downsample_u = cfg.u_factor_x != 1 || cfg.u_factor_y != 1;
+  const bool downsample_v = cfg.v_factor_x != 1 || cfg.v_factor_y != 1;
+  if (downsample_u) {
+    SetDownsampledCoefficients(yuv[1], cfg.u_factor_x, cfg.u_factor_y,
+                               &components_[1]);
+  }
+  if (downsample_v) {
+    SetDownsampledCoefficients(yuv[2], cfg.v_factor_x, cfg.v_factor_y,
+                               &components_[2]);
+  }
+}
+
+// Applies the quantization table q[c] to component c, for each of the three
+// components.
+void OutputImage::ApplyGlobalQuantization(const int q[3][kDCTBlockSize]) {
+  int c = 0;
+  while (c < 3) {
+    const int* table = q[c];
+    components_[c].ApplyGlobalQuantization(table);
+    ++c;
+  }
+}
+
+// Writes the image into *jpg: dimensions, sampling factors, MCU counts, the
+// quantized coefficients of every component and the quantization tables.
+// Only the first component is written if the other two are all zero.
+// Coefficients are stored divided by the component's quantization table.
+// REQUIRES: the first component is not subsampled.
+void OutputImage::SaveToJpegData(JPEGData* jpg) const {
+  assert(components_[0].factor_x() == 1);
+  assert(components_[0].factor_y() == 1);
+  jpg->width = width_;
+  jpg->height = height_;
+  jpg->max_h_samp_factor = 1;
+  jpg->max_v_samp_factor = 1;
+  jpg->MCU_cols = components_[0].width_in_blocks();
+  jpg->MCU_rows = components_[0].height_in_blocks();
+  int ncomp = components_[1].IsAllZero() && components_[2].IsAllZero() ? 1 : 3;
+  for (int i = 1; i < ncomp; ++i) {
+    jpg->max_h_samp_factor = std::max(jpg->max_h_samp_factor,
+                                      components_[i].factor_x());
+    // The vertical maximum is tracked independently of the horizontal one;
+    // mixing them up would report a vertical factor of 2 for 2x1 subsampling.
+    jpg->max_v_samp_factor = std::max(jpg->max_v_samp_factor,
+                                      components_[i].factor_y());
+    // The most heavily subsampled component has the fewest blocks, and each
+    // of its blocks corresponds to one MCU.
+    jpg->MCU_cols = std::min(jpg->MCU_cols, components_[i].width_in_blocks());
+    jpg->MCU_rows = std::min(jpg->MCU_rows, components_[i].height_in_blocks());
+  }
+  jpg->components.resize(ncomp);
+  int q[3][kDCTBlockSize];
+  for (int c = 0; c < 3; ++c) {
+    memcpy(&q[c][0], components_[c].quant(), kDCTBlockSize * sizeof(q[0][0]));
+  }
+  for (int c = 0; c < ncomp; ++c) {
+    JPEGComponent* comp = &jpg->components[c];
+    assert(jpg->max_h_samp_factor % components_[c].factor_x() == 0);
+    assert(jpg->max_v_samp_factor % components_[c].factor_y() == 0);
+    comp->id = c;
+    comp->h_samp_factor = jpg->max_h_samp_factor / components_[c].factor_x();
+    comp->v_samp_factor = jpg->max_v_samp_factor / components_[c].factor_y();
+    comp->width_in_blocks = jpg->MCU_cols * comp->h_samp_factor;
+    comp->height_in_blocks = jpg->MCU_rows * comp->v_samp_factor;
+    comp->num_blocks = comp->width_in_blocks * comp->height_in_blocks;
+    comp->coeffs.resize(kDCTBlockSize * comp->num_blocks);
+
+    int last_dc = 0;
+    const coeff_t* src_coeffs = components_[c].coeffs();
+    coeff_t* dest_coeffs = &comp->coeffs[0];
+    for (int block_y = 0; block_y < comp->height_in_blocks; ++block_y) {
+      for (int block_x = 0; block_x < comp->width_in_blocks; ++block_x) {
+        if (block_y >= components_[c].height_in_blocks() ||
+            block_x >= components_[c].width_in_blocks()) {
+          // Padding block needed to complete an MCU: it has no source data,
+          // so repeat the DC value of the previous block with all-zero AC
+          // coefficients.
+          dest_coeffs[0] = last_dc;
+          std::fill_n(dest_coeffs + 1, kDCTBlockSize - 1, 0);
+        } else {
+          for (int k = 0; k < kDCTBlockSize; ++k) {
+            const int quant = q[c][k];
+            int coeff = src_coeffs[k];
+            // Stored coefficients are multiples of the quantization step.
+            assert(coeff % quant == 0);
+            dest_coeffs[k] = coeff / quant;
+          }
+          src_coeffs += kDCTBlockSize;
+        }
+        last_dc = dest_coeffs[0];
+        dest_coeffs += kDCTBlockSize;
+      }
+    }
+  }
+  SaveQuantTables(q, jpg);
+}
+
+// Returns the given window of the image as interleaved 8-bit triplets, one
+// per pixel: the three components are rendered into bytes 0, 1 and 2 of each
+// triplet and every triplet is then passed through ColorTransformYCbCrToRGB().
+std::vector<uint8_t> OutputImage::ToSRGB(int xmin, int ymin,
+                                         int xsize, int ysize) const {
+  const int kBytesPerPixel = 3;
+  std::vector<uint8_t> rgb(xsize * ysize * 3);
+  for (int c = 0; c < kBytesPerPixel; ++c) {
+    uint8_t* first_sample = &rgb[c];
+    components_[c].ToPixels(xmin, ymin, xsize, ysize, first_sample,
+                            kBytesPerPixel);
+  }
+  for (size_t p = 0; p < rgb.size(); p += kBytesPerPixel) {
+    ColorTransformYCbCrToRGB(&rgb[p]);
+  }
+  return rgb;
+}
+
+// Returns the whole image (window at (0, 0) of size width_ x height_) as
+// interleaved 8-bit triplets; see the windowed overload.
+std::vector<uint8_t> OutputImage::ToSRGB() const {
+  return ToSRGB(0, 0, width_, height_);
+}
+
+// Converts the given window to planar values by mapping every 8-bit sRGB
+// sample through the Srgb8ToLinearTable() lookup table: (*rgb)[i][p] receives
+// channel i of pixel p. *rgb is not resized here, so it must already hold
+// three planes of at least xsize * ysize values.
+void OutputImage::ToLinearRGB(int xmin, int ymin, int xsize, int ysize,
+                              std::vector<std::vector<float> >* rgb) const {
+  const double* lut = Srgb8ToLinearTable();
+  std::vector<uint8_t> srgb = ToSRGB(xmin, ymin, xsize, ysize);
+  const int num_pixels = xsize * ysize;
+  for (int p = 0; p < num_pixels; ++p) {
+    const uint8_t* triplet = &srgb[3 * p];
+    for (int i = 0; i < 3; ++i) {
+      (*rgb)[i][p] = lut[triplet[i]];
+    }
+  }
+}
+
+// Converts the whole image (window at (0, 0) of size width_ x height_); see
+// the windowed overload for the layout expected of *rgb.
+void OutputImage::ToLinearRGB(std::vector<std::vector<float> >* rgb) const {
+  ToLinearRGB(0, 0, width_, height_, rgb);
+}
+
+// Returns a short tag describing the sampling layout: the letter 'f' followed
+// by the x and y sampling factors of components 0, 1 and 2, in that order.
+std::string OutputImage::FrameTypeStr() const {
+  const OutputImageComponent& c0 = component(0);
+  const OutputImageComponent& c1 = component(1);
+  const OutputImageComponent& c2 = component(2);
+  char buf[128];
+  const int len = snprintf(buf, sizeof(buf), "f%d%d%d%d%d%d",
+                           c0.factor_x(), c0.factor_y(),
+                           c1.factor_x(), c1.factor_y(),
+                           c2.factor_x(), c2.factor_y());
+  return std::string(buf, len);
+}
+
+}  // namespace guetzli
diff --git a/guetzli/output_image.h b/guetzli/output_image.h
new file mode 100755
index 00000000..1018eeac
--- /dev/null
+++ b/guetzli/output_image.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_OUTPUT_IMAGE_H_
+#define GUETZLI_OUTPUT_IMAGE_H_
+
+#include <stdint.h>
+#include <vector>
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// One color component of an output image. It stores the component's DCT
+// coefficients together with a cached pixel view that SetCoeffBlock() keeps
+// up to date, the subsampling factors and the quantization table in use.
+class OutputImageComponent {
+ public:
+  // Creates a component of w x h pixels without subsampling.
+  OutputImageComponent(int w, int h);
+
+  // Sets the sampling factors, recomputes the block counts and clears the
+  // contents: coefficients become zero, pixels become 128 and the
+  // quantization table becomes all 1s.
+  void Reset(int factor_x, int factor_y);
+
+  int width() const { return width_; }
+  int height() const { return height_; }
+  int factor_x() const { return factor_x_; }
+  int factor_y() const { return factor_y_; }
+  int width_in_blocks() const { return width_in_blocks_; }
+  int height_in_blocks() const { return height_in_blocks_; }
+  // Pointer to the first coefficient; blocks are stored row by row with
+  // kDCTBlockSize coefficients each.
+  const coeff_t* coeffs() const { return &coeffs_[0]; }
+  // Pointer to the kDCTBlockSize entries of the current quantization table.
+  const int* quant() const { return &quant_[0]; }
+  // Returns true if every coefficient of the component is zero.
+  bool IsAllZero() const;
+
+  // Fills in block[] with the 8x8 coefficient block with block coordinates
+  // (block_x, block_y).
+  // NOTE: If the component is 2x2 subsampled, this corresponds to the 16x16
+  // pixel area with upper-left corner (16 * block_x, 16 * block_y).
+  void GetCoeffBlock(int block_x, int block_y,
+                     coeff_t block[kDCTBlockSize]) const;
+
+  // Fills in out[] array with the 8-bit pixel view of this component cropped
+  // to the specified window. The window's upper-left corner, (xmin, ymin) must
+  // be within the image, but the window may extend past the image. In that
+  // case the edge pixels are duplicated.
+  // Samples are written row by row; out advances by stride after each sample.
+  void ToPixels(int xmin, int ymin, int xsize, int ysize,
+                uint8_t* out, int stride) const;
+
+  // Fills in out[] array with the floating-point precision pixel view of the
+  // component.
+  // The sample at (x, y) is written to out[(y * width() + x) * stride].
+  // REQUIRES: factor_x() == 1 and factor_y() == 1.
+  void ToFloatPixels(float* out, int stride) const;
+
+  // Sets the 8x8 coefficient block with block coordinates (block_x, block_y)
+  // to block[].
+  // NOTE: If the component is 2x2 subsampled, this corresponds to the 16x16
+  // pixel area with upper-left corner (16 * block_x, 16 * block_y).
+  // REQUIRES: block[k] % quant()[k] == 0 for each coefficient index k.
+  void SetCoeffBlock(int block_x, int block_y,
+                     const coeff_t block[kDCTBlockSize]);
+
+  // Resets the component to the given sampling factors, loads the
+  // coefficients of comp multiplied by the matching entries of quant[] and
+  // makes quant[] the current quantization table.
+  // Requires that comp is not downsampled.
+  void CopyFromJpegComponent(const JPEGComponent& comp,
+                             int factor_x, int factor_y,
+                             const int* quant);
+
+  // Quantizes every block with the table q and makes q the current
+  // quantization table.
+  void ApplyGlobalQuantization(const int q[kDCTBlockSize]);
+
+ private:
+  // Updates pixels_ from the inverse-DCT output idct[] of the given block.
+  void UpdatePixelsForBlock(int block_x, int block_y,
+                            const uint8_t idct[kDCTBlockSize]);
+
+  // Size of the component in pixels (independent of the sampling factors).
+  const int width_;
+  const int height_;
+  // Subsampling factors; one coefficient block covers 8 * factor pixels.
+  int factor_x_;
+  int factor_y_;
+  int width_in_blocks_;
+  int height_in_blocks_;
+  int num_blocks_;
+  // num_blocks_ * kDCTBlockSize coefficients.
+  std::vector<coeff_t> coeffs_;
+  // width_ * height_ pixel values, stored with 4 fractional bits.
+  std::vector<uint16_t> pixels_;
+  // Same as last argument of CopyFromJpegComponent() or
+  // ApplyGlobalQuantization(), whichever was called last (all 1s after
+  // Reset()).
+  int quant_[kDCTBlockSize];
+};
+
+// An image made of three OutputImageComponent planes of identical pixel
+// size, with conversions from/to JPEGData and to RGB pixels.
+class OutputImage {
+ public:
+  // Creates an image of w x h pixels.
+  OutputImage(int w, int h);
+
+  int width() const { return width_; }
+  int height() const { return height_; }
+
+  // Access to component c; c is not range-checked.
+  OutputImageComponent& component(int c) { return components_[c]; }
+  const OutputImageComponent& component(int c) const { return components_[c]; }
+
+  // Loads the components of jpg into the components with the same index.
+  // Requires that jpg is in YUV444 format.
+  void CopyFromJpegData(const JPEGData& jpg);
+
+  // Applies the quantization table q[c] to component c.
+  void ApplyGlobalQuantization(const int q[3][kDCTBlockSize]);
+
+  // If sharpen or blur are enabled, preprocesses image before downsampling U or
+  // V to improve butteraugli score and/or reduce file size.
+  // u_sharpen: sharpen the u channel in red areas to improve score (not as
+  // effective as v_sharpen, blue is not so important)
+  // u_blur: blur the u channel in some areas to reduce file size
+  // v_sharpen: sharpen the v channel in red areas to improve score
+  // v_blur: blur the v channel in some areas to reduce file size
+  struct DownsampleConfig {
+    // Default is YUV420.
+    DownsampleConfig() : u_factor_x(2), u_factor_y(2),
+                         v_factor_x(2), v_factor_y(2),
+                         u_sharpen(true), u_blur(true),
+                         v_sharpen(true), v_blur(true),
+                         use_silver_screen(false) {}
+    // Subsampling factors of the second (u) and third (v) component.
+    int u_factor_x;
+    int u_factor_y;
+    int v_factor_x;
+    int v_factor_y;
+    bool u_sharpen;
+    bool u_blur;
+    bool v_sharpen;
+    bool v_blur;
+    // Only has an effect when all four factors are 2.
+    bool use_silver_screen;
+  };
+
+  // Downsamples the second and third component as described by cfg. Does
+  // nothing if both of them are all zero.
+  void Downsample(const DownsampleConfig& cfg);
+
+  // Stores the image (sizes, sampling factors, quantized coefficients and
+  // quantization tables) in *jpg.
+  void SaveToJpegData(JPEGData* jpg) const;
+
+  // Returns the whole image as interleaved 8-bit triplets, 3 bytes per pixel.
+  std::vector<uint8_t> ToSRGB() const;
+
+  // Same as above for the window with upper-left corner (xmin, ymin) and size
+  // xsize x ysize.
+  std::vector<uint8_t> ToSRGB(int xmin, int ymin, int xsize, int ysize) const;
+
+  // Writes channel i of pixel p to (*rgb)[i][p] for the whole image. *rgb is
+  // not resized: it must already hold 3 planes of width() * height() values.
+  void ToLinearRGB(std::vector<std::vector<float> >* rgb) const;
+
+  // Same as above for the window with upper-left corner (xmin, ymin) and size
+  // xsize x ysize; the planes must hold xsize * ysize values.
+  void ToLinearRGB(int xmin, int ymin, int xsize, int ysize,
+                   std::vector<std::vector<float> >* rgb) const;
+
+  // Returns 'f' followed by the x and y sampling factors of the three
+  // components.
+  std::string FrameTypeStr() const;
+
+ private:
+  const int width_;
+  const int height_;
+  // Always holds exactly three components.
+  std::vector<OutputImageComponent> components_;
+};
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_OUTPUT_IMAGE_H_
diff --git a/guetzli/preprocess_downsample.cc b/guetzli/preprocess_downsample.cc
new file mode 100755
index 00000000..8b2d256b
--- /dev/null
+++ b/guetzli/preprocess_downsample.cc
@@ -0,0 +1,477 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/preprocess_downsample.h"
+
+#include <assert.h>
+#include <string.h>
+#include <cmath>
+
+using std::size_t;
+
+namespace {
+
+// Convolves image (w x h, stored row by row) with the size x size kernel,
+// which is stored row by row as well. Pixels whose kernel window would reach
+// outside the image keep their input value; this avoids non-normalized
+// results at the boundary.
+std::vector<float> Convolve2D(const std::vector<float>& image, int w, int h,
+                              const double* kernel, int size) {
+  const int half = size / 2;
+  // Number of kernel taps to the right of / below the centre tap.
+  const int reach = size - half - 1;
+  const int num_taps = size * size;
+  std::vector<float> result(image);
+  for (int i = 0; i < image.size(); i++) {
+    const int x = i % w;
+    const int y = i / w;
+    const bool inside =
+        x >= half && x + reach < w && y >= half && y + reach < h;
+    if (!inside) continue;
+    float acc = 0;
+    for (int t = 0; t < num_taps; t++) {
+      const int src_x = x + t % size - half;
+      const int src_y = y + t / size - half;
+      acc += kernel[t] * image[src_y * w + src_x];
+    }
+    result[i] = acc;
+  }
+  return result;
+}
+
+// Separable convolution: filters image (w x h, stored row by row) with the
+// 1D kernel of `size` taps first along x and then along y. After each pass
+// the filtered values are multiplied by mul. Pixels whose window would leave
+// the image in the direction of a pass are copied through unchanged by that
+// pass, to avoid non-normalized results at the boundary.
+std::vector<float> Convolve2X(const std::vector<float>& image, int w, int h,
+                              const double* kernel, int size, double mul) {
+  const int half = size / 2;
+  // Number of kernel taps after the centre tap.
+  const int reach = size - half - 1;
+
+  // Pass 1: horizontal. Neighbours along x are adjacent in memory.
+  std::vector<float> rows(image);
+  for (int i = 0; i < image.size(); i++) {
+    const int x = i % w;
+    if (x < half || x + reach >= w) continue;
+    float acc = 0;
+    for (int t = 0; t < size; t++) {
+      acc += kernel[t] * image[i + t - half];
+    }
+    rows[i] = acc * mul;
+  }
+
+  // Pass 2: vertical, on the output of pass 1. Neighbours along y are w
+  // elements apart.
+  std::vector<float> result(rows);
+  for (int i = 0; i < rows.size(); i++) {
+    const int y = i / w;
+    if (y < half || y + reach >= h) continue;
+    float acc = 0;
+    for (int t = 0; t < size; t++) {
+      acc += kernel[t] * rows[i + (t - half) * w];
+    }
+    result[i] = acc * mul;
+  }
+  return result;
+}
+
+// Value at x of the density of a zero-mean normal distribution with standard
+// deviation sigma.
+double Normal(double x, double sigma) {
+  static const double kInvSqrt2Pi = 0.3989422804014327;
+  const double exponent = -x * x / (2 * sigma * sigma);
+  const double bell = std::exp(exponent);
+  return bell * kInvSqrt2Pi / sigma;
+}
+
+// Unsharp masking of image (w x h): the image is blurred with a normalized
+// 5-tap Gaussian of the given sigma (applied along x and y by Convolve2X),
+// and amount times the difference between the image and its blurred version
+// is added back onto the image.
+// This is only made for small sigma, e.g. 1.3.
+std::vector<float> Sharpen(const std::vector<float>& image, int w, int h,
+                           float sigma, float amount) {
+  const int kNumTaps = 5;
+  std::vector<double> gauss(kNumTaps);
+  double total = 0;
+  for (int t = 0; t < kNumTaps; t++) {
+    gauss[t] = Normal(1.0 * t - kNumTaps / 2, sigma);
+    total += gauss[t];
+  }
+  // Factor that makes the taps sum to one.
+  const double mul = 1.0 / total;
+
+  // Starts out as the blurred image and is sharpened in place below.
+  std::vector<float> result =
+      Convolve2X(image, w, h, gauss.data(), kNumTaps, mul);
+  for (size_t i = 0; i < image.size(); i++) {
+    const float blurred = result[i];
+    result[i] = image[i] + (image[i] - blurred) * amount;
+  }
+  return result;
+}
+
+// Morphological erosion of the w x h bitmap *image with a plus-shaped
+// neighbourhood: an interior pixel is cleared unless it and its left, right,
+// upper and lower neighbours were all set before the call. The outermost
+// rows and columns are left untouched.
+void Erode(int w, int h, std::vector<bool>* image) {
+  const std::vector<bool> before(*image);
+  for (int y = 1; y + 1 < h; y++) {
+    for (int x = 1; x + 1 < w; x++) {
+      const size_t at = y * w + x;
+      const bool all_set = before[at] && before[at - 1] && before[at + 1] &&
+                           before[at - w] && before[at + w];
+      if (!all_set) {
+        (*image)[at] = false;
+      }
+    }
+  }
+}
+
+// Morphological dilation of the w x h bitmap *image with a plus-shaped
+// neighbourhood: an interior pixel is set if it or any of its left, right,
+// upper or lower neighbours was set before the call. The outermost rows and
+// columns are left untouched.
+void Dilate(int w, int h, std::vector<bool>* image) {
+  const std::vector<bool> before(*image);
+  for (int y = 1; y + 1 < h; y++) {
+    for (int x = 1; x + 1 < w; x++) {
+      const size_t at = y * w + x;
+      const bool any_set = before[at] || before[at - 1] || before[at + 1] ||
+                           before[at - w] || before[at + w];
+      if (any_set) {
+        (*image)[at] = true;
+      }
+    }
+  }
+}
+
+// Blurs image (w x h) with a normalized 5-tap Gaussian of sigma 1.3, applied
+// along x and y by Convolve2X.
+// This is only made for small sigma, e.g. 1.3.
+std::vector<float> Blur(const std::vector<float>& image, int w, int h) {
+  static const double kSigma = 1.3;
+  const int kNumTaps = 5;
+  std::vector<double> gauss(kNumTaps);
+  double total = 0;
+  for (int t = 0; t < kNumTaps; t++) {
+    gauss[t] = Normal(1.0 * t - kNumTaps / 2, kSigma);
+    total += gauss[t];
+  }
+  // Factor that makes the taps sum to one.
+  const double mul = 1.0 / total;
+
+  return Convolve2X(image, w, h, gauss.data(), kNumTaps, mul);
+}
+
+}  // namespace
+
+namespace guetzli {
+
+// Preprocesses the u (channel == 1) or v (channel == 2) plane of a YUV image
+// whose planes are in the range 0-255, and returns the resulting YUV image.
+// Per pixel, the channel value is
+//  - replaced by a sharpened value (Sharpen() with sigma and amount) where the
+//    colour is red enough (blue enough for u) and not too bright,
+//  - replaced by a blurred value where the pixel is dark, not selected for
+//    sharpening and not on an edge of the channel,
+//  - left unchanged everywhere else.
+// sharpen and blur enable the two effects individually; if both are false the
+// input image is returned as is. image must hold three planes of w * h values.
+std::vector<std::vector<float>> PreProcessChannel(
+    int w, int h, int channel, float sigma, float amount, bool blur,
+    bool sharpen, const std::vector<std::vector<float>>& image) {
+  if (!blur && !sharpen) return image;
+
+  // Bring in range 0.0-1.0 for Y, -0.5 - 0.5 for U and V
+  auto yuv = image;
+  for (size_t i = 0; i < yuv[0].size(); i++) {
+    yuv[0][i] /= 255.0;
+    yuv[1][i] = yuv[1][i] / 255.0 - 0.5;
+    yuv[2][i] = yuv[2][i] / 255.0 - 0.5;
+  }
+
+  // Map of areas where the image is not too bright to apply the effect.
+  std::vector<bool> darkmap(image[0].size(), false);
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      size_t index = y * w + x;
+      float luma = yuv[0][index];
+      float u = yuv[1][index];
+      float v = yuv[2][index];
+
+      float r = luma + 1.402 * v;
+      float g = luma - 0.34414 * u - 0.71414 * v;
+      float b = luma + 1.772 * u;
+
+      // Parameters tuned to avoid sharpening in too bright areas, where the
+      // effect makes it worse instead of better.
+      if (channel == 2 && g < 0.85 && b < 0.85 && r < 0.9) {
+        darkmap[index] = true;
+      }
+      if (channel == 1 && r < 0.85 && g < 0.85 && b < 0.9) {
+        darkmap[index] = true;
+      }
+    }
+  }
+
+  Erode(w, h, &darkmap);
+  Erode(w, h, &darkmap);
+  Erode(w, h, &darkmap);
+
+  // Map of areas where the image is red enough (blue in case of u channel).
+  std::vector<bool> redmap(image[0].size(), false);
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      size_t index = y * w + x;
+      float u = yuv[1][index];
+      float v = yuv[2][index];
+
+      // Parameters tuned to allow only colors on which sharpening is useful.
+      if (channel == 2 && 2.116 * v > -0.34414 * u + 0.2
+          && 1.402 * v > 1.772 * u + 0.2) {
+        redmap[index] = true;
+      }
+      if (channel == 1 && v < 1.263 * u - 0.1 && u > -0.33741 * v) {
+        redmap[index] = true;
+      }
+    }
+  }
+
+  Dilate(w, h, &redmap);
+  Dilate(w, h, &redmap);
+  Dilate(w, h, &redmap);
+
+  // Map of areas where to allow sharpening by combining red and dark areas
+  std::vector<bool> sharpenmap(image[0].size(), false);
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      size_t index = y * w + x;
+      sharpenmap[index] = redmap[index] && darkmap[index];
+    }
+  }
+
+  // The blur map and the blurred channel are only needed if blurring is
+  // enabled; both stay empty otherwise and are not accessed below.
+  std::vector<bool> blurmap;
+  std::vector<float> blurred;
+  if (blur) {
+    // Threshold for where considered an edge.
+    // NOTE(review): the edge response is computed on the normalized channel
+    // (about -0.5 .. 0.5) while this threshold is scaled by 127.5; confirm
+    // that the scale is intended.
+    const double threshold = (channel == 2 ? 0.02 : 1.0) * 127.5;
+
+    static const double kEdgeMatrix[9] = {
+      0, -1, 0,
+      -1, 4, -1,
+      0, -1, 0
+    };
+
+    // Map of areas where to allow blurring, only where it is not too sharp
+    blurmap.assign(image[0].size(), false);
+    std::vector<float> edge = Convolve2D(yuv[channel], w, h, kEdgeMatrix, 3);
+    for (int y = 0; y < h; y++) {
+      for (int x = 0; x < w; x++) {
+        size_t index = y * w + x;
+        float u = yuv[1][index];
+        float v = yuv[2][index];
+        if (sharpenmap[index]) continue;
+        if (!darkmap[index]) continue;
+        if (std::fabs(edge[index]) < threshold && v < -0.162 * u) {
+          blurmap[index] = true;
+        }
+      }
+    }
+    Erode(w, h, &blurmap);
+    Erode(w, h, &blurmap);
+
+    blurred = Blur(yuv[channel], w, h);
+  }
+
+  // The sharpened channel is only needed if sharpening is enabled.
+  std::vector<float> sharpened;
+  if (sharpen) sharpened = Sharpen(yuv[channel], w, h, sigma, amount);
+
+  // Choose sharpened, blurred or original per pixel. A pixel selected for
+  // sharpening is never blurred, even when sharpening is disabled.
+  for (int y = 0; y < h; y++) {
+    for (int x = 0; x < w; x++) {
+      size_t index = y * w + x;
+
+      if (sharpenmap[index]) {
+        if (sharpen) yuv[channel][index] = sharpened[index];
+      } else if (blur && blurmap[index]) {
+        yuv[channel][index] = blurred[index];
+      }
+    }
+  }
+
+  // Bring back to range 0-255
+  for (size_t i = 0; i < yuv[0].size(); i++) {
+    yuv[0][i] *= 255.0;
+    yuv[1][i] = (yuv[1][i] + 0.5) * 255.0;
+    yuv[2][i] = (yuv[2][i] + 0.5) * 255.0;
+  }
+  return yuv;
+}
+
+namespace {
+
+// Limits val to the range [0, 255].
+inline float Clip(float val) {
+  const float capped = std::min(255.0f, val);
+  return std::max(0.0f, capped);
+}
+
+// Y value of an RGB triple: a weighted sum of r, g and b.
+inline float RGBToY(float r, float g, float b) {
+  const float kWeightR = 0.299f;
+  const float kWeightG = 0.587f;
+  const float kWeightB = 0.114f;
+  return kWeightR * r + kWeightG * g + kWeightB * b;
+}
+
+// U value of an RGB triple, offset by 128.
+inline float RGBToU(float r, float g, float b) {
+  const float chroma = -0.16874f * r - 0.33126f * g + 0.5f * b;
+  return chroma + 128.0;
+}
+
+// V value of an RGB triple, offset by 128.
+inline float RGBToV(float r, float g, float b) {
+  const float chroma = 0.5f * r - 0.41869f * g - 0.08131f * b;
+  return chroma + 128.0;
+}
+
+// R value of a YUV triple whose chroma values are offset by 128. u is unused.
+inline float YUVToR(float y, float u, float v) {
+  const double centered_v = v - 128.0;
+  return y + 1.402 * centered_v;
+}
+
+// G value of a YUV triple whose chroma values are offset by 128.
+inline float YUVToG(float y, float u, float v) {
+  const double centered_u = u - 128.0;
+  const double centered_v = v - 128.0;
+  return y - 0.344136 * centered_u - 0.714136 * centered_v;
+}
+
+// B value of a YUV triple whose chroma values are offset by 128. v is unused.
+inline float YUVToB(float y, float u, float v) {
+  const double centered_u = u - 128.0;
+  return y + 1.772 * centered_u;
+}
+
+// Maps a gamma-encoded value in the range 0-255 to a linear value in the
+// range 0-1, using a plain power law with exponent 2.2.
+// TODO(user) Use SRGB->linear conversion and a lookup-table.
+inline float GammaToLinear(float x) {
+  const double normalized = x / 255.0;
+  return std::pow(normalized, 2.2);
+}
+
+// Maps a linear value in the range 0-1 to a gamma-encoded value in the range
+// 0-255, using a plain power law with exponent 1 / 2.2.
+// TODO(user) Use linear->SRGB conversion and a lookup-table.
+inline float LinearToGamma(float x) {
+  const double encoded = std::pow(x, 1.0 / 2.2);
+  return 255.0 * encoded;
+}
+
+// Computes one luma value per pixel of the interleaved RGB image rgb (range
+// 0-255): the three channels are converted to linear light, combined with
+// the RGBToY weights, and the result is converted back to gamma space.
+std::vector<float> LinearlyAveragedLuma(const std::vector<float>& rgb) {
+  assert(rgb.size() % 3 == 0);
+  const size_t num_pixels = rgb.size() / 3;
+  std::vector<float> luma(num_pixels);
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float* px = &rgb[3 * i];
+    const float linear_y = RGBToY(GammaToLinear(px[0]),
+                                  GammaToLinear(px[1]),
+                                  GammaToLinear(px[2]));
+    luma[i] = LinearToGamma(linear_y);
+  }
+  return luma;
+}
+
+// Halves the resolution of the interleaved RGB image rgb_in (width x height,
+// range 0-255), rounding the output size up. Each output sample is the mean,
+// taken in linear light, of the 2x2 input samples it covers; coordinates past
+// the right or bottom border are clamped to the last column or row.
+std::vector<float> LinearlyDownsample2x2(const std::vector<float>& rgb_in,
+                                         const int width, const int height) {
+  assert(rgb_in.size() == 3 * width * height);
+  const int out_w = (width + 1) / 2;
+  const int out_h = (height + 1) / 2;
+  std::vector<float> rgb_out(3 * out_w * out_h);
+  int p = 0;
+  for (int oy = 0; oy < out_h; ++oy) {
+    for (int ox = 0; ox < out_w; ++ox) {
+      for (int c = 0; c < 3; ++c) {
+        float sum = 0.0;
+        for (int iy = 0; iy < 2; ++iy) {
+          const int yy = std::min(height - 1, 2 * oy + iy);
+          for (int ix = 0; ix < 2; ++ix) {
+            const int xx = std::min(width - 1, 2 * ox + ix);
+            sum += GammaToLinear(rgb_in[3 * (yy * width + xx) + c]);
+          }
+        }
+        rgb_out[p++] = LinearToGamma(0.25 * sum);
+      }
+    }
+  }
+  return rgb_out;
+}
+
+// Splits the interleaved RGB image rgb into three planes holding the Y, U and
+// V value of every pixel.
+std::vector<std::vector<float> > RGBToYUV(const std::vector<float>& rgb) {
+  const size_t num_pixels = rgb.size() / 3;
+  std::vector<std::vector<float> > yuv(3, std::vector<float>(num_pixels));
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float r = rgb[3 * i + 0];
+    const float g = rgb[3 * i + 1];
+    const float b = rgb[3 * i + 2];
+    yuv[0][i] = RGBToY(r, g, b);
+    yuv[1][i] = RGBToU(r, g, b);
+    yuv[2][i] = RGBToV(r, g, b);
+  }
+  return yuv;
+}
+
+// Converts three equally sized Y, U and V planes into one interleaved RGB
+// image. Every output value is clipped to the range 0-255.
+std::vector<float> YUVToRGB(const std::vector<std::vector<float> >& yuv) {
+  const size_t num_pixels = yuv[0].size();
+  std::vector<float> rgb(3 * num_pixels);
+  for (size_t i = 0; i < num_pixels; ++i) {
+    const float y = yuv[0][i];
+    const float u = yuv[1][i];
+    const float v = yuv[2][i];
+    float* px = &rgb[3 * i];
+    px[0] = Clip(YUVToR(y, u, v));
+    px[1] = Clip(YUVToG(y, u, v));
+    px[2] = Clip(YUVToB(y, u, v));
+  }
+  return rgb;
+}
+
+// Upsamples img_in with a box-filter, and returns an image with output
+// dimensions width x height. img_in must have the dimensions
+// ceil(width / 2) x ceil(height / 2); every output pixel (x, y) takes the
+// value of input pixel (x / 2, y / 2).
+std::vector<float> Upsample2x2(const std::vector<float>& img_in,
+                               const int width, const int height) {
+  const int in_w = (width + 1) / 2;
+  const int in_h = (height + 1) / 2;
+  assert(img_in.size() == in_w * in_h);
+  std::vector<float> img_out(width * height);
+  for (int y = 0; y < height; ++y) {
+    const float* src_row = img_in.data() + (y / 2) * in_w;
+    float* dst_row = img_out.data() + y * width;
+    for (int x = 0; x < width; ++x) {
+      dst_row[x] = src_row[x / 2];
+    }
+  }
+  return img_out;
+}
+
+// Apply the "fancy upsample" filter used by libjpeg.
+// img (width x height) is treated as a grid of 2x2 cells. Every output pixel
+// mixes the top-left sample of its own cell (weight 9/16) with the top-left
+// samples of the horizontally and vertically adjacent cells on its side of
+// the cell (3/16 each) and of the diagonal cell (1/16). Cells outside the
+// image are replaced by clamping the sample coordinates to the image.
+std::vector<float> Blur(const std::vector<float>& img,
+                        const int width, const int height) {
+  std::vector<float> img_out(width * height);
+  for (int y = 0; y < height; ++y) {
+    const int iy = y % 2;
+    const int y0 = y - iy;
+    // Upper row of a cell looks two rows up, lower row two rows down.
+    const int y1 = std::min(height - 1, std::max(0, y0 + 4 * iy - 2));
+    for (int x = 0; x < width; ++x) {
+      const int ix = x % 2;
+      const int x0 = x - ix;
+      const int x1 = std::min(width - 1, std::max(0, x0 + 4 * ix - 2));
+      const float own = img[y0 * width + x0];
+      const float side_x = img[y0 * width + x1];
+      const float side_y = img[y1 * width + x0];
+      const float diagonal = img[y1 * width + x1];
+      img_out[y * width + x] =
+          (9.0 * own + 3.0 * side_x + 3.0 * side_y + 1.0 * diagonal) / 16.0;
+    }
+  }
+  return img_out;
+}
+
+// Reconstructs an interleaved RGB image of width x height from a YUV420
+// image: the Y plane is used as is, while the half-resolution U and V planes
+// are first upsampled with Upsample2x2 and then smoothed with the fancy
+// upsampling filter before the conversion to RGB.
+std::vector<float> YUV420ToRGB(const std::vector<std::vector<float> >& yuv420,
+                               const int width, const int height) {
+  std::vector<std::vector<float> > yuv444(3);
+  yuv444[0] = yuv420[0];
+  for (int c = 1; c < 3; ++c) {
+    yuv444[c] = Blur(Upsample2x2(yuv420[c], width, height), width, height);
+  }
+  return YUVToRGB(yuv444);
+}
+
+// One correction step of the iterative search: subtracts from every element
+// of *guess the difference between the reconstructed and the target value,
+// and clips the result to the range 0-255. All three vectors must have the
+// same size.
+void UpdateGuess(const std::vector<float>& target,
+                 const std::vector<float>& reconstructed,
+                 std::vector<float>* guess) {
+  const size_t num_values = guess->size();
+  assert(reconstructed.size() == num_values);
+  assert(target.size() == num_values);
+  std::vector<float>& values = *guess;
+  for (size_t i = 0; i < num_values; ++i) {
+    // TODO(user): Evaluate using a decaying constant here.
+    const float error = reconstructed[i] - target[i];
+    values[i] = Clip(values[i] - error);
+  }
+}
+
+}  // namespace
+
+// Gamma-compensated chroma subsampling of the interleaved 8-bit RGB image
+// rgb_in (width x height). Starting from the luma of the input and the chroma
+// of the linearly downsampled input, the YUV420 guess is refined for a fixed
+// number of iterations: it is reconstructed to RGB, and the deviation of the
+// reconstruction from the targets is subtracted from the guess.
+// Returns the Y, U and V planes, each of size width x height; U and V are
+// upsampled with a box filter at the end.
+std::vector<std::vector<float> > RGBToYUV420(
+    const std::vector<uint8_t>& rgb_in, const int width, const int height) {
+  static const int kNumIterations = 20;
+  const std::vector<float> rgbf(rgb_in.begin(), rgb_in.end());
+
+  // Targets: full-resolution luma and half-resolution chroma.
+  const std::vector<float> y_target = LinearlyAveragedLuma(rgbf);
+  const std::vector<std::vector<float> > yuv_target =
+      RGBToYUV(LinearlyDownsample2x2(rgbf, width, height));
+
+  // Initial guess: the downsampled image, with Y brought to full resolution.
+  std::vector<std::vector<float> > yuv_guess(yuv_target);
+  yuv_guess[0] = Upsample2x2(yuv_guess[0], width, height);
+
+  // TODO(user): Stop early if the error is small enough.
+  for (int iter = 0; iter < kNumIterations; ++iter) {
+    const std::vector<float> rgb_rec = YUV420ToRGB(yuv_guess, width, height);
+    const std::vector<float> y_rec = LinearlyAveragedLuma(rgb_rec);
+    const std::vector<std::vector<float> > yuv_rec =
+        RGBToYUV(LinearlyDownsample2x2(rgb_rec, width, height));
+    UpdateGuess(y_target, y_rec, &yuv_guess[0]);
+    for (int c = 1; c < 3; ++c) {
+      UpdateGuess(yuv_target[c], yuv_rec[c], &yuv_guess[c]);
+    }
+  }
+
+  for (int c = 1; c < 3; ++c) {
+    yuv_guess[c] = Upsample2x2(yuv_guess[c], width, height);
+  }
+  return yuv_guess;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/preprocess_downsample.h b/guetzli/preprocess_downsample.h
new file mode 100755
index 00000000..bd3d86aa
--- /dev/null
+++ b/guetzli/preprocess_downsample.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Preprocesses U and V channel for better results after downsampling.
+
+#ifndef GUETZLI_PREPROCESS_DOWNSAMPLE_H_
+#define GUETZLI_PREPROCESS_DOWNSAMPLE_H_
+
+#include <stdint.h>
+#include <vector>
+
+namespace guetzli {
+
+// Preprocesses the u (1) or v (2) channel of the given YUV image (range 0-255).
+std::vector<std::vector<float>> PreProcessChannel(
+    int w, int h, int channel, float sigma, float amount, bool blur,
+    bool sharpen, const std::vector<std::vector<float>>& image);
+
+// Gamma-compensated chroma subsampling.
+// Returns Y, U, V image planes, each with width x height dimensions, but the
+// U and V planes are composed of 2x2 blocks with the same values.
+std::vector<std::vector<float> > RGBToYUV420(
+    const std::vector<uint8_t>& rgb_in, const int width, const int height);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_PREPROCESS_DOWNSAMPLE_H_
diff --git a/guetzli/processor.cc b/guetzli/processor.cc
new file mode 100755
index 00000000..ea242fbf
--- /dev/null
+++ b/guetzli/processor.cc
@@ -0,0 +1,879 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/processor.h"
+
+#include <algorithm>
+#include <set>
+#include <vector>
+
+#include "guetzli/butteraugli_comparator.h"
+#include "guetzli/comparator.h"
+#include "guetzli/debug_print.h"
+#include "guetzli/fast_log.h"
+#include "guetzli/jpeg_data_decoder.h"
+#include "guetzli/jpeg_data_encoder.h"
+#include "guetzli/jpeg_data_reader.h"
+#include "guetzli/jpeg_data_writer.h"
+#include "guetzli/output_image.h"
+#include "guetzli/quantize.h"
+
+namespace guetzli {
+
+namespace {
+
+// Number of coefficients in one block of each of the three components taken
+// together.
+static const size_t kBlockSize = 3 * kDCTBlockSize;
+
+// One entry of the zeroing order produced by
+// Processor::ComputeBlockZeroingOrder().
+struct CoeffData {
+  // Coefficient index in a 3-component block: c * kDCTBlockSize + k.
+  int idx;
+  // Block error recorded for zeroing this coefficient.
+  float block_err;
+};
+// Result of trying one quantization matrix, see Processor::TryQuantMatrix().
+struct QuantData {
+  // The quantization matrix that was tried, one table per component.
+  int q[3][kDCTBlockSize];
+  // Result of Comparator::DistanceOK() for the image quantized with q.
+  bool dist_ok;
+  // The encoded jpeg together with its distance map and score.
+  GuetzliOutput out;
+};
+// Bundles the state shared by the steps of one run: the parameters, the
+// comparator, the output record and the statistics.
+class Processor {
+ public:
+  // Entry point; its definition is not part of this section of the file.
+  bool ProcessJpegData(const Params& params, const JPEGData& jpg_in,
+                       Comparator* comparator, GuetzliOutput* out,
+                       ProcessStats* stats);
+
+ private:
+  void SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
+                              const uint8_t comp_mask, const double target_mul,
+                              bool stop_early);
+  // Appends to *output_order the order in which nonzero AC coefficients of
+  // the block at (block_x, block_y) are zeroed, with the block error reached
+  // at each step. *img is restored to its initial state before returning.
+  void ComputeBlockZeroingOrder(
+      const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
+      const int block_x, const int block_y, const int factor_x,
+      const int factor_y, const uint8_t comp_mask, OutputImage* img,
+      std::vector<CoeffData>* output_order);
+  // Searches for a quantization matrix starting from best_q; the selected
+  // matrix is written back to best_q and its output to *quantized_out.
+  // Returns whether the selected matrix passed the distance check.
+  bool SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
+                         int best_q[3][kDCTBlockSize],
+                         GuetzliOutput* quantized_out);
+  // Encodes jpg_in quantized with q, compares the result with comparator_ and
+  // offers the encoded jpeg to MaybeOutput().
+  QuantData TryQuantMatrix(const JPEGData& jpg_in,
+                           const float target_mul,
+                           int q[3][kDCTBlockSize]);
+  // Stores encoded_jpg in *final_output_ if its score is better than the
+  // score of the output stored so far.
+  void MaybeOutput(const std::string& encoded_jpg);
+  // Downsamples *img unless component 1 already has a factor above 1.
+  void DownsampleImage(OutputImage* img);
+  // Serializes in into *out, replacing the previous content of *out.
+  void OutputJpeg(const JPEGData& in, std::string* out);
+
+  Params params_;
+  // NOTE(review): the pointers below are presumably set by ProcessJpegData()
+  // and not owned by this class - confirm against its definition.
+  Comparator* comparator_;
+  GuetzliOutput* final_output_;
+  ProcessStats* stats_;
+};
+
+// Removes the quantization from *jpg: the quantization table used by each of
+// the first three components is copied to q_in, every coefficient is
+// multiplied by its table entry, and the tables saved in *jpg are replaced
+// by tables consisting of ones.
+void RemoveOriginalQuantization(JPEGData* jpg, int q_in[3][kDCTBlockSize]) {
+  for (int comp = 0; comp < 3; ++comp) {
+    JPEGComponent& component = jpg->components[comp];
+    const int* table = &jpg->quant[component.quant_idx].values[0];
+    memcpy(&q_in[comp][0], table, kDCTBlockSize * sizeof(table[0]));
+    for (int j = 0; j < component.coeffs.size(); ++j) {
+      component.coeffs[j] *= table[j % kDCTBlockSize];
+    }
+  }
+  int unit_q[3][kDCTBlockSize];
+  std::fill_n(&unit_q[0][0], 3 * kDCTBlockSize, 1);
+  SaveQuantTables(unit_q, jpg);
+}
+
+// Downsamples *img with the default DownsampleConfig, taking only the
+// use_silver_screen flag from params_. Does nothing if component 1 of *img
+// already has a factor above 1 in either direction.
+void Processor::DownsampleImage(OutputImage* img) {
+  const bool already_downsampled =
+      img->component(1).factor_x() > 1 || img->component(1).factor_y() > 1;
+  if (!already_downsampled) {
+    OutputImage::DownsampleConfig cfg;
+    cfg.use_silver_screen = params_.use_silver_screen;
+    img->Downsample(cfg);
+  }
+}
+
+}  // namespace
+
+// Output callback that appends the count bytes at buf to the std::string
+// that data points to. Returns count.
+int GuetzliStringOut(void* data, const uint8_t* buf, size_t count) {
+  std::string& sink = *static_cast<std::string*>(data);
+  const char* bytes = reinterpret_cast<const char*>(buf);
+  sink.append(bytes, count);
+  return count;
+}
+
+// Serializes jpg into *out, whose previous content is discarded. The metadata
+// handling follows params_.clear_metadata. A failing WriteJpeg() call
+// triggers an assertion.
+void Processor::OutputJpeg(const JPEGData& jpg,
+                           std::string* out) {
+  out->clear();
+  JPEGOutput output(GuetzliStringOut, out);
+  if (WriteJpeg(jpg, params_.clear_metadata, output)) {
+    return;
+  }
+  assert(0);
+}
+
+// Scores encoded_jpg by its size with the comparator and, if no output has
+// been stored yet (negative score) or the new score is lower than the stored
+// one, makes it the final output together with the current distance map of
+// the comparator.
+void Processor::MaybeOutput(const std::string& encoded_jpg) {
+  const double score = comparator_->ScoreOutputSize(encoded_jpg.size());
+  GUETZLI_LOG(stats_, " Score[%.4f]", score);
+  const bool nothing_stored_yet = final_output_->score < 0;
+  if (nothing_stored_yet || score < final_output_->score) {
+    final_output_->jpeg_data = encoded_jpg;
+    final_output_->distmap = comparator_->distmap();
+    final_output_->distmap_aggregate = comparator_->distmap_aggregate();
+    final_output_->score = score;
+    GUETZLI_LOG(stats_, " (*)");
+  }
+  GUETZLI_LOG(stats_, "\n");
+}
+
+// Returns true if a is a better result than b: a result that passed the
+// distance check beats one that did not; otherwise the result with the
+// smaller encoded jpeg wins.
+bool CompareQuantData(const QuantData& a, const QuantData& b) {
+  if (a.dist_ok != b.dist_ok) {
+    return a.dist_ok;
+  }
+  return a.out.jpeg_data.size() < b.out.jpeg_data.size();
+}
+
+// Compares a[0..kBlockSize) and b[0..kBlockSize) vectors, and returns
+//   0 : if they are equal
+//  -1 : if a is everywhere <= than b and in at least one coordinate <
+//   1 : if a is everywhere >= than b and in at least one coordinate >
+//   2 : if a and b are uncomparable (some coordinate smaller and some greater)
+int CompareQuantMatrices(const int* a, const int* b) {
+  bool some_less = false;
+  bool some_greater = false;
+  for (size_t i = 0; i < kBlockSize; ++i) {
+    if (a[i] < b[i]) {
+      some_less = true;
+    } else if (a[i] > b[i]) {
+      some_greater = true;
+    }
+    // Differences in both directions: no need to look any further.
+    if (some_less && some_greater) return 2;
+  }
+  if (some_less) return -1;
+  if (some_greater) return 1;
+  return 0;
+}
+
+// Weight of the coefficient at index k: 1 / (1 + z / 2), where z is the entry
+// of kJPEGZigZagOrder for k.
+double ContrastSensitivity(int k) {
+  const double half_zigzag = kJPEGZigZagOrder[k] / 2.0;
+  return 1.0 / (1.0 + half_zigzag);
+}
+
+// Heuristic score of a quantization matrix: the sum over all entries of
+// half the amount by which the entry exceeds 1, weighted with the contrast
+// sensitivity of the coefficient. A matrix of all ones scores 0.
+double QuantMatrixHeuristicScore(const int q[3][kDCTBlockSize]) {
+  double total = 0.0;
+  for (int c = 0; c < 3; ++c) {
+    const int* table = q[c];
+    for (int k = 0; k < kDCTBlockSize; ++k) {
+      total += 0.5 * (table[k] - 1.0) * ContrastSensitivity(k);
+    }
+  }
+  return total;
+}
+
+// Proposes quantization matrices for a search over the heuristic score
+// (see QuantMatrixHeuristicScore). Results of tried matrices are reported
+// with Add(); they update the bounds hscore_a_ (highest score that passed the
+// distance check) and hscore_b_ (lowest score that failed it), and GetNext()
+// picks the next score to try from those bounds.
+class QuantMatrixGenerator {
+ public:
+  QuantMatrixGenerator(bool downsample, ProcessStats* stats)
+      : downsample_(downsample), hscore_a_(-1.0), hscore_b_(-1.0),
+        total_csf_(0.0), stats_(stats) {
+    for (int k = 0; k < kDCTBlockSize; ++k) {
+      total_csf_ += 3.0 * ContrastSensitivity(k);
+    }
+  }
+
+  // Writes the next matrix to try into q and returns true, or returns false
+  // if the search is finished.
+  bool GetNext(int q[3][kDCTBlockSize]) {
+    // This loop should terminate by return. This 1000 iteration limit is just a
+    // precaution.
+    for (int iter = 0; iter < 1000; iter++) {
+      double hscore;
+      if (hscore_b_ == -1.0) {
+        // No failing matrix seen yet: start at 0.0 (downsample) or at
+        // total_csf_, then step up by total_csf_ from the best passing score.
+        if (hscore_a_ == -1.0) {
+          hscore = downsample_ ? 0.0 : total_csf_;
+        } else {
+          hscore = hscore_a_ + total_csf_;
+        }
+        if (hscore > 100 * total_csf_) {
+          // We could not find a quantization matrix that creates enough
+          // butteraugli error. This can happen if all dct coefficients are
+          // close to zero in the original image.
+          return false;
+        }
+      } else if (hscore_b_ == 0.0) {
+        // Even the matrix with score 0.0 failed; there is nothing lower.
+        return false;
+      } else if (hscore_a_ == -1.0) {
+        // Only failing matrices seen so far: try the lowest score.
+        hscore = 0.0;
+      } else {
+        // Both bounds are known: bisect, and stop once the matrices just
+        // inside the two bounds are identical.
+        int lower_q[3][kDCTBlockSize];
+        int upper_q[3][kDCTBlockSize];
+        constexpr double kEps = 0.05;
+        GetQuantMatrixWithHeuristicScore(
+            (1 - kEps) * hscore_a_ + kEps * 0.5 * (hscore_a_ + hscore_b_),
+            lower_q);
+        GetQuantMatrixWithHeuristicScore(
+            (1 - kEps) * hscore_b_ + kEps * 0.5 * (hscore_a_ + hscore_b_),
+            upper_q);
+        if (CompareQuantMatrices(&lower_q[0][0], &upper_q[0][0]) == 0)
+          return false;
+        hscore = (hscore_a_ + hscore_b_) * 0.5;
+      }
+      GetQuantMatrixWithHeuristicScore(hscore, q);
+      // If this exact matrix was tried before, reuse its result to move the
+      // matching bound to hscore and pick another score.
+      bool retry = false;
+      for (int i = 0; i < quants_.size(); ++i) {
+        if (CompareQuantMatrices(&q[0][0], &quants_[i].q[0][0]) == 0) {
+          if (quants_[i].dist_ok) {
+            hscore_a_ = hscore;
+          } else {
+            hscore_b_ = hscore;
+          }
+          retry = true;
+          break;
+        }
+      }
+      if (!retry) return true;
+    }
+    return false;
+  }
+
+  // Records the result of a tried matrix and tightens the bound that
+  // corresponds to data.dist_ok.
+  void Add(const QuantData& data) {
+    quants_.push_back(data);
+    double hscore = QuantMatrixHeuristicScore(data.q);
+    if (data.dist_ok) {
+      hscore_a_ = std::max(hscore_a_, hscore);
+    } else {
+      hscore_b_ = hscore_b_ == -1.0 ? hscore : std::min(hscore_b_, hscore);
+    }
+  }
+
+ private:
+  // Fills q for the given heuristic score. Every entry is 2 * level + 1 or
+  // 2 * level + 3, where level is the number of whole multiples of total_csf_
+  // in score; the larger value is used for the entries visited while the
+  // remaining score is still positive.
+  void GetQuantMatrixWithHeuristicScore(double score,
+                                        int q[3][kDCTBlockSize]) const {
+    int level = static_cast<int>(score / total_csf_);
+    score -= level * total_csf_;
+    for (int k = kDCTBlockSize - 1; k >= 0; --k) {
+      for (int c = 0; c < 3; ++c) {
+        q[c][kJPEGNaturalOrder[k]] = 2 * level + (score > 0.0 ? 3 : 1);
+      }
+      score -= 3.0 * ContrastSensitivity(kJPEGNaturalOrder[k]);
+    }
+  }
+
+  const bool downsample_;
+  // Lower bound for quant matrix heuristic score used in binary search, or
+  // -1.0 if no lower bound is found yet.
+  double hscore_a_;
+  // Upper bound for quant matrix heuristic score used in binary search, or
+  // -1.0 if no upper bound is found yet.
+  double hscore_b_;
+  // Cached value of the sum of all ContrastSensitivity() values over all
+  // quant matrix elements.
+  double total_csf_;
+  // All results reported through Add().
+  std::vector<QuantData> quants_;
+
+  // Not used by any method of this class.
+  ProcessStats* stats_;
+};
+
+// Tries the quantization matrix q on jpg_in: quantizes a copy of the image
+// with q, encodes it, and compares it using comparator_. The returned
+// QuantData holds q, whether the distance is acceptable for target_mul, and
+// the encoded jpeg with its distance map and score. The encoded jpeg is also
+// offered to MaybeOutput(), and the iteration counter in stats_ is increased.
+QuantData Processor::TryQuantMatrix(const JPEGData& jpg_in,
+                                    const float target_mul,
+                                    int q[3][kDCTBlockSize]) {
+  QuantData data;
+  memcpy(data.q, q, sizeof(data.q));
+  // Build the quantized image and serialize it.
+  OutputImage img(jpg_in.width, jpg_in.height);
+  img.CopyFromJpegData(jpg_in);
+  img.ApplyGlobalQuantization(data.q);
+  JPEGData jpg_out = jpg_in;
+  img.SaveToJpegData(&jpg_out);
+  std::string encoded_jpg;
+  OutputJpeg(jpg_out, &encoded_jpg);
+  // The counter is incremented after logging, hence the "+ 1" below.
+  GUETZLI_LOG(stats_, "Iter %2d: %s quantization matrix:\n",
+              stats_->counters[kNumItersCnt] + 1,
+              img.FrameTypeStr().c_str());
+  GUETZLI_LOG_QUANT(stats_, q);
+  GUETZLI_LOG(stats_, "Iter %2d: %s GQ[%5.2f] Out[%7zd]",
+              stats_->counters[kNumItersCnt] + 1,
+              img.FrameTypeStr().c_str(),
+              QuantMatrixHeuristicScore(q), encoded_jpg.size());
+  ++stats_->counters[kNumItersCnt];
+  // Compare() must run first: the calls below read the state it leaves in
+  // the comparator.
+  comparator_->Compare(img);
+  data.dist_ok = comparator_->DistanceOK(target_mul);
+  data.out.jpeg_data = encoded_jpg;
+  data.out.distmap = comparator_->distmap();
+  data.out.distmap_aggregate = comparator_->distmap_aggregate();
+  data.out.score = comparator_->ScoreOutputSize(encoded_jpg.size());
+  MaybeOutput(encoded_jpg);
+  return data;
+}
+
+// Selects a quantization matrix for jpg_in. The matrix passed in best_q is
+// tried first, then the matrices proposed by a QuantMatrixGenerator. The best
+// result according to CompareQuantData() is kept: its matrix is written back
+// to best_q and its output to *quantized_out. Returns whether the selected
+// matrix passed the distance check.
+bool Processor::SelectQuantMatrix(const JPEGData& jpg_in, const bool downsample,
+                                  int best_q[3][kDCTBlockSize],
+                                  GuetzliOutput* quantized_out) {
+  QuantMatrixGenerator qgen(downsample, stats_);
+  // Don't try to go up to exactly the target distance when selecting a
+  // quantization matrix, since we will need some slack to do the frequency
+  // masking later.
+  const float target_mul_high = 0.97;
+  const float target_mul_low = 0.95;
+
+  QuantData best = TryQuantMatrix(jpg_in, target_mul_high, best_q);
+  int q_next[3][kDCTBlockSize];
+  while (qgen.GetNext(q_next)) {
+    QuantData data = TryQuantMatrix(jpg_in, target_mul_high, q_next);
+    qgen.Add(data);
+    if (!CompareQuantData(data, best)) {
+      continue;
+    }
+    best = data;
+    // Stop as soon as a new best passes at the high target but no longer at
+    // the low one.
+    if (data.dist_ok && !comparator_->DistanceOK(target_mul_low)) {
+      break;
+    }
+  }
+
+  memcpy(&best_q[0][0], &best.q[0][0], kBlockSize * sizeof(best_q[0][0]));
+  *quantized_out = best.out;
+  GUETZLI_LOG(stats_, "\n%s selected quantization matrix:\n",
+              downsample ? "YUV420" : "YUV444");
+  GUETZLI_LOG_QUANT(stats_, best_q);
+  return best.dist_ok;
+}
+
+
+// Determines an order in which the nonzero AC coefficients of the block at
+// (block_x, block_y) can be set to zero, and appends one CoeffData entry per
+// coefficient to *output_order.
+//
+// The candidates (coefficients of the components selected by comp_mask that
+// are nonzero in block) are first sorted by a heuristic score computed from
+// orig_block. Then, repeatedly, the first params_.zeroing_greedy_lookahead
+// remaining candidates are zeroed on trial, and the one with the smallest
+// block error is zeroed for good. The block error of a trial is the maximum
+// of Comparator::CompareBlock() over the factor_x * factor_y image blocks
+// that lie inside the image. Finally the recorded errors are made monotonic
+// and the order is cut off where the comparator's block error limit is
+// exceeded.
+//
+// *img is modified during the search and restored to the state given by
+// block before returning.
+//
+// REQUIRES: block[c*64...(c*64+63)] is all zero if (comp_mask & (1<<c)) == 0.
+void Processor::ComputeBlockZeroingOrder(
+    const coeff_t block[kBlockSize], const coeff_t orig_block[kBlockSize],
+    const int block_x, const int block_y, const int factor_x,
+    const int factor_y, const uint8_t comp_mask, OutputImage* img,
+    std::vector<CoeffData>* output_order) {
+  static const uint8_t oldCsf[kDCTBlockSize] = {
+      10, 10, 20, 40, 60, 70, 80, 90,
+      10, 20, 30, 60, 70, 80, 90, 90,
+      20, 30, 60, 70, 80, 90, 90, 90,
+      40, 60, 70, 80, 90, 90, 90, 90,
+      60, 70, 80, 90, 90, 90, 90, 90,
+      70, 80, 90, 90, 90, 90, 90, 90,
+      80, 90, 90, 90, 90, 90, 90, 90,
+      90, 90, 90, 90, 90, 90, 90, 90,
+  };
+  static const double kWeight[3] = { 1.0, 0.22, 0.20 };
+#include "guetzli/order.inc"
+
+  // Writes the components selected by comp_mask of a 3-component coefficient
+  // block into *img at (block_x, block_y).
+  auto set_image_block = [&](const coeff_t* coeffs) {
+    for (int c = 0; c < 3; ++c) {
+      if (comp_mask & (1 << c)) {
+        img->component(c).SetCoeffBlock(
+            block_x, block_y, &coeffs[c * kDCTBlockSize]);
+      }
+    }
+  };
+
+  // Collect the nonzero AC coefficients (k starts at 1, so DC is skipped)
+  // and sort them by ascending heuristic score.
+  std::vector<std::pair<int, float> > input_order;
+  for (int c = 0; c < 3; ++c) {
+    if (!(comp_mask & (1 << c))) continue;
+    for (int k = 1; k < kDCTBlockSize; ++k) {
+      int idx = c * kDCTBlockSize + k;
+      if (block[idx] != 0) {
+        float score;
+        if (params_.new_zeroing_model) {
+          score = std::abs(orig_block[idx]) * csf[idx] + bias[idx];
+        } else {
+          score = (std::abs(orig_block[idx]) - kJPEGZigZagOrder[k] / 64.0) *
+                  kWeight[c] / oldCsf[k];
+        }
+        input_order.push_back(std::make_pair(idx, score));
+      }
+    }
+  }
+  std::sort(input_order.begin(), input_order.end(),
+            [](const std::pair<int, float>& a, const std::pair<int, float>& b) {
+              return a.second < b.second; });
+
+  coeff_t processed_block[kBlockSize];
+  memcpy(processed_block, block, sizeof(processed_block));
+  while (!input_order.empty()) {
+    // Always evaluate at least one candidate, even if the configured
+    // lookahead is 0; otherwise no candidate would be selected below.
+    const size_t num_candidates = std::max<size_t>(
+        1, std::min<size_t>(params_.zeroing_greedy_lookahead,
+                            input_order.size()));
+    float best_err = 1e17;
+    int best_i = -1;
+    for (size_t i = 0; i < num_candidates; ++i) {
+      coeff_t candidate_block[kBlockSize];
+      memcpy(candidate_block, processed_block, sizeof(candidate_block));
+      const int idx = input_order[i].first;
+      candidate_block[idx] = 0;
+      set_image_block(candidate_block);
+      float max_err = 0;
+      for (int iy = 0; iy < factor_y; ++iy) {
+        for (int ix = 0; ix < factor_x; ++ix) {
+          int block_xx = block_x * factor_x + ix;
+          int block_yy = block_y * factor_y + iy;
+          if (8 * block_xx < img->width() && 8 * block_yy < img->height()) {
+            float err = comparator_->CompareBlock(*img, block_xx, block_yy);
+            max_err = std::max(max_err, err);
+          }
+        }
+      }
+      // The first candidate is always accepted, so that best_i is a valid
+      // index even if every error is huge (1e17 or more, e.g. infinite).
+      if (best_i < 0 || max_err < best_err) {
+        best_err = max_err;
+        best_i = static_cast<int>(i);
+      }
+    }
+    int idx = input_order[best_i].first;
+    processed_block[idx] = 0;
+    input_order.erase(input_order.begin() + best_i);
+    output_order->push_back({idx, best_err});
+    set_image_block(processed_block);
+  }
+  // Make the block error values monotonic.
+  float min_err = 1e10;
+  for (int i = output_order->size() - 1; i >= 0; --i) {
+    min_err = std::min(min_err, (*output_order)[i].block_err);
+    (*output_order)[i].block_err = min_err;
+  }
+  // Cut off at the block error limit.
+  int num = 0;
+  while (num < output_order->size() &&
+         (*output_order)[num].block_err <= comparator_->BlockErrorLimit()) {
+    ++num;
+  }
+  output_order->resize(num);
+  // Restore *img to the same state as it was at the start of this function.
+  set_image_block(block);
+}
+
+namespace {
+
+// Adds `weight` to *ac_histogram for each AC symbol of one block, walking
+// kJPEGNaturalOrder: (zero run << 4) + bit count per non-zero coefficient,
+// 0xf0 per 16 zeros skipped before it, and 0 if the block ends in zeros.
+void UpdateACHistogram(const int weight,
+                       const coeff_t* coeffs,
+                       const int* q,
+                       JpegHistogram* ac_histogram) {
+  int zero_run = 0;
+  for (int k = 1; k < 64; ++k) {
+    const int pos = kJPEGNaturalOrder[k];
+    const coeff_t value = coeffs[pos];
+    if (value != 0) {
+      for (; zero_run > 15; zero_run -= 16) {
+        ac_histogram->Add(0xf0, weight);
+      }
+      const int magnitude = std::abs(value / q[pos]);
+      const int nbits = Log2FloorNonZero(magnitude) + 1;
+      ac_histogram->Add((zero_run << 4) + nbits, weight);
+      zero_run = 0;
+    } else {
+      ++zero_run;
+    }
+  }
+  if (zero_run > 0) ac_histogram->Add(0, weight);
+}
+
+// Clusters a copy of `histograms`, stores each input histogram's cluster code
+// depths in *depths and returns the sum of HistogramHeaderCost(cluster) / 8.
+size_t ComputeEntropyCodes(const std::vector<JpegHistogram>& histograms,
+                           std::vector<uint8_t>* depths) {
+  const size_t kSize = JpegHistogram::kSize;
+  size_t num_clusters = histograms.size();
+  std::vector<JpegHistogram> clusters(histograms);
+  std::vector<int> owner(histograms.size());
+  std::vector<uint8_t> packed(histograms.size() * kSize);
+  ClusterHistograms(&clusters[0], &num_clusters, &owner[0], &packed[0]);
+  depths->resize(packed.size());
+  for (size_t i = 0; i < histograms.size(); ++i) {
+    memcpy(&(*depths)[i * kSize], &packed[owner[i] * kSize], kSize);
+  }
+  size_t header_bytes = 0;
+  for (size_t i = 0; i < num_clusters; ++i) {
+    header_bytes += HistogramHeaderCost(clusters[i]) / 8;
+  }
+  return header_bytes;
+}
+
+// Returns the HistogramEntropyCost bit total of `histograms` in whole bytes.
+size_t EntropyCodedDataSize(const std::vector<JpegHistogram>& histograms,
+                            const std::vector<uint8_t>& depths) {
+  size_t total_bits = 0;
+  for (size_t h = 0; h < histograms.size(); ++h)
+    total_bits += HistogramEntropyCost(histograms[h],
+                                       &depths[h * JpegHistogram::kSize]);
+  return (total_bits + 7) / 8;
+}
+
+size_t EstimateDCSize(const JPEGData& jpg) {
+  size_t count = jpg.components.size();  // one DC histogram per component
+  std::vector<JpegHistogram> dc_histos(count);
+  std::vector<int> ids(count);  // cluster outputs are scratch, never read
+  std::vector<uint8_t> depths(count * JpegHistogram::kSize);
+  BuildDCHistograms(jpg, &dc_histos[0]);
+  return ClusterHistograms(&dc_histos[0], &count, &ids[0], &depths[0]);
+}
+
+}  // namespace
+// Iteratively zeroes, then restores, coefficients of the comp_mask components.
+void Processor::SelectFrequencyMasking(const JPEGData& jpg, OutputImage* img,
+                                       const uint8_t comp_mask,
+                                       const double target_mul,
+                                       bool stop_early) {
+  const int width = img->width();
+  const int height = img->height();
+  const int last_c = Log2FloorNonZero(comp_mask);  // presumably top set bit
+  if (last_c >= jpg.components.size()) return;  // component not in jpg
+  const int factor_x = img->component(last_c).factor_x();
+  const int factor_y = img->component(last_c).factor_y();
+  const int block_width = (width + 8 * factor_x - 1) / (8 * factor_x);
+  const int block_height = (height + 8 * factor_y - 1) / (8 * factor_y);
+  const int num_blocks = block_width * block_height;
+  // Candidate zeroing order for every block, built from current coefficients.
+  std::vector<std::vector<CoeffData> > orders(num_blocks);
+  for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+    for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+      coeff_t block[kBlockSize] = { 0 };  // read from img
+      coeff_t orig_block[kBlockSize] = { 0 };  // read from jpg
+      for (int c = 0; c < 3; ++c) {
+        if (comp_mask & (1 << c)) {
+          assert(img->component(c).factor_x() == factor_x);
+          assert(img->component(c).factor_y() == factor_y);
+          img->component(c).GetCoeffBlock(block_x, block_y,
+                                          &block[c * kDCTBlockSize]);
+          const JPEGComponent& comp = jpg.components[c];
+          int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
+          memcpy(&orig_block[c * kDCTBlockSize],
+                 &comp.coeffs[jpg_block_ix * kDCTBlockSize],
+                 kDCTBlockSize * sizeof(orig_block[0]));
+        }
+      }
+      ComputeBlockZeroingOrder(block, orig_block, block_x, block_y, factor_x,
+                               factor_y, comp_mask, img,
+                               &orders[block_ix]);
+    }
+  }
+  // Size estimate of the starting state: header + DC + AC codes + AC data.
+  JPEGData jpg_out = jpg;
+  img->SaveToJpegData(&jpg_out);
+  const int jpg_header_size = JpegHeaderSize(jpg_out, params_.clear_metadata);
+  const int dc_size = EstimateDCSize(jpg_out);
+  std::vector<JpegHistogram> ac_histograms(jpg_out.components.size());
+  BuildACHistograms(jpg_out, &ac_histograms[0]);
+  std::vector<uint8_t> ac_depths;
+  int ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
+  int base_size = jpg_header_size + dc_size + ac_histogram_size +
+      EntropyCodedDataSize(ac_histograms, ac_depths);
+  int prev_size = base_size;
+  // Per-block error budget and position in its order; distmap starts at 0.
+  std::vector<float> max_block_error(num_blocks);
+  std::vector<int> last_indexes(num_blocks);
+  std::vector<float> distmap(width * height);
+  // direction 1 zeroes coefficients ("up"); -1 restores them ("down").
+  bool first_up_iter = true;
+  for (int direction : {1, -1}) {
+    for (;;) {
+      if (stop_early && direction == -1) {
+        if (prev_size > 1.01 * final_output_->jpeg_data.size()) {
+          // If we are down-adjusting the error, the output size will only keep
+          // increasing.
+          // TODO(user): Do this check always by comparing only the size
+          // of the currently processed components.
+          break;
+        }
+      }
+      std::vector<std::pair<int, float> > global_order;
+      int blocks_to_change;
+      std::vector<float> block_weight;
+      for (int rblock = 1; rblock <= 4; ++rblock) {  // try radius 1..4
+        block_weight = std::vector<float>(num_blocks);
+        comparator_->ComputeBlockErrorAdjustmentWeights(
+            direction, rblock, target_mul, factor_x, factor_y, distmap,
+            &block_weight);
+        global_order.clear();
+        blocks_to_change = 0;
+        for (int block_y = 0, block_ix = 0; block_y < block_height; ++block_y) {
+          for (int block_x = 0; block_x < block_width; ++block_x, ++block_ix) {
+            const int last_index = last_indexes[block_ix];
+            const std::vector<CoeffData>& order = orders[block_ix];
+            const float max_err = max_block_error[block_ix];
+            if (block_weight[block_ix] == 0) {  // also avoids dividing by 0
+              continue;
+            }
+            if (direction > 0) {
+              for (int i = last_index; i < order.size(); ++i) {
+                float val = ((order[i].block_err - max_err) /
+                             block_weight[block_ix]);
+                global_order.push_back(std::make_pair(block_ix, val));
+              }
+              blocks_to_change += (last_index < order.size() ? 1 : 0);
+            } else {
+              for (int i = last_index - 1; i >= 0; --i) {
+                float val = ((max_err - order[i].block_err) /
+                             block_weight[block_ix]);
+                global_order.push_back(std::make_pair(block_ix, val));
+              }
+              blocks_to_change += (last_index > 0 ? 1 : 0);
+            }
+          }
+        }
+        if (!global_order.empty()) {
+          // If we found something to adjust with the current block adjustment
+          // radius, we can stop and adjust the blocks we have.
+          break;
+        }
+      }
+      // No radius produced a candidate: this direction is finished.
+      if (global_order.empty()) {
+        break;
+      }
+      // Smallest weighted error change first.
+      std::sort(global_order.begin(), global_order.end(),
+                [](const std::pair<int, float>& a,
+                   const std::pair<int, float>& b) {
+                  return a.second < b.second; });
+      // An iteration ends once the size estimate moved by more than this...
+      double rel_size_delta = direction > 0 ? 0.01 : 0.0005;
+      if (direction > 0 && comparator_->DistanceOK(1.0)) {
+        rel_size_delta = 0.05;
+      }
+      size_t min_size_delta = base_size * rel_size_delta;
+      // ...and more than this many coefficients were changed.
+      float coeffs_to_change_per_block =
+          direction > 0 ? 2.0 : factor_x * factor_y * 0.2;
+      int min_coeffs_to_change = coeffs_to_change_per_block * blocks_to_change;
+      // Very first iteration: also cover all entries below 0.75 * error limit.
+      if (first_up_iter) {
+        const float limit = 0.75 * comparator_->BlockErrorLimit();
+        auto it = std::partition_point(global_order.begin(), global_order.end(),
+                                       [=](const std::pair<int, float>& a) {
+                                         return a.second < limit; });
+        min_coeffs_to_change = std::max<int>(min_coeffs_to_change,
+                                             it - global_order.begin());
+        first_up_iter = false;
+      }
+      // Apply entries in order, updating AC histograms and the size estimate.
+      std::set<int> changed_blocks;
+      float val_threshold = 0.0;
+      int changed_coeffs = 0;
+      int est_jpg_size = prev_size;
+      for (int i = 0; i < global_order.size(); ++i) {
+        const int block_ix = global_order[i].first;
+        const int block_x = block_ix % block_width;
+        const int block_y = block_ix / block_width;
+        const int last_idx = last_indexes[block_ix];
+        const std::vector<CoeffData>& order = orders[block_ix];
+        const int idx = order[last_idx + std::min(direction, 0)].idx;
+        const int c = idx / kDCTBlockSize;
+        const int k = idx % kDCTBlockSize;
+        const int* quant = img->component(c).quant();
+        const JPEGComponent& comp = jpg.components[c];
+        const int jpg_block_ix = block_y * comp.width_in_blocks + block_x;
+        const int newval = direction > 0 ? 0 : Quantize(
+            comp.coeffs[jpg_block_ix * kDCTBlockSize + k], quant[k]);
+        coeff_t block[kDCTBlockSize] = { 0 };
+        img->component(c).GetCoeffBlock(block_x, block_y, block);
+        UpdateACHistogram(-1, block, quant, &ac_histograms[c]);  // drop old
+        block[k] = newval;
+        UpdateACHistogram(1, block, quant, &ac_histograms[c]);  // add new
+        img->component(c).SetCoeffBlock(block_x, block_y, block);
+        last_indexes[block_ix] += direction;
+        changed_blocks.insert(block_ix);
+        val_threshold = global_order[i].second;
+        ++changed_coeffs;
+        static const int kEntropyCodeUpdateFreq = 10;
+        if (i % kEntropyCodeUpdateFreq == 0) {
+          ac_histogram_size = ComputeEntropyCodes(ac_histograms, &ac_depths);
+        }
+        est_jpg_size = jpg_header_size + dc_size + ac_histogram_size +
+            EntropyCodedDataSize(ac_histograms, ac_depths);
+        if (changed_coeffs > min_coeffs_to_change &&
+            std::abs(est_jpg_size - prev_size) > min_size_delta) {
+          break;
+        }
+      }
+      // Move every block's error budget by the last value, scaled by weight.
+      for (int i = 0; i < num_blocks; ++i) {
+        max_block_error[i] += block_weight[i] * val_threshold * direction;
+      }
+      // Count the iteration, re-encode, log, and refresh the distance map.
+      ++stats_->counters[kNumItersCnt];
+      ++stats_->counters[direction > 0 ? kNumItersUpCnt : kNumItersDownCnt];
+      JPEGData jpg_out = jpg;  // shadows the outer jpg_out
+      img->SaveToJpegData(&jpg_out);
+      std::string encoded_jpg;
+      OutputJpeg(jpg_out, &encoded_jpg);
+      GUETZLI_LOG(stats_,
+                  "Iter %2d: %s(%d) %s Coeffs[%d/%zd] "
+                  "Blocks[%zd/%d/%d] ValThres[%.4f] Out[%7zd] EstErr[%.2f%%]",
+                  stats_->counters[kNumItersCnt], img->FrameTypeStr().c_str(),
+                  comp_mask, direction > 0 ? "up" : "down", changed_coeffs,
+                  global_order.size(), changed_blocks.size(),
+                  blocks_to_change, num_blocks, val_threshold,
+                  encoded_jpg.size(),
+                  100.0 - (100.0 * est_jpg_size) / encoded_jpg.size());
+      comparator_->Compare(*img);
+      MaybeOutput(encoded_jpg);
+      distmap = comparator_->distmap();
+      prev_size = est_jpg_size;
+    }
+  }
+}
+
+// True when every coefficient of components 1 and 2 of `jpg` is zero.
+bool IsGrayscale(const JPEGData& jpg) {
+  for (int c = 1; c < 3; ++c) {
+    for (const auto& coeff : jpg.components[c].coeffs) {
+      if (coeff != 0) return false;
+    }
+  }
+  return true;
+}
+// Optimizes jpg_in into *out; returns false only when the input is rejected.
+bool Processor::ProcessJpegData(const Params& params, const JPEGData& jpg_in,
+                                Comparator* comparator, GuetzliOutput* out,
+                                ProcessStats* stats) {
+  params_ = params;
+  comparator_ = comparator;
+  final_output_ = out;
+  stats_ = stats;
+  // Reject targets and inputs this encoder does not handle.
+  if (params.butteraugli_target > 2.0f) {
+    fprintf(stderr,
+            "Guetzli should be called with quality >= 84, otherwise the\n"
+            "output will have noticeable artifacts. If you want to\n"
+            "proceed anyway, please edit the source code.\n");
+    return false;
+  }
+  if (jpg_in.components.size() != 3 || !HasYCbCrColorSpace(jpg_in)) {
+    fprintf(stderr, "Only YUV color space input jpeg is supported\n");
+    return false;
+  }
+  bool input_is_420;
+  if (jpg_in.Is444()) {
+    input_is_420 = false;
+  } else if (jpg_in.Is420()) {
+    input_is_420 = true;
+  } else {
+    fprintf(stderr, "Unsupported sampling factors:");
+    for (int i = 0; i < jpg_in.components.size(); ++i) {
+      fprintf(stderr, " %dx%d", jpg_in.components[i].h_samp_factor,
+              jpg_in.components[i].v_samp_factor);
+    }
+    fprintf(stderr, "\n");
+    return false;
+  }
+  JPEGData jpg = jpg_in;
+  int q_in[3][kDCTBlockSize];  // passed to RemoveOriginalQuantization below
+  // Output the original image, in case we do not manage to create anything
+  // with a good enough quality.
+  std::string encoded_jpg;
+  OutputJpeg(jpg, &encoded_jpg);
+  final_output_->score = -1;
+  GUETZLI_LOG(stats, "Original Out[%7zd]", encoded_jpg.size());
+  if (jpg.width < 2 || jpg.height < 2) {
+    GUETZLI_LOG(stats, " <image too small for Butteraugli>\n");
+    final_output_->jpeg_data = encoded_jpg;
+    final_output_->distmap = std::vector<float>(jpg.width * jpg.height, 0.0);
+    final_output_->distmap_aggregate = 0;
+    final_output_->score = encoded_jpg.size();
+    // Butteraugli doesn't work with images this small.
+    return true;
+  }
+  RemoveOriginalQuantization(&jpg, q_in);
+  OutputImage img(jpg.width, jpg.height);
+  img.CopyFromJpegData(jpg);
+  comparator_->Compare(img);
+  MaybeOutput(encoded_jpg);
+  if (jpg.width < 32 || jpg.height < 32) {  // no search for small images
+    return true;
+  }
+  int try_420 = (input_is_420 || params_.force_420 ||
+                 (params_.try_420 && !IsGrayscale(jpg))) ? 1 : 0;
+  int force_420 = (input_is_420 || params_.force_420) ? 1 : 0;
+  for (int downsample = force_420; downsample <= try_420; ++downsample) {
+    OutputImage img(jpg.width, jpg.height);  // shadows the outer img
+    img.CopyFromJpegData(jpg);
+    JPEGData tmp_jpg;
+    if (downsample) {
+      DownsampleImage(&img);
+      img.SaveToJpegData(&tmp_jpg);
+    } else {
+      tmp_jpg = jpg;
+    }
+    int best_q[3][kDCTBlockSize];
+    memcpy(best_q, q_in, sizeof(best_q));
+    GuetzliOutput quantized_out;
+    if (!SelectQuantMatrix(tmp_jpg, downsample, best_q, &quantized_out)) {
+      for (int c = 0; c < 3; ++c) {
+        for (int i = 0; i < kDCTBlockSize; ++i) {
+          best_q[c][i] = 1;  // fall back to all-ones tables
+        }
+      }
+    }
+    img.ApplyGlobalQuantization(best_q);
+    // Not downsampled: mask 7 (all components); else mask 1, then mask 6.
+    if (!downsample) {
+      SelectFrequencyMasking(tmp_jpg, &img, 7, 1.0, false);
+    } else {
+      const float ymul = tmp_jpg.components.size() == 1 ? 1.0 : 0.97;
+      SelectFrequencyMasking(tmp_jpg, &img, 1, ymul, false);
+      SelectFrequencyMasking(tmp_jpg, &img, 6, 1.0, true);
+    }
+  }
+
+  return true;
+}
+
+// Convenience wrapper: runs the search on a temporary Processor.
+bool ProcessJpegData(const Params& params, const JPEGData& jpg_in,
+                     Comparator* comparator, GuetzliOutput* out,
+                     ProcessStats* stats) {
+  return Processor().ProcessJpegData(params, jpg_in, comparator, out, stats);
+}
+
+// Optimizes the JPEG file contents in `data` and stores the result in
+// *jpg_out (`stats` may be null). False if `data` is unreadable or on failure.
+bool Process(const Params& params, ProcessStats* stats,
+             const std::string& data,
+             std::string* jpg_out) {
+  JPEGData jpg;
+  if (!ReadJpeg(data, JPEG_READ_ALL, &jpg)) {
+    fprintf(stderr, "Can't read jpg data from input file\n");
+    return false;
+  }
+  ProcessStats local_stats;
+  if (stats == nullptr) stats = &local_stats;
+  const std::vector<uint8_t> pixels = DecodeJpegToRGB(jpg);
+  ButteraugliComparator comparator(jpg.width, jpg.height, pixels,
+                                   params.butteraugli_target, stats);
+  GuetzliOutput result;
+  const bool ok = ProcessJpegData(params, jpg, &comparator, &result, stats);
+  *jpg_out = result.jpeg_data;
+  return ok;
+}
+
+// Encodes the w x h pixels in `rgb` to a JPEG, optimizes it against those
+// pixels and stores the result in *jpg_out (`stats` may be null).
+bool Process(const Params& params, ProcessStats* stats,
+             const std::vector<uint8_t>& rgb, int w, int h,
+             std::string* jpg_out) {
+  JPEGData jpg;
+  if (!EncodeRGBToJpeg(rgb, w, h, &jpg)) {
+    fprintf(stderr, "Could not create jpg data from rgb pixels\n");
+    return false;
+  }
+  ProcessStats local_stats;
+  if (stats == nullptr) stats = &local_stats;
+  ButteraugliComparator comparator(jpg.width, jpg.height, rgb,
+                                   params.butteraugli_target, stats);
+  GuetzliOutput result;
+  const bool ok = ProcessJpegData(params, jpg, &comparator, &result, stats);
+  *jpg_out = result.jpeg_data;
+  return ok;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/processor.h b/guetzli/processor.h
new file mode 100755
index 00000000..bdbe5302
--- /dev/null
+++ b/guetzli/processor.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_PROCESSOR_H_
+#define GUETZLI_PROCESSOR_H_
+
+#include <string>
+#include <vector>
+
+#include "guetzli/comparator.h"
+#include "guetzli/jpeg_data.h"
+#include "guetzli/stats.h"
+
+namespace guetzli {
+
+struct Params {
+  float butteraugli_target = 1.0;  // comparator target; > 2.0 is rejected
+  bool clear_metadata = false;  // forwarded to JpegHeaderSize()
+  bool try_420 = false;  // also try a downsampled encode of non-gray input
+  bool force_420 = false;  // only do the downsampled encode
+  bool use_silver_screen = false;  // NOTE(review): use not seen; confirm
+  int zeroing_greedy_lookahead = 3;  // candidates tried per zeroing step
+  bool new_zeroing_model = true;  // picks the csf/bias scoring formula
+};
+
+bool Process(const Params& params, ProcessStats* stats,
+             const std::string& in_data,
+             std::string* out_data);
+
+struct GuetzliOutput {
+  std::string jpeg_data;  // encoded JPEG bytes
+  std::vector<float> distmap;  // presumably one distance per pixel; confirm
+  double distmap_aggregate;  // no default: indeterminate until assigned
+  double score;  // no default; Processor::ProcessJpegData starts it at -1
+};
+
+bool ProcessJpegData(const Params& params, const JPEGData& jpg_in,
+                     Comparator* comparator, GuetzliOutput* out,
+                     ProcessStats* stats);
+
+// Sets *out to a jpeg encoded string that will decode to an image that is
+// visually indistinguishable from the input rgb image.
+bool Process(const Params& params, ProcessStats* stats,
+             const std::vector<uint8_t>& rgb, int w, int h,
+             std::string* out);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_PROCESSOR_H_
diff --git a/guetzli/quality.cc b/guetzli/quality.cc
new file mode 100755
index 00000000..681c0cb1
--- /dev/null
+++ b/guetzli/quality.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/quality.h"
+
+namespace guetzli {
+
+namespace {
+
+constexpr int kLowestQuality = 70;  // quality of kScoreForQuality[0]
+constexpr int kHighestQuality = 110;  // upper clamp for the quality argument
+
+// Butteraugli scores that correspond to JPEG quality levels, starting at
+// kLowestQuality. They were computed by taking median BA scores of JPEGs
+// generated using libjpeg-turbo at given quality from a set of PNGs.
+// The scores above quality level 100 are just linearly decreased so that score
+// for 110 is 90% of the score for 100.
+const double kScoreForQuality[] = {
+  2.810761,  // 70
+  2.729300,
+  2.689687,
+  2.636811,
+  2.547863,
+  2.525400,
+  2.473416,
+  2.366133,
+  2.338078,
+  2.318654,
+  2.201674,  // 80
+  2.145517,
+  2.087322,
+  2.009328,
+  1.945456,
+  1.900112,
+  1.805701,
+  1.750194,
+  1.644175,
+  1.562165,
+  1.473608,  // 90
+  1.382021,
+  1.294298,
+  1.185402,
+  1.066781,
+  0.971769,  // 95
+  0.852901,
+  0.724544,
+  0.611302,
+  0.443185,
+  0.211578,  // 100
+  0.209462,
+  0.207346,
+  0.205230,
+  0.203114,
+  0.200999,  // 105
+  0.198883,
+  0.196767,
+  0.194651,
+  0.192535,
+  0.190420,  // 110
+  0.190420,  // padding: interpolation at quality 110 reads [index + 1]
+};
+
+}  // namespace
+
+// Interpolates kScoreForQuality linearly at `quality`, clamped to [70, 110].
+double ButteraugliScoreForQuality(double quality) {
+  const double q = quality < kLowestQuality ? kLowestQuality :
+                   quality > kHighestQuality ? kHighestQuality : quality;
+  const int slot = static_cast<int>(q) - kLowestQuality;
+  const double t = q - static_cast<int>(q);
+  return kScoreForQuality[slot] * (1 - t) + kScoreForQuality[slot + 1] * t;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/quality.h b/guetzli/quality.h
new file mode 100755
index 00000000..a7ede82b
--- /dev/null
+++ b/guetzli/quality.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_QUALITY_H_
+#define GUETZLI_QUALITY_H_
+
+namespace guetzli {
+
+double ButteraugliScoreForQuality(double quality);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_QUALITY_H_
diff --git a/guetzli/quantize.cc b/guetzli/quantize.cc
new file mode 100755
index 00000000..f8c25ace
--- /dev/null
+++ b/guetzli/quantize.cc
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/quantize.h"
+
+namespace guetzli {
+
+// Quantizes all of `block` in place with q[]; true if any value changed.
+bool QuantizeBlock(coeff_t block[kDCTBlockSize], const int q[kDCTBlockSize]) {
+  bool any_changed = false;
+  for (int k = 0; k < kDCTBlockSize; ++k) {
+    const coeff_t before = block[k];
+    block[k] = Quantize(before, q[k]);
+    if (block[k] != before) any_changed = true;
+  }
+  return any_changed;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/quantize.h b/guetzli/quantize.h
new file mode 100755
index 00000000..2e9a189d
--- /dev/null
+++ b/guetzli/quantize.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_QUANTIZE_H_
+#define GUETZLI_QUANTIZE_H_
+
+#include "guetzli/jpeg_data.h"
+
+namespace guetzli {
+
+// Rounds raw_coeff to the nearest multiple of quant (> 0); ties go toward 0.
+inline coeff_t Quantize(coeff_t raw_coeff, int quant) {
+  const int rem = raw_coeff % quant;
+  const int step = 2 * rem > quant ? quant : -2 * rem > quant ? -quant : 0;
+  return raw_coeff + static_cast<coeff_t>(step - rem);
+}
+
+bool QuantizeBlock(coeff_t block[kDCTBlockSize], const int q[kDCTBlockSize]);
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_QUANTIZE_H_
diff --git a/guetzli/score.cc b/guetzli/score.cc
new file mode 100755
index 00000000..542cfc6d
--- /dev/null
+++ b/guetzli/score.cc
@@ -0,0 +1,43 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "guetzli/score.h"
+
+#include <cmath>
+
+namespace guetzli {
+
+// Scores a candidate JPEG. At or below the target distance the score is just
+// `size`. Above it, `size` is multiplied by exp(50 * overshoot); once that
+// exponent would exceed 10 the score is 1e30 * exp(10) * overshoot + size.
+double ScoreJPEG(double butteraugli_distance, int size,
+                 double butteraugli_target) {
+  constexpr double kScale = 50;
+  constexpr double kMaxExponent = 10;
+  constexpr double kLargeSize = 1e30;
+  // TODO(user): The score should also depend on distance below target (and be
+  // smooth).
+  const double overshoot = butteraugli_distance - butteraugli_target;
+  if (overshoot <= 0.0) return size;
+  const double exponent = kScale * overshoot;
+  if (exponent > kMaxExponent) {
+    // Capped branch: grows linearly with the overshoot instead of exploding.
+    return kLargeSize * std::exp(kMaxExponent) * overshoot + size;
+  }
+  return std::exp(exponent) * size;
+}
+
+}  // namespace guetzli
diff --git a/guetzli/score.h b/guetzli/score.h
new file mode 100755
index 00000000..e4207a95
--- /dev/null
+++ b/guetzli/score.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_SCORE_H_
+#define GUETZLI_SCORE_H_
+
+#include <vector>
+
+namespace guetzli {
+
+double ScoreJPEG(double butteraugli_distance, int size,
+                 double butteraugli_target);
+
+}  // namespace guetzli
+#endif  // GUETZLI_SCORE_H_
diff --git a/guetzli/stats.h b/guetzli/stats.h
new file mode 100755
index 00000000..031d9c4f
--- /dev/null
+++ b/guetzli/stats.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright 2016 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef GUETZLI_STATS_H_
+#define GUETZLI_STATS_H_
+
+#include <cstdio>
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+
+
+namespace guetzli {
+// Keys into ProcessStats::counters. const pointers: not reassignable by users.
+static const char* const kNumItersCnt = "number of iterations";
+static const char* const kNumItersUpCnt = "number of iterations up";
+static const char* const kNumItersDownCnt = "number of iterations down";
+
+struct ProcessStats {
+  ProcessStats() {}
+  std::map<std::string, int> counters;  // named counters, e.g. iteration counts
+  std::string* debug_output = nullptr;  // presumably a log sink; may be null
+  FILE* debug_output_file = nullptr;  // presumably a log sink; may be null
+  // NOTE(review): nothing in this header reads or writes `filename`.
+  std::string filename;
+};
+
+}  // namespace guetzli
+
+#endif  // GUETZLI_STATS_H_
diff --git a/png.BUILD b/png.BUILD
new file mode 100755
index 00000000..9ff982bc
--- /dev/null
+++ b/png.BUILD
@@ -0,0 +1,33 @@
+# Description:
+#   libpng is the official PNG reference library.
+
+licenses(["notice"])  # BSD/MIT-like license
+
+cc_library(
+    name = "png",
+    srcs = [
+        "png.c",
+        "pngerror.c",
+        "pngget.c",
+        "pngmem.c",
+        "pngpread.c",
+        "pngread.c",
+        "pngrio.c",
+        "pngrtran.c",
+        "pngrutil.c",
+        "pngset.c",
+        "pngtrans.c",
+        "pngwio.c",
+        "pngwrite.c",
+        "pngwtran.c",
+        "pngwutil.c",
+    ],
+    hdrs = [
+        "png.h",
+        "pngconf.h",
+    ],
+    includes = ["."],  # put this package's directory on the include path
+    linkopts = ["-lm"],  # link against the C math library
+    visibility = ["//visibility:public"],
+    deps = ["@zlib_archive//:zlib"],  # zlib target of the zlib_archive repo
+)
diff --git a/zlib.BUILD b/zlib.BUILD
new file mode 100755
index 00000000..edb77fdf
--- /dev/null
+++ b/zlib.BUILD
@@ -0,0 +1,36 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # BSD/MIT-like license (for zlib)
+
+cc_library(
+    name = "zlib",
+    srcs = [  # internal headers are listed here rather than in hdrs
+        "adler32.c",
+        "compress.c",
+        "crc32.c",
+        "crc32.h",
+        "deflate.c",
+        "deflate.h",
+        "gzclose.c",
+        "gzguts.h",
+        "gzlib.c",
+        "gzread.c",
+        "gzwrite.c",
+        "infback.c",
+        "inffast.c",
+        "inffast.h",
+        "inffixed.h",
+        "inflate.c",
+        "inflate.h",
+        "inftrees.c",
+        "inftrees.h",
+        "trees.c",
+        "trees.h",
+        "uncompr.c",
+        "zconf.h",
+        "zutil.c",
+        "zutil.h",
+    ],
+    hdrs = ["zlib.h"],  # the only header declared as public interface
+    includes = ["."],  # put this package's directory on the include path
+)