diff --git a/third_party/lib_nn b/third_party/lib_nn
index c383d28c4..80749b260 160000
--- a/third_party/lib_nn
+++ b/third_party/lib_nn
@@ -1 +1 @@
-Subproject commit c383d28c45a4556ed9528c3a34a0d3e293a9360b
+Subproject commit 80749b2609004f96e6dac26951a57a02dd8fa92b
diff --git a/third_party/lib_tflite_micro b/third_party/lib_tflite_micro
index b2409cb7c..51aa1f05f 160000
--- a/third_party/lib_tflite_micro
+++ b/third_party/lib_tflite_micro
@@ -1 +1 @@
-Subproject commit b2409cb7ccedf4808533b92b797e257cfe067960
+Subproject commit 51aa1f05fdc3c4100a38faef2b3da3f6281ab075
diff --git a/xformer/IR/XCoreOps.td b/xformer/IR/XCoreOps.td
index a7944e769..8a6dc12e8 100644
--- a/xformer/IR/XCoreOps.td
+++ b/xformer/IR/XCoreOps.td
@@ -197,6 +197,24 @@ def XC_Beta_FcF32Op : XC_Op<"beta_fcf32", [Pure]> {
   let results = (outs TensorOf<[F32]> : $output);
 }
 
+def XC_MaxPool2DOp : XC_Op<"maxpool2d", [Pure]> {
+  let summary = "MaxPool2D op";
+
+  let description = [{MaxPool2D op.}];
+
+  let arguments = (ins
+    TensorOf<[QI8]>:$input,
+    StrAttr:$memcpy_fn_param,            // serialized memcpy-fn params
+    StrAttr:$aggregate_fn_param,         // serialized aggregate-fn params
+    StrAttr:$output_transform_fn_param,  // serialized output-transform params
+    I32Attr:$scratch_bytes,              // scratch memory size, in bytes
+    I32Attr:$thread_count,               // number of kernel-param entries used
+    StrArrayAttr:$abstract_kernel_params // one serialized entry per thread
+  );
+
+  let results = (outs TensorOf<[QI8]> : $output);
+}
+
 def XC_Conv2DV2Op : XC_Op<"conv2d_v2", [Pure]> {
   let summary = "Conv2D V2 op";
 
diff --git a/xformer/Transforms/Passes.cpp b/xformer/Transforms/Passes.cpp
index a53154bf4..eb5e9de31 100644
--- a/xformer/Transforms/Passes.cpp
+++ b/xformer/Transforms/Passes.cpp
@@ -30,6 +30,7 @@ void buildXCorePassPipeline(OpPassManager &pm) {
 
   // XC passes
   pm.addPass(createReplaceAddPass());
+  pm.addPass(createReplaceMaxPoolPass());
   pm.addPass(createReplaceMulPass());
   pm.addPass(createReplaceStridedSlicePass());
   pm.addPass(createReplaceConv2DPass());
diff --git a/xformer/Transforms/Passes.h b/xformer/Transforms/Passes.h
index 5b813e82c..b1d240397 100644
--- a/xformer/Transforms/Passes.h
+++ b/xformer/Transforms/Passes.h
@@ -31,6 +31,7 @@ std::unique_ptr<OperationPass<func::FuncOp>> createOpSplitPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createApplyTFLPatternsPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceAddPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMulPass();
+std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMaxPoolPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceStridedSlicePass();
 std::unique_ptr<OperationPass<func::FuncOp>> createReplaceConv2DPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createApplyXCPatternsPass();
diff --git a/xformer/Transforms/ReplaceConv2D.cpp b/xformer/Transforms/ReplaceConv2D.cpp
index 47ed454b0..fc97d2745 100644
--- a/xformer/Transforms/ReplaceConv2D.cpp
+++ b/xformer/Transforms/ReplaceConv2D.cpp
@@ -69,12 +69,11 @@ ReplaceWithXCConv2DBase<ConcreteType, ConvOpType, ArgsType>::matchAndRewrite(
   std::vector<int16_t> mulsBiasesOrThresholdsData;
 
   // Obtain thread count from command-line option
-  const int threadCount = threadCountOption;
   llvm::SmallVector<std::string> strParams;
   int scratchBytes = 0;
   // Get image region splits for multiple threads
   args.imageRegionSplits = utils::getImageRegionThreadSplits(
-      threadCount, args.Y.height, args.Y.width);
+      threadCountOption, args.Y.height, args.Y.width);
 
   // Obtain serialized params and calculated tensors from lib_nn for the
   // conv2d kernel type
diff --git a/xformer/Transforms/ReplaceMaxPool2D.cpp b/xformer/Transforms/ReplaceMaxPool2D.cpp
new file mode 100644
index 000000000..258b4e184
--- /dev/null
+++ b/xformer/Transforms/ReplaceMaxPool2D.cpp
@@ -0,0 +1,113 @@
+#include "IR/XCoreOps.h"
+#include "Transforms/Options.h"
+#include "Utils/ThreadSupport.h"
+#include "lib_nn/api/AbstractKernel.hpp"
+#include "lib_nn/api/AggregateFn.hpp"
+#include "lib_nn/api/MemCpyFn.hpp"
+#include "lib_nn/api/OutputTransformFn.hpp"
+#include "mlir/Dialect/Quant/QuantTypes.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h"
+
+namespace mlir {
+namespace xcore {
+
+namespace {
+struct ReplaceMaxPool2D
+    : public PassWrapper<ReplaceMaxPool2D, OperationPass<func::FuncOp>> {
+  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ReplaceMaxPool2D)
+  StringRef getArgument() const final { return "xcore-replace-maxpool2d"; }
+  StringRef getDescription() const final {
+    return "Replace TFL MaxPool2D with MaxPool2D for XCore.";
+  }
+  void getDependentDialects(DialectRegistry &dialects) const final {
+    dialects.insert<TFL::TensorFlowLiteDialect>(); // dialect of the matched op
+  }
+  void runOnOperation() override; // applies ReplaceMaxPool2DPattern greedily
+};
+
+struct ReplaceMaxPool2DPattern : public OpRewritePattern<TFL::MaxPool2DOp> {
+  using OpRewritePattern<TFL::MaxPool2DOp>::OpRewritePattern;
+
+  // Rewrites a ranked 4-D QI8 TFL MaxPool2D into an XCore MaxPool2DOp whose
+  // lib_nn params travel as raw byte strings; any other op is left untouched.
+  LogicalResult matchAndRewrite(TFL::MaxPool2DOp mPoolOp,
+                                PatternRewriter &rewriter) const override {
+    auto inputType = mPoolOp.getInput().getType().dyn_cast<RankedTensorType>();
+    auto outputType =
+        mPoolOp.getOutput().getType().dyn_cast<RankedTensorType>();
+    // dyn_cast yields null for unranked tensors, and dims 1..3 are read
+    // below, so anything that is not a ranked 4-D tensor is rejected.
+    if (!inputType || !outputType || inputType.getRank() != 4 ||
+        outputType.getRank() != 4)
+      return failure();
+    // XC_MaxPool2DOp is declared on QI8 tensors only; other element types
+    // must not match, or the created op would fail verification.
+    auto isQI8 = [](Type type) {
+      auto q = type.dyn_cast<quant::UniformQuantizedType>();
+      return q && q.isSigned() && q.getStorageTypeIntegralWidth() == 8;
+    };
+    if (!isQI8(inputType.getElementType()) ||
+        !isQI8(outputType.getElementType()))
+      return failure();
+    auto outputHeight = outputType.getDimSize(1);
+    auto outputWidth = outputType.getDimSize(2);
+    auto outputDepth = outputType.getDimSize(3);
+    nn::ImageGeometry X(inputType.getDimSize(1), inputType.getDimSize(2),
+                        inputType.getDimSize(3));
+    nn::ImageGeometry Y(outputHeight, outputWidth, outputDepth);
+    // Copies the object representation of a lib_nn params value verbatim.
+    auto toBytes = [](const auto &params) {
+      return std::string(reinterpret_cast<const char *>(&params),
+                         sizeof(params));
+    };
+    // One AbstractKernel params blob per thread's slice of the output image.
+    auto splits = utils::getImageRegionThreadSplits(threadCountOption,
+                                                    outputHeight, outputWidth);
+    llvm::SmallVector<Attribute> akpAttrs;
+    for (auto &region : splits) {
+      nn::ImageRegion ir(region[0], region[1], 0, region[2], region[3],
+                         outputDepth);
+      nn::AbstractKernel ak(Y, ir, VPU_INT8_ACC_PERIOD);
+      akpAttrs.push_back(rewriter.getStringAttr(toBytes(ak.getParams())));
+    }
+    int32_t scratchByteParam =
+        nn::MatMulInt8::get_scratch_mem_bytes(mPoolOp.getFilterWidth() *
+                                              mPoolOp.getFilterHeight()) +
+        32; //[asj] FIXME
+    nn::WindowGeometry window(
+        mPoolOp.getFilterHeight(), mPoolOp.getFilterWidth(), 1, 0, 0,
+        mPoolOp.getStrideH(), mPoolOp.getStrideW(), 1, 1, 1);
+    nn::DerefInputFn mf(X, window);
+    nn::MatMulDirectFn_DW af(X, window);
+    // TODO
+    nn::OT_int8_channelwise ot(outputDepth, 0);
+    rewriter.replaceOpWithNewOp<MaxPool2DOp>(
+        mPoolOp, mPoolOp.getType(), mPoolOp.getInput(),
+        rewriter.getStringAttr(toBytes(mf.getParams())),
+        rewriter.getStringAttr(toBytes(af.getParams())),
+        rewriter.getStringAttr(toBytes(ot.getParams())),
+        rewriter.getI32IntegerAttr(scratchByteParam),
+        rewriter.getI32IntegerAttr(static_cast<int32_t>(splits.size())),
+        rewriter.getArrayAttr(akpAttrs));
+    return success();
+  }
+};
+
+// Greedily applies ReplaceMaxPool2DPattern to the function this pass runs on.
+void ReplaceMaxPool2D::runOnOperation() {
+  RewritePatternSet rewrites(&getContext());
+  rewrites.add<ReplaceMaxPool2DPattern>(&getContext());
+  // The driver's convergence result is explicitly discarded.
+  (void)applyPatternsAndFoldGreedily(getOperation(), std::move(rewrites));
+}
+} // namespace
+std::unique_ptr<OperationPass<func::FuncOp>> createReplaceMaxPoolPass() {
+  return std::make_unique<ReplaceMaxPool2D>(); // new instance on every call
+}
+
+static PassRegistration<ReplaceMaxPool2D> pass; // adds it to the pass registry
+
+} // namespace xcore
+} // namespace mlir
diff --git a/xformer/Transforms/TranslateToCustomOp.cpp b/xformer/Transforms/TranslateToCustomOp.cpp
index 12550e927..2a7e74a90 100644
--- a/xformer/Transforms/TranslateToCustomOp.cpp
+++ b/xformer/Transforms/TranslateToCustomOp.cpp
@@ -131,6 +131,31 @@ std::vector<uint8_t> Conv2DV2Op::buildCustomOptions() {
   return fbb.GetBuffer();
 }
 
+// Packs this op's attributes into a flexbuffer map with keys "mp", "a", "o",
+// "p" (vector of per-thread entries) and "s", and returns the finished buffer.
+std::vector<uint8_t> MaxPool2DOp::buildCustomOptions() {
+  // TODO: Is the alignment messed up?
+  flexbuffers::Builder builder;
+  const auto mapStart = builder.StartMap();
+  builder.String("mp", getMemcpyFnParam().str());
+  builder.String("a", getAggregateFnParam().str());
+  builder.String("o", getOutputTransformFnParam().str());
+  const auto kernelParams = getAbstractKernelParams().cast<ArrayAttr>();
+  const int numThreads = static_cast<int>(getThreadCount());
+  const auto vecStart = builder.StartVector("p");
+  for (int idx = 0; idx < numThreads; ++idx) {
+    // Every entry is written with a literal "00" suffix appended.
+    const auto entry = kernelParams[idx].cast<StringAttr>().getValue();
+    builder.String(entry.str() + "00");
+  }
+  // Close "p" as an untyped, non-fixed-size vector.
+  builder.EndVector(vecStart, false, false);
+  builder.Int("s", static_cast<int32_t>(getScratchBytes()));
+  builder.EndMap(mapStart);
+  builder.Finish();
+  return builder.GetBuffer();
+}
+
 namespace {
 /// This pass translates XCore ops to TFLite custom ops.
 struct TranslateToCustomOp
@@ -172,6 +197,7 @@ void TranslateToCustomOp::runOnOperation() {
   patterns.insert<RewriteToCustomOp<AddOp>>(ctx);
   patterns.insert<RewriteToCustomOp<Bsign8Op>>(ctx);
   patterns.insert<RewriteToCustomOp<Conv2DV2Op>>(ctx);
+  patterns.insert<RewriteToCustomOp<MaxPool2DOp>>(ctx);
   patterns.insert<RewriteToCustomOp<LoadFlashOp>>(ctx);
   patterns.insert<RewriteToCustomOp<LookupOp>>(ctx);
   patterns.insert<RewriteToCustomOp<MulOp>>(ctx);
diff --git a/xformer/WORKSPACE b/xformer/WORKSPACE
index b02077222..9c50f9e9b 100644
--- a/xformer/WORKSPACE
+++ b/xformer/WORKSPACE
@@ -31,7 +31,7 @@ load("@bazel_skylib//lib:paths.bzl", "paths")
 ############################### Compile Commands ###############################
 # Hedron's Compile Commands Extractor for Bazel, used to get clangd to work
 # Replace commit hash with latest version, later setup automatic update tool maybe?
-BCCE_HASH = "e16062717d9b098c3c2ac95717d2b3e661c50608"
+BCCE_HASH = "eca42c63700fccdc49cf58177e0a96f0f6075a68"
 http_archive(
     name = "hedron_compile_commands",
     url = "https://github.com/hedronvision/bazel-compile-commands-extractor/archive/{hash}.tar.gz".format(hash = BCCE_HASH),
diff --git a/xformer/lib_tflite_micro.BUILD b/xformer/lib_tflite_micro.BUILD
index 11a06f374..0c83673e5 100644
--- a/xformer/lib_tflite_micro.BUILD
+++ b/xformer/lib_tflite_micro.BUILD
@@ -28,6 +28,7 @@ filegroup(
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_custom_options.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_bsign.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_conv2d_v2.cc",
+        "lib_tflite_micro/src/tflite-xcore-kernels/xcore_maxpool2d.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_detection_post.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_load_from_flash.cc",
         "lib_tflite_micro/src/tflite-xcore-kernels/xcore_lookup.cc",
diff --git a/xformer/toolchain/BUILD b/xformer/toolchain/BUILD
deleted file mode 100644
index 369886450..000000000
--- a/xformer/toolchain/BUILD
+++ /dev/null
@@ -1,72 +0,0 @@
-#
-# Copyright 2015 Google Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-package(default_visibility = ["//visibility:public"])
-
-#filegroup(
-#    name = "toolchain",
-#    srcs = [
-#        ":cc-compiler-linux",
-#    ],
-#)
-
-#filegroup(name = "clang_suite")
-
-cc_toolchain_suite(
-    name = "gnu_suite",
-    toolchains = {
-        "k8": ":k8_toolchain",
-    },
-)
-
-filegroup(name = "empty")
-
-cc_toolchain(
-    name = "k8_toolchain",
-    toolchain_identifier = "k8-toolchain",
-    toolchain_config = ":k8_toolchain_config",
-    all_files = ":empty",
-    compiler_files = ":empty",
-    dwp_files = ":empty",
-    linker_files = ":empty",
-    objcopy_files = ":empty",
-    strip_files = ":empty",
-    supports_param_files = 0,
-)
-
-load(":cc_toolchain_config.bzl", "cc_toolchain_config")
-
-cc_toolchain_config(name = "k8_toolchain_config")
-
-#cc_toolchain(
-#    name = "cc_linux",
-#    toolchain_config = ":linux_toolchain_config",
-#    all_files = ":empty",
-#    compiler_files = ":empty",
-#    dwp_files = ":empty",
-#    linker_files = ":empty",
-#    objcopy_files = ":empty",
-#    strip_files = ":empty",
-#    supports_param_files = 0,
-#)
-#
-#toolchain_type(name = "toolchain_type")
-#
-#toolchain(
-#     name = "cc-compiler-linux",
-#     toolchain = ":cc_linux",
-#     toolchain_type = ":toolchain_type",
-#     toolchain_config = ":linux_toolchain_config",
-###)
diff --git a/xformer/toolchain/cc_toolchain_config.bzl b/xformer/toolchain/cc_toolchain_config.bzl
deleted file mode 100644
index 77cc66ca5..000000000
--- a/xformer/toolchain/cc_toolchain_config.bzl
+++ /dev/null
@@ -1,97 +0,0 @@
-# NEW
-load("@bazel_tools//tools/build_defs/cc:action_names.bzl", "ACTION_NAMES")
-# NEW
-load(
-   "@bazel_tools//tools/cpp:cc_toolchain_config_lib.bzl",
-   "feature",
-   "flag_group",
-   "flag_set",
-   "tool_path",
-)
-
-all_link_actions = [ # NEW
-   ACTION_NAMES.cpp_link_executable,
-   ACTION_NAMES.cpp_link_dynamic_library,
-   ACTION_NAMES.cpp_link_nodeps_dynamic_library,
-]
-
-def _impl(ctx):
-   tool_paths = [ # NEW
-       tool_path(
-           name = "gcc",
-           path = "/opt/xmos/gcc/11.2.0/bin/gcc",
-       ),
-       tool_path(
-           name = "ld",
-           path = "/usr/bin/ld",
-       ),
-       tool_path(
-           name = "ar",
-           path = "/opt/xmos/gcc/11.2.0/bin/gcc-ar",
-       ),
-       tool_path(
-           name = "cpp",
-           path = "/opt/xmos/gcc/11.2.0/bin/cpp",
-       ),
-       tool_path(
-           name = "gcov",
-           path = "/bin/false",
-       ),
-       tool_path(
-           name = "nm",
-           path = "/bin/false",
-       ),
-       tool_path(
-           name = "objdump",
-           path = "/bin/false",
-       ),
-       tool_path(
-           name = "strip",
-           path = "/bin/false",
-       ),
-   ]
-
-   features = [ # NEW
-       feature(
-	   name = "default_linker_flags",
-	   enabled = True,
-	   flag_sets = [
-	       flag_set(
-		   actions = all_link_actions,
-		   flag_groups = ([
-		       flag_group(
-			   flags = [
-			       "-lstdc++",
-			   ],
-		       ),
-		   ]),
-	       ),
-	   ],
-       ),
-   ]
-
-   return cc_common.create_cc_toolchain_config_info(
-       ctx = ctx,
-       features = features, # NEW
-       cxx_builtin_include_directories = [
-	   "/opt/xmos/gcc/11.2.0/lib/gcc/x86_64-pc-linux-gnu/11.2.0/include",
-	   "/opt/xmos/gcc/11.2.0/lib/gcc/x86_64-pc-linux-gnu/11.2.0/include-fixed",
-	   "/opt/xmos/gcc/11.2.0/include",
-	   "/usr/include",
-       ],
-       toolchain_identifier = "local",
-       host_system_name = "local",
-       target_system_name = "local",
-       target_cpu = "k8",
-       target_libc = "unknown",
-       compiler = "gnu",
-       abi_version = "unknown",
-       abi_libc_version = "unknown",
-       tool_paths = tool_paths
-   )
-
-cc_toolchain_config = rule(
-   implementation = _impl,
-   attrs = {},
-   provides = [CcToolchainConfigInfo],
-)