diff --git a/tools/compare_commits/compare.py b/tools/compare_commits/compare.py
new file mode 100644
index 00000000..01d2c89f
--- /dev/null
+++ b/tools/compare_commits/compare.py
@@ -0,0 +1,429 @@
+#!/usr/bin/env python
+
+import unittest
+"""
+compare.py - versatile benchmark output compare tool
+"""
+
+import argparse
+from argparse import ArgumentParser
+import json
+import sys
+import gbench
+from gbench import util, report
+from gbench.util import *
+
+
+def check_inputs(in1, in2, flags):
+    """
+    Perform checking on the user provided inputs and diagnose any abnormalities
+    """
+    in1_kind, in1_err = classify_input_file(in1)
+    in2_kind, in2_err = classify_input_file(in2)
+    output_file = find_benchmark_flag('--benchmark_out=', flags)
+    output_type = find_benchmark_flag('--benchmark_out_format=', flags)
+    if in1_kind == IT_Executable and in2_kind == IT_Executable and output_file:
+        print(("WARNING: '--benchmark_out=%s' will be passed to both "
+               "benchmarks causing it to be overwritten") % output_file)
+    if in1_kind == IT_JSON and in2_kind == IT_JSON and len(flags) > 0:
+        print("WARNING: passing optional flags has no effect since both "
+              "inputs are JSON")
+    if output_type is not None and output_type != 'json':
+        print(("ERROR: passing '--benchmark_out_format=%s' to 'compare.py`"
+               " is not supported.") % output_type)
+        sys.exit(1)
+
+
+def create_parser():
+    parser = ArgumentParser(
+        description='versatile benchmark output compare tool')
+
+    parser.add_argument(
+        '-a',
+        '--display_aggregates_only',
+        dest='display_aggregates_only',
+        action="store_true",
+        help="If there are repetitions, by default, we display everything - the"
+             " actual runs, and the aggregates computed. Sometimes, it is "
+             "desirable to only view the aggregates. E.g. when there are a lot "
+             "of repetitions. Do note that only the display is affected. "
+             "Internally, all the actual runs are still used, e.g. for U test.")
+
+    parser.add_argument(
+        '--no-color',
+        dest='color',
+        default=True,
+        action="store_false",
+        help="Do not use colors in the terminal output"
+    )
+
+    parser.add_argument(
+        '-d',
+        '--dump_to_json',
+        dest='dump_to_json',
+        help="Additionally, dump benchmark comparison output to this file in JSON format.")
+
+    utest = parser.add_argument_group()
+    utest.add_argument(
+        '--no-utest',
+        dest='utest',
+        default=True,
+        action="store_false",
+        help="The tool can do a two-tailed Mann-Whitney U test with the null hypothesis that it is equally likely that a randomly selected value from one sample will be less than or greater than a randomly selected value from a second sample.\nWARNING: requires **LARGE** (no less than {}) number of repetitions to be meaningful!\nThe test is being done by default, if at least {} repetitions were done.\nThis option can disable the U Test.".format(report.UTEST_OPTIMAL_REPETITIONS, report.UTEST_MIN_REPETITIONS))
+    alpha_default = 0.05
+    utest.add_argument(
+        "--alpha",
+        dest='utest_alpha',
+        default=alpha_default,
+        type=float,
+        help=("significance level alpha. if the calculated p-value is below this value, then the result is said to be statistically significant and the null hypothesis is rejected.\n(default: %0.4f)") %
+        alpha_default)
+
+    subparsers = parser.add_subparsers(
+        help='This tool has multiple modes of operation:',
+        dest='mode')
+
+    parser_a = subparsers.add_parser(
+        'benchmarks',
+        help='The most simple use-case, compare all the output of these two benchmarks')
+    baseline = parser_a.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test_baseline',
+        metavar='test_baseline',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    contender = parser_a.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'test_contender',
+        metavar='test_contender',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    parser_a.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+
+    parser_b = subparsers.add_parser(
+        'filters', help='Compare filter one with the filter two of benchmark')
+    baseline = parser_b.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test',
+        metavar='test',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    baseline.add_argument(
+        'filter_baseline',
+        metavar='filter_baseline',
+        type=str,
+        nargs=1,
+        help='The first filter, that will be used as baseline')
+    contender = parser_b.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'filter_contender',
+        metavar='filter_contender',
+        type=str,
+        nargs=1,
+        help='The second filter, that will be compared against the baseline')
+    parser_b.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+
+    parser_c = subparsers.add_parser(
+        'benchmarksfiltered',
+        help='Compare filter one of first benchmark with filter two of the second benchmark')
+    baseline = parser_c.add_argument_group(
+        'baseline', 'The benchmark baseline')
+    baseline.add_argument(
+        'test_baseline',
+        metavar='test_baseline',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='A benchmark executable or JSON output file')
+    baseline.add_argument(
+        'filter_baseline',
+        metavar='filter_baseline',
+        type=str,
+        nargs=1,
+        help='The first filter, that will be used as baseline')
+    contender = parser_c.add_argument_group(
+        'contender', 'The benchmark that will be compared against the baseline')
+    contender.add_argument(
+        'test_contender',
+        metavar='test_contender',
+        type=argparse.FileType('r'),
+        nargs=1,
+        help='The second benchmark executable or JSON output file, that will be compared against the baseline')
+    contender.add_argument(
+        'filter_contender',
+        metavar='filter_contender',
+        type=str,
+        nargs=1,
+        help='The second filter, that will be compared against the baseline')
+    parser_c.add_argument(
+        'benchmark_options',
+        metavar='benchmark_options',
+        nargs=argparse.REMAINDER,
+        help='Arguments to pass when running benchmark executables')
+
+    return parser
+
+
+def main():
+    # Parse the command line flags
+    parser = create_parser()
+    args, unknown_args = parser.parse_known_args()
+    if args.mode is None:
+        parser.print_help()
+        exit(1)
+    assert not unknown_args
+    benchmark_options = args.benchmark_options
+
+    if args.mode == 'benchmarks':
+        test_baseline = args.test_baseline[0].name
+        test_contender = args.test_contender[0].name
+        filter_baseline = ''
+        filter_contender = ''
+
+        # NOTE: if test_baseline == test_contender, you are analyzing the stdev
+
+        description = 'Comparing %s to %s' % (test_baseline, test_contender)
+    elif args.mode == 'filters':
+        test_baseline = args.test[0].name
+        test_contender = args.test[0].name
+        filter_baseline = args.filter_baseline[0]
+        filter_contender = args.filter_contender[0]
+
+        # NOTE: if filter_baseline == filter_contender, you are analyzing the
+        # stdev
+
+        description = 'Comparing %s to %s (from %s)' % (
+            filter_baseline, filter_contender, args.test[0].name)
+    elif args.mode == 'benchmarksfiltered':
+        test_baseline = args.test_baseline[0].name
+        test_contender = args.test_contender[0].name
+        filter_baseline = args.filter_baseline[0]
+        filter_contender = args.filter_contender[0]
+
+        # NOTE: if test_baseline == test_contender and
+        # filter_baseline == filter_contender, you are analyzing the stdev
+
+        description = 'Comparing %s (from %s) to %s (from %s)' % (
+            filter_baseline, test_baseline, filter_contender, test_contender)
+    else:
+        # should never happen
+        print("Unrecognized mode of operation: '%s'" % args.mode)
+        parser.print_help()
+        exit(1)
+
+    check_inputs(test_baseline, test_contender, benchmark_options)
+
+    if args.display_aggregates_only:
+        benchmark_options += ['--benchmark_display_aggregates_only=true']
+
+    options_baseline = []
+    options_contender = []
+
+    if filter_baseline and filter_contender:
+        options_baseline = ['--benchmark_filter=%s' % filter_baseline]
+        options_contender = ['--benchmark_filter=%s' % filter_contender]
+
+    # Run the benchmarks and report the results
+    json1 = json1_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_baseline, benchmark_options + options_baseline))
+    json2 = json2_orig = gbench.util.sort_benchmark_results(gbench.util.run_or_load_benchmark(
+        test_contender, benchmark_options + options_contender))
+
+    # Now, filter the benchmarks so that the difference report can work
+    if filter_baseline and filter_contender:
+        replacement = '[%s vs. %s]' % (filter_baseline, filter_contender)
+        json1 = gbench.report.filter_benchmark(
+            json1_orig, filter_baseline, replacement)
+        json2 = gbench.report.filter_benchmark(
+            json2_orig, filter_contender, replacement)
+
+    diff_report = gbench.report.get_difference_report(
+        json1, json2, args.utest)
+    output_lines = gbench.report.print_difference_report(
+        diff_report,
+        args.display_aggregates_only,
+        args.utest, args.utest_alpha, args.color)
+    print(description)
+    for ln in output_lines:
+        print(ln)
+
+    # Optionally, diff and output to JSON
+    if args.dump_to_json is not None:
+        with open(args.dump_to_json, 'w') as f_json:
+            json.dump(diff_report, f_json)
+
+class TestParser(unittest.TestCase):
+    def setUp(self):
+        self.parser = create_parser()
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'gbench',
+            'Inputs')
+        self.testInput0 = os.path.join(testInputs, 'test1_run1.json')
+        self.testInput1 = os.path.join(testInputs, 'test1_run2.json')
+
+    def test_benchmarks_basic(self):
+        parsed = self.parser.parse_args(
+            ['benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_without_utest(self):
+        parsed = self.parser.parse_args(
+            ['--no-utest', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertFalse(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.05)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_display_aggregates_only(self):
+        parsed = self.parser.parse_args(
+            ['-a', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertTrue(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_with_utest_alpha(self):
+        parsed = self.parser.parse_args(
+            ['--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.314)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_basic_without_utest_with_utest_alpha(self):
+        parsed = self.parser.parse_args(
+            ['--no-utest', '--alpha=0.314', 'benchmarks', self.testInput0, self.testInput1])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertFalse(parsed.utest)
+        self.assertEqual(parsed.utest_alpha, 0.314)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarks_with_remainder(self):
+        parsed = self.parser.parse_args(
+            ['benchmarks', self.testInput0, self.testInput1, 'd'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.benchmark_options, ['d'])
+
+    def test_benchmarks_with_remainder_after_doubleminus(self):
+        parsed = self.parser.parse_args(
+            ['benchmarks', self.testInput0, self.testInput1, '--', 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarks')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.benchmark_options, ['e'])
+
+    def test_filters_basic(self):
+        parsed = self.parser.parse_args(
+            ['filters', self.testInput0, 'c', 'd'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'filters')
+        self.assertEqual(parsed.test[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.filter_contender[0], 'd')
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_filters_with_remainder(self):
+        parsed = self.parser.parse_args(
+            ['filters', self.testInput0, 'c', 'd', 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'filters')
+        self.assertEqual(parsed.test[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.filter_contender[0], 'd')
+        self.assertEqual(parsed.benchmark_options, ['e'])
+
+    def test_filters_with_remainder_after_doubleminus(self):
+        parsed = self.parser.parse_args(
+            ['filters', self.testInput0, 'c', 'd', '--', 'f'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'filters')
+        self.assertEqual(parsed.test[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.filter_contender[0], 'd')
+        self.assertEqual(parsed.benchmark_options, ['f'])
+
+    def test_benchmarksfiltered_basic(self):
+        parsed = self.parser.parse_args(
+            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarksfiltered')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.filter_contender[0], 'e')
+        self.assertFalse(parsed.benchmark_options)
+
+    def test_benchmarksfiltered_with_remainder(self):
+        parsed = self.parser.parse_args(
+            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', 'f'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarksfiltered')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.filter_contender[0], 'e')
+        self.assertEqual(parsed.benchmark_options[0], 'f')
+
+    def test_benchmarksfiltered_with_remainder_after_doubleminus(self):
+        parsed = self.parser.parse_args(
+            ['benchmarksfiltered', self.testInput0, 'c', self.testInput1, 'e', '--', 'g'])
+        self.assertFalse(parsed.display_aggregates_only)
+        self.assertTrue(parsed.utest)
+        self.assertEqual(parsed.mode, 'benchmarksfiltered')
+        self.assertEqual(parsed.test_baseline[0].name, self.testInput0)
+        self.assertEqual(parsed.filter_baseline[0], 'c')
+        self.assertEqual(parsed.test_contender[0].name, self.testInput1)
+        self.assertEqual(parsed.filter_contender[0], 'e')
+        self.assertEqual(parsed.benchmark_options[0], 'g')
+
+
+if __name__ == '__main__':
+    # unittest.main()
+    main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
diff --git a/tools/compare_commits/gbench/Inputs/test1_run1.json b/tools/compare_commits/gbench/Inputs/test1_run1.json
new file mode 100644
index 00000000..9daed0bc
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test1_run1.json
@@ -0,0 +1,127 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SameTimes",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xFaster",
+      "iterations": 1000,
+      "real_time": 50,
+      "cpu_time": 50,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xSlower",
+      "iterations": 1000,
+      "real_time": 50,
+      "cpu_time": 50,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xSlower",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xFaster",
+      "iterations": 1000,
+      "real_time": 10000,
+      "cpu_time": 10000,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentCPUToTime",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_ThirdFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 4.2749856294592886e+00,
+      "real_coefficient": 6.4789275289789780e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 4.5097802512472874e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
+      "iterations": 1000,
+      "real_time": 0.4,
+      "cpu_time": 0.5,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_DifferentTimeUnit",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_hasLabel",
+      "label": "a label",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test1_run2.json b/tools/compare_commits/gbench/Inputs/test1_run2.json
new file mode 100644
index 00000000..dc52970a
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test1_run2.json
@@ -0,0 +1,127 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_SameTimes",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xFaster",
+      "iterations": 1000,
+      "real_time": 25,
+      "cpu_time": 25,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_2xSlower",
+      "iterations": 20833333,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentFaster",
+      "iterations": 1000,
+      "real_time": 98.9999999,
+      "cpu_time": 98.9999999,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_1PercentSlower",
+      "iterations": 1000,
+      "real_time": 100.9999999,
+      "cpu_time": 100.9999999,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentFaster",
+      "iterations": 1000,
+      "real_time": 90,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentSlower",
+      "iterations": 1000,
+      "real_time": 110,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xSlower",
+      "iterations": 1000,
+      "real_time": 1.0000e+04,
+      "cpu_time": 1.0000e+04,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_100xFaster",
+      "iterations": 1000,
+      "real_time": 100,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_10PercentCPUToTime",
+      "iterations": 1000,
+      "real_time": 110,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_ThirdFaster",
+      "iterations": 1000,
+      "real_time": 66.665,
+      "cpu_time": 66.664,
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_BigO",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "BigO",
+      "cpu_coefficient": 5.6215779594361486e+00,
+      "real_coefficient": 5.6288314793554610e+00,
+      "big_o": "N",
+      "time_unit": "ns"
+    },
+    {
+      "name": "MyComplexityTest_RMS",
+      "run_name": "MyComplexityTest",
+      "run_type": "aggregate",
+      "aggregate_name": "RMS",
+      "rms": 3.3128901852342174e-03
+    },
+    {
+      "name": "BM_NotBadTimeUnit",
+      "iterations": 1000,
+      "real_time": 0.04,
+      "cpu_time": 0.6,
+      "time_unit": "s"
+    },
+    {
+      "name": "BM_DifferentTimeUnit",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_hasLabel",
+      "label": "a label",
+      "iterations": 1,
+      "real_time": 1,
+      "cpu_time": 1,
+      "time_unit": "s"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test2_run.json b/tools/compare_commits/gbench/Inputs/test2_run.json
new file mode 100644
index 00000000..15bc6980
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test2_run.json
@@ -0,0 +1,81 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_Hi",
+      "iterations": 1234,
+      "real_time": 42,
+      "cpu_time": 24,
+      "time_unit": "ms"
+    },
+    {
+      "name": "BM_Zero",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Zero/4",
+      "iterations": 4000,
+      "real_time": 40,
+      "cpu_time": 40,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_Zero",
+      "iterations": 2000,
+      "real_time": 20,
+      "cpu_time": 20,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_Zero/3",
+      "iterations": 3000,
+      "real_time": 30,
+      "cpu_time": 30,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_One",
+      "iterations": 5000,
+      "real_time": 5,
+      "cpu_time": 5,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_One/4",
+      "iterations": 2000,
+      "real_time": 20,
+      "cpu_time": 20,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_One",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 10,
+      "time_unit": "ns"
+    },
+    {
+      "name": "Prefix/BM_One/3",
+      "iterations": 1500,
+      "real_time": 15,
+      "cpu_time": 15,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Bye",
+      "iterations": 5321,
+      "real_time": 11,
+      "cpu_time": 63,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test3_run0.json b/tools/compare_commits/gbench/Inputs/test3_run0.json
new file mode 100644
index 00000000..49f8b061
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test3_run0.json
@@ -0,0 +1,65 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 100,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 90,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 86,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 77,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 8,
+      "cpu_time": 80,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 82,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test3_run1.json b/tools/compare_commits/gbench/Inputs/test3_run1.json
new file mode 100644
index 00000000..acc5ba17
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test3_run1.json
@@ -0,0 +1,65 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "BM_One",
+      "iterations": 1000,
+      "real_time": 9,
+      "cpu_time": 110,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 10,
+      "cpu_time": 89,
+      "time_unit": "ns"
+    },
+    {
+      "name": "BM_Two",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 72,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 1000,
+      "real_time": 7,
+      "cpu_time": 75,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "aggregate",
+      "iterations": 762,
+      "real_time": 4.54,
+      "cpu_time": 66.6,
+      "time_unit": "ns"
+    },
+    {
+      "name": "short",
+      "run_type": "iteration",
+      "iterations": 1000,
+      "real_time": 800,
+      "cpu_time": 1,
+      "time_unit": "ns"
+    },
+    {
+      "name": "medium",
+      "run_type": "iteration",
+      "iterations": 1200,
+      "real_time": 5,
+      "cpu_time": 53,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test4_run.json b/tools/compare_commits/gbench/Inputs/test4_run.json
new file mode 100644
index 00000000..eaa005f3
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test4_run.json
@@ -0,0 +1,96 @@
+{
+  "benchmarks": [
+    {
+      "name": "99 family 0 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "98 family 0 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "97 family 0 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "96 family 0 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "95 family 0 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "94 family 0 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 0,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+
+
+    {
+      "name": "93 family 1 instance 0 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 0
+    },
+    {
+      "name": "92 family 1 instance 0 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "repetition_index": 1
+    },
+    {
+      "name": "91 family 1 instance 0 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 0,
+      "aggregate_name": "9 aggregate"
+    },
+
+
+    {
+      "name": "90 family 1 instance 1 repetition 0",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 0
+    },
+    {
+      "name": "89 family 1 instance 1 repetition 1",
+      "run_type": "iteration",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "repetition_index": 1
+    },
+    {
+      "name": "88 family 1 instance 1 aggregate",
+      "run_type": "aggregate",
+      "family_index": 1,
+      "per_family_instance_index": 1,
+      "aggregate_name": "9 aggregate"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test4_run0.json b/tools/compare_commits/gbench/Inputs/test4_run0.json
new file mode 100644
index 00000000..54cf1275
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test4_run0.json
@@ -0,0 +1,21 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "whocares",
+      "run_type": "aggregate",
+      "aggregate_name": "zz",
+      "aggregate_unit": "percentage",
+      "iterations": 1000,
+      "real_time": 0.01,
+      "cpu_time": 0.10,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/Inputs/test4_run1.json b/tools/compare_commits/gbench/Inputs/test4_run1.json
new file mode 100644
index 00000000..25d56050
--- /dev/null
+++ b/tools/compare_commits/gbench/Inputs/test4_run1.json
@@ -0,0 +1,21 @@
+{
+  "context": {
+    "date": "2016-08-02 17:44:46",
+    "num_cpus": 4,
+    "mhz_per_cpu": 4228,
+    "cpu_scaling_enabled": false,
+    "library_build_type": "release"
+  },
+  "benchmarks": [
+    {
+      "name": "whocares",
+      "run_type": "aggregate",
+      "aggregate_name": "zz",
+      "aggregate_unit": "percentage",
+      "iterations": 1000,
+      "real_time": 0.005,
+      "cpu_time": 0.15,
+      "time_unit": "ns"
+    }
+  ]
+}
diff --git a/tools/compare_commits/gbench/__init__.py b/tools/compare_commits/gbench/__init__.py
new file mode 100644
index 00000000..fce1a1ac
--- /dev/null
+++ b/tools/compare_commits/gbench/__init__.py
@@ -0,0 +1,8 @@
+"""Google Benchmark tooling"""
+
+__author__ = 'Eric Fiselier'
+__email__ = 'eric@efcs.ca'
+__versioninfo__ = (0, 5, 0)
+__version__ = '.'.join(str(v) for v in __versioninfo__) + 'dev'
+
+__all__ = []
diff --git a/tools/compare_commits/gbench/report.py b/tools/compare_commits/gbench/report.py
new file mode 100644
index 00000000..b2bbfb9f
--- /dev/null
+++ b/tools/compare_commits/gbench/report.py
@@ -0,0 +1,1201 @@
+"""report.py - Utilities for reporting statistics about benchmark results
+"""
+
+import unittest
+import os
+import re
+import copy
+import random
+
+from scipy.stats import mannwhitneyu, gmean
+from numpy import array
+
+
+class BenchmarkColor(object):
+    def __init__(self, name, code):
+        self.name = name
+        self.code = code
+
+    def __repr__(self):
+        return '%s%r' % (self.__class__.__name__,
+                         (self.name, self.code))
+
+    def __format__(self, format):
+        return self.code
+
+
+# Benchmark Colors Enumeration
+BC_NONE = BenchmarkColor('NONE', '')
+BC_MAGENTA = BenchmarkColor('MAGENTA', '\033[95m')
+BC_CYAN = BenchmarkColor('CYAN', '\033[96m')
+BC_OKBLUE = BenchmarkColor('OKBLUE', '\033[94m')
+BC_OKGREEN = BenchmarkColor('OKGREEN', '\033[32m')
+BC_HEADER = BenchmarkColor('HEADER', '\033[92m')
+BC_WARNING = BenchmarkColor('WARNING', '\033[93m')
+BC_WHITE = BenchmarkColor('WHITE', '\033[97m')
+BC_FAIL = BenchmarkColor('FAIL', '\033[91m')
+BC_ENDC = BenchmarkColor('ENDC', '\033[0m')
+BC_BOLD = BenchmarkColor('BOLD', '\033[1m')
+BC_UNDERLINE = BenchmarkColor('UNDERLINE', '\033[4m')
+
+UTEST_MIN_REPETITIONS = 2
+UTEST_OPTIMAL_REPETITIONS = 9  # Lowest reasonable number, More is better.
+UTEST_COL_NAME = "_pvalue"
+
+_TIME_UNIT_TO_SECONDS_MULTIPLIER = {
+    "s": 1.0,
+    "ms": 1e-3,
+    "us": 1e-6,
+    "ns": 1e-9,
+}
+
+
+def color_format(use_color, fmt_str, *args, **kwargs):
+    """
+    Return the result of 'fmt_str.format(*args, **kwargs)' after transforming
+    'args' and 'kwargs' according to the value of 'use_color'. If 'use_color'
+    is False then all color codes in 'args' and 'kwargs' are replaced with
+    the empty string.
+    """
+    assert use_color is True or use_color is False
+    if not use_color:
+        args = [arg if not isinstance(arg, BenchmarkColor) else BC_NONE
+                for arg in args]
+        kwargs = {key: arg if not isinstance(arg, BenchmarkColor) else BC_NONE
+                  for key, arg in kwargs.items()}
+    return fmt_str.format(*args, **kwargs)
+
+
+def find_longest_name(benchmark_list):
+    """
+    Return the length of the longest benchmark name in a given list of
+    benchmark JSON objects
+    """
+    longest_name = 1
+    for bc in benchmark_list:
+        if len(bc['name']) > longest_name:
+            longest_name = len(bc['name'])
+    return longest_name
+
+
+def calculate_change(old_val, new_val):
+    """
+    Return a float representing the decimal change between old_val and new_val.
+    """
+    if old_val == 0 and new_val == 0:
+        return 0.0
+    if old_val == 0:
+        return float(new_val - old_val) / (float(old_val + new_val) / 2)
+    return float(new_val - old_val) / abs(old_val)
+
+
+def filter_benchmark(json_orig, family, replacement=""):
+    """
+    Apply a filter to the json, and only leave the 'family' of benchmarks.
+    """
+    regex = re.compile(family)
+    filtered = {}
+    filtered['benchmarks'] = []
+    for be in json_orig['benchmarks']:
+        if not regex.search(be['name']):
+            continue
+        filteredbench = copy.deepcopy(be)  # Do NOT modify the old name!
+        filteredbench['name'] = regex.sub(replacement, filteredbench['name'])
+        filtered['benchmarks'].append(filteredbench)
+    return filtered
+
+
+def get_unique_benchmark_names(json):
+    """
+    While *keeping* the order, give all the unique 'names' used for benchmarks.
+    """
+    seen = set()
+    uniqued = [x['name'] for x in json['benchmarks']
+               if x['name'] not in seen and
+               (seen.add(x['name']) or True)]
+    return uniqued
+
+
+def intersect(list1, list2):
+    """
+    Given two lists, get a new list consisting of the elements only contained
+    in *both of the input lists*, while preserving the ordering.
+    """
+    return [x for x in list1 if x in list2]
+
+
+def is_potentially_comparable_benchmark(x):
+    return ('time_unit' in x and 'real_time' in x and 'cpu_time' in x)
+
+
+def partition_benchmarks(json1, json2):
+    """
+    While preserving the ordering, find benchmarks with the same names in
+    both of the inputs, and group them.
+    (i.e. partition/filter into groups with common name)
+    """
+    json1_unique_names = get_unique_benchmark_names(json1)
+    json2_unique_names = get_unique_benchmark_names(json2)
+    names = intersect(json1_unique_names, json2_unique_names)
+    partitions = []
+    for name in names:
+        time_unit = None
+        # Pick the time unit from the first entry of the lhs benchmark.
+        # We should be careful not to crash with unexpected input.
+        for x in json1['benchmarks']:
+            if (x['name'] == name and is_potentially_comparable_benchmark(x)):
+                time_unit = x['time_unit']
+                break
+        if time_unit is None:
+            continue
+        # Filter by name and time unit.
+        # All the repetitions are assumed to be comparable.
+        lhs = [x for x in json1['benchmarks'] if x['name'] == name and
+               x['time_unit'] == time_unit]
+        rhs = [x for x in json2['benchmarks'] if x['name'] == name and
+               x['time_unit'] == time_unit]
+        partitions.append([lhs, rhs])
+    return partitions
+
+
+def get_timedelta_field_as_seconds(benchmark, field_name):
+    """
+    Get value of field_name field of benchmark, which is time with time unit
+    time_unit, as time in seconds.
+    """
+    timedelta = benchmark[field_name]
+    time_unit = benchmark.get('time_unit', 's')
+    return timedelta * _TIME_UNIT_TO_SECONDS_MULTIPLIER.get(time_unit)
+
+
+def calculate_geomean(json):
+    """
+    Extract all real/cpu times from all the benchmarks as seconds,
+    and calculate their geomean.
+    """
+    times = []
+    for benchmark in json['benchmarks']:
+        if 'run_type' in benchmark and benchmark['run_type'] == 'aggregate':
+            continue
+        times.append([get_timedelta_field_as_seconds(benchmark, 'real_time'),
+                      get_timedelta_field_as_seconds(benchmark, 'cpu_time')])
+    return gmean(times) if times else array([])
+
+
+def extract_field(partition, field_name):
+    # The count of elements may be different. We want *all* of them.
+    lhs = [x[field_name] for x in partition[0]]
+    rhs = [x[field_name] for x in partition[1]]
+    return [lhs, rhs]
+
+
+def calc_utest(timings_cpu, timings_time):
+    min_rep_cnt = min(len(timings_time[0]),
+                      len(timings_time[1]),
+                      len(timings_cpu[0]),
+                      len(timings_cpu[1]))
+
+    # Does *everything* has at least UTEST_MIN_REPETITIONS repetitions?
+    if min_rep_cnt < UTEST_MIN_REPETITIONS:
+        return False, None, None
+
+    time_pvalue = mannwhitneyu(
+        timings_time[0], timings_time[1], alternative='two-sided').pvalue
+    cpu_pvalue = mannwhitneyu(
+        timings_cpu[0], timings_cpu[1], alternative='two-sided').pvalue
+
+    return (min_rep_cnt >= UTEST_OPTIMAL_REPETITIONS), cpu_pvalue, time_pvalue
+
+
+def print_utest(bc_name, utest, utest_alpha, first_col_width, use_color=True):
+    def get_utest_color(pval):
+        return BC_FAIL if pval >= utest_alpha else BC_OKGREEN
+
+    # Check if we failed miserably with minimum required repetitions for utest
+    if not utest['have_optimal_repetitions'] and utest['cpu_pvalue'] is None and utest['time_pvalue'] is None:
+        return []
+
+    dsc = "U Test, Repetitions: {} vs {}".format(
+        utest['nr_of_repetitions'], utest['nr_of_repetitions_other'])
+    dsc_color = BC_OKGREEN
+
+    # We still got some results to show but issue a warning about it.
+    if not utest['have_optimal_repetitions']:
+        dsc_color = BC_WARNING
+        dsc += ". WARNING: Results unreliable! {}+ repetitions recommended.".format(
+            UTEST_OPTIMAL_REPETITIONS)
+
+    special_str = "{}{:<{}s}{endc}{}{:16.4f}{endc}{}{:16.4f}{endc}{}      {}"
+
+    return [color_format(use_color,
+                         special_str,
+                         BC_HEADER,
+                         "{}{}".format(bc_name, UTEST_COL_NAME),
+                         first_col_width,
+                         get_utest_color(
+                             utest['time_pvalue']), utest['time_pvalue'],
+                         get_utest_color(
+                             utest['cpu_pvalue']), utest['cpu_pvalue'],
+                         dsc_color, dsc,
+                         endc=BC_ENDC)]
+
+
+def get_difference_report(
+        json1,
+        json2,
+        utest=False):
+    """
+    Calculate and report the difference between each test of two benchmarks
+    runs specified as 'json1' and 'json2'. Output is another json containing
+    relevant details for each test run.
+    """
+    assert utest is True or utest is False
+
+    diff_report = []
+    partitions = partition_benchmarks(json1, json2)
+    for partition in partitions:
+        benchmark_name = partition[0][0]['name']
+        label = partition[0][0]['label'] if 'label' in partition[0][0] else ''
+        time_unit = partition[0][0]['time_unit']
+        measurements = []
+        utest_results = {}
+        # Careful, we may have different repetition count.
+        for i in range(min(len(partition[0]), len(partition[1]))):
+            bn = partition[0][i]
+            other_bench = partition[1][i]
+            measurements.append({
+                'real_time': bn['real_time'],
+                'cpu_time': bn['cpu_time'],
+                'real_time_other': other_bench['real_time'],
+                'cpu_time_other': other_bench['cpu_time'],
+                'time': calculate_change(bn['real_time'], other_bench['real_time']),
+                'cpu': calculate_change(bn['cpu_time'], other_bench['cpu_time'])
+            })
+
+        # After processing the whole partition, if requested, do the U test.
+        if utest:
+            timings_cpu = extract_field(partition, 'cpu_time')
+            timings_time = extract_field(partition, 'real_time')
+            have_optimal_repetitions, cpu_pvalue, time_pvalue = calc_utest(
+                timings_cpu, timings_time)
+            if cpu_pvalue and time_pvalue:
+                utest_results = {
+                    'have_optimal_repetitions': have_optimal_repetitions,
+                    'cpu_pvalue': cpu_pvalue,
+                    'time_pvalue': time_pvalue,
+                    'nr_of_repetitions': len(timings_cpu[0]),
+                    'nr_of_repetitions_other': len(timings_cpu[1])
+                }
+
+        # Store only if we had any measurements for given benchmark.
+        # E.g. partition_benchmarks will filter out the benchmarks having
+        # time units which are not compatible with other time units in the
+        # benchmark suite.
+        if measurements:
+            run_type = partition[0][0]['run_type'] if 'run_type' in partition[0][0] else ''
+            aggregate_name = partition[0][0]['aggregate_name'] if run_type == 'aggregate' and 'aggregate_name' in partition[0][0] else ''
+            diff_report.append({
+                'name': benchmark_name,
+                'label': label,
+                'measurements': measurements,
+                'time_unit': time_unit,
+                'run_type': run_type,
+                'aggregate_name': aggregate_name,
+                'utest': utest_results
+            })
+
+    lhs_gmean = calculate_geomean(json1)
+    rhs_gmean = calculate_geomean(json2)
+    if lhs_gmean.any() and rhs_gmean.any():
+        diff_report.append({
+            'name': 'OVERALL_GEOMEAN',
+            'label': '',
+            'measurements': [{
+                'real_time': lhs_gmean[0],
+                'cpu_time': lhs_gmean[1],
+                'real_time_other': rhs_gmean[0],
+                'cpu_time_other': rhs_gmean[1],
+                'time': calculate_change(lhs_gmean[0], rhs_gmean[0]),
+                'cpu': calculate_change(lhs_gmean[1], rhs_gmean[1])
+            }],
+            'time_unit': 's',
+            'run_type': 'aggregate',
+            'aggregate_name': 'geomean',
+            'utest': {}
+        })
+
+    return diff_report
+
+
+def print_difference_report(
+        json_diff_report,
+        include_aggregates_only=False,
+        utest=False,
+        utest_alpha=0.05,
+        use_color=True):
+    """
+    Calculate and report the difference between each test of two benchmarks
+    runs specified as 'json1' and 'json2'.
+    """
+    assert utest is True or utest is False
+
+    def get_color(res):
+        if res > 0.05:
+            return BC_FAIL
+        elif res > -0.07:
+            return BC_WHITE
+        else:
+            return BC_CYAN
+
+    first_col_width = find_longest_name(json_diff_report)
+    first_col_width = max(
+        first_col_width,
+        len('Benchmark'))
+    first_col_width += len(UTEST_COL_NAME)
+    first_line = "{:<{}s}Time             CPU      Time Old      Time New       CPU Old       CPU New".format(
+        'Benchmark', 12 + first_col_width)
+    output_strs = [first_line, '-' * len(first_line)]
+
+    fmt_str = "{}{:<{}s}{endc}{}{:+16.4f}{endc}{}{:+16.4f}{endc}{:14.0f}{:14.0f}{endc}{:14.0f}{:14.0f}"
+    for benchmark in json_diff_report:
+        # *If* we were asked to only include aggregates,
+        # and if it is non-aggregate, then don't print it.
+        if not include_aggregates_only or not 'run_type' in benchmark or benchmark['run_type'] == 'aggregate':
+            for measurement in benchmark['measurements']:
+                output_strs += [color_format(use_color,
+                                             fmt_str,
+                                             BC_HEADER,
+                                             benchmark['name'],
+                                             first_col_width,
+                                             get_color(measurement['time']),
+                                             measurement['time'],
+                                             get_color(measurement['cpu']),
+                                             measurement['cpu'],
+                                             measurement['real_time'],
+                                             measurement['real_time_other'],
+                                             measurement['cpu_time'],
+                                             measurement['cpu_time_other'],
+                                             endc=BC_ENDC)]
+
+        # After processing the measurements, if requested and
+        # if applicable (e.g. u-test exists for given benchmark),
+        # print the U test.
+        if utest and benchmark['utest']:
+            output_strs += print_utest(benchmark['name'],
+                                       benchmark['utest'],
+                                       utest_alpha=utest_alpha,
+                                       first_col_width=first_col_width,
+                                       use_color=use_color)
+
+    return output_strs
+
+
+###############################################################################
+# Unit tests
+
+
+class TestGetUniqueBenchmarkNames(unittest.TestCase):
+    def load_results(self):
+        import json
+        testInputs = os.path.join(
+            os.path.dirname(
+                os.path.realpath(__file__)),
+            'Inputs')
+        testOutput = os.path.join(testInputs, 'test3_run0.json')
+        with open(testOutput, 'r') as f:
+            json = json.load(f)
+        return json
+
+    def test_basic(self):
+        expect_lines = [
+            'BM_One',
+            'BM_Two',
+            'short',  # These two are not sorted
+            'medium',  # These two are not sorted
+        ]
+        json = self.load_results()
+        output_lines = get_unique_benchmark_names(json)
+        print("\n")
+        print("\n".join(output_lines))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            self.assertEqual(expect_lines[i], output_lines[i])
+
+
+class TestReportDifference(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test1_run1.json')
+            testOutput2 = os.path.join(testInputs, 'test1_run2.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_SameTimes', '+0.0000', '+0.0000', '10', '10', '10', '10'],
+            ['BM_2xFaster', '-0.5000', '-0.5000', '50', '25', '50', '25'],
+            ['BM_2xSlower', '+1.0000', '+1.0000', '50', '100', '50', '100'],
+            ['BM_1PercentFaster', '-0.0100', '-0.0100', '100', '99', '100', '99'],
+            ['BM_1PercentSlower', '+0.0100', '+0.0100', '100', '101', '100', '101'],
+            ['BM_10PercentFaster', '-0.1000', '-0.1000', '100', '90', '100', '90'],
+            ['BM_10PercentSlower', '+0.1000', '+0.1000', '100', '110', '100', '110'],
+            ['BM_100xSlower', '+99.0000', '+99.0000',
+                '100', '10000', '100', '10000'],
+            ['BM_100xFaster', '-0.9900', '-0.9900',
+                '10000', '100', '10000', '100'],
+            ['BM_10PercentCPUToTime', '+0.1000',
+                '-0.1000', '100', '110', '100', '90'],
+            ['BM_ThirdFaster', '-0.3333', '-0.3334', '100', '67', '100', '67'],
+            ['BM_NotBadTimeUnit', '-0.9000', '+0.2000', '0', '0', '0', '1'],
+            ['BM_hasLabel', '+0.0000', '+0.0000', '1', '1', '1', '1'],
+            ['OVERALL_GEOMEAN', '-0.8113', '-0.7779', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(len(parts), 7)
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report_output(self):
+        expected_output = [
+            {
+                'name': 'BM_SameTimes',
+                'label': '',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000,
+                                  'real_time': 10, 'real_time_other': 10,
+                                  'cpu_time': 10, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xFaster',
+                'label': '',
+                'measurements': [{'time': -0.5000, 'cpu': -0.5000,
+                                  'real_time': 50, 'real_time_other': 25,
+                                  'cpu_time': 50, 'cpu_time_other': 25}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_2xSlower',
+                'label': '',
+                'measurements': [{'time': 1.0000, 'cpu': 1.0000,
+                                  'real_time': 50, 'real_time_other': 100,
+                                  'cpu_time': 50, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentFaster',
+                'label': '',
+                'measurements': [{'time': -0.0100, 'cpu': -0.0100,
+                                  'real_time': 100, 'real_time_other': 98.9999999,
+                                  'cpu_time': 100, 'cpu_time_other': 98.9999999}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_1PercentSlower',
+                'label': '',
+                'measurements': [{'time': 0.0100, 'cpu': 0.0100,
+                                  'real_time': 100, 'real_time_other': 101,
+                                  'cpu_time': 100, 'cpu_time_other': 101}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentFaster',
+                'label': '',
+                'measurements': [{'time': -0.1000, 'cpu': -0.1000,
+                                  'real_time': 100, 'real_time_other': 90,
+                                  'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentSlower',
+                'label': '',
+                'measurements': [{'time': 0.1000, 'cpu': 0.1000,
+                                  'real_time': 100, 'real_time_other': 110,
+                                  'cpu_time': 100, 'cpu_time_other': 110}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xSlower',
+                'label': '',
+                'measurements': [{'time': 99.0000, 'cpu': 99.0000,
+                                  'real_time': 100, 'real_time_other': 10000,
+                                  'cpu_time': 100, 'cpu_time_other': 10000}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_100xFaster',
+                'label': '',
+                'measurements': [{'time': -0.9900, 'cpu': -0.9900,
+                                  'real_time': 10000, 'real_time_other': 100,
+                                  'cpu_time': 10000, 'cpu_time_other': 100}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_10PercentCPUToTime',
+                'label': '',
+                'measurements': [{'time': 0.1000, 'cpu': -0.1000,
+                                  'real_time': 100, 'real_time_other': 110,
+                                  'cpu_time': 100, 'cpu_time_other': 90}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_ThirdFaster',
+                'label': '',
+                'measurements': [{'time': -0.3333, 'cpu': -0.3334,
+                                  'real_time': 100, 'real_time_other': 67,
+                                  'cpu_time': 100, 'cpu_time_other': 67}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'BM_NotBadTimeUnit',
+                'label': '',
+                'measurements': [{'time': -0.9000, 'cpu': 0.2000,
+                                  'real_time': 0.4, 'real_time_other': 0.04,
+                                  'cpu_time': 0.5, 'cpu_time_other': 0.6}],
+                'time_unit': 's',
+                'utest': {}
+            },
+            {
+                'name': 'BM_hasLabel',
+                'label': 'a label',
+                'measurements': [{'time': 0.0000, 'cpu': 0.0000,
+                                  'real_time': 1, 'real_time_other': 1,
+                                  'cpu_time': 1, 'cpu_time_other': 1}],
+                'time_unit': 's',
+                'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'label': '',
+                'measurements': [{'real_time': 3.1622776601683826e-06, 'cpu_time': 3.2130844755623912e-06,
+                                  'real_time_other': 1.9768988699420897e-07, 'cpu_time_other': 2.397447755209533e-07,
+                                  'time': -0.8112976497120911, 'cpu': -0.7778551721181174}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean', 'utest': {}
+            },
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['label'], expected['label'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceBetweenFamilies(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test2_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        json = load_result()
+        json1 = filter_benchmark(json, "BM_Z.ro", ".")
+        json2 = filter_benchmark(json, "BM_O.e", ".")
+        cls.json_diff_report = get_difference_report(json1, json2)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['.', '-0.5000', '-0.5000', '10', '5', '10', '5'],
+            ['./4', '-0.5000', '-0.5000', '40', '20', '40', '20'],
+            ['Prefix/.', '-0.5000', '-0.5000', '20', '10', '20', '10'],
+            ['Prefix/./3', '-0.5000', '-0.5000', '30', '15', '30', '15'],
+            ['OVERALL_GEOMEAN', '-0.5000', '-0.5000', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(len(parts), 7)
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 10, 'real_time_other': 5, 'cpu_time': 10, 'cpu_time_other': 5}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'./4',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 40, 'real_time_other': 20, 'cpu_time': 40, 'cpu_time_other': 20}],
+                'time_unit': 'ns',
+                'utest': {},
+            },
+            {
+                'name': u'Prefix/.',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 20, 'real_time_other': 10, 'cpu_time': 20, 'cpu_time_other': 10}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'Prefix/./3',
+                'measurements': [{'time': -0.5, 'cpu': -0.5, 'real_time': 30, 'real_time_other': 15, 'cpu_time': 30, 'cpu_time_other': 15}],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 2.213363839400641e-08, 'cpu_time': 2.213363839400641e-08,
+                                  'real_time_other': 1.1066819197003185e-08, 'cpu_time_other': 1.1066819197003185e-08,
+                                  'time': -0.5000000000000009, 'cpu': -0.5000000000000009}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceWithUTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '1.0000',
+             '0.6667',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.2000',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report_pretty_printing_aggregates_only(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two_pvalue',
+             '1.0000',
+             '0.6667',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.2000',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report, include_aggregates_only=True, utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'time': -0.375,
+                     'cpu': -0.3375,
+                     'real_time': 8,
+                     'real_time_other': 5,
+                     'cpu_time': 80,
+                     'cpu_time_other': 53}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+                                  'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+                                  'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceWithUTestWhileDisplayingAggregatesOnly(
+        unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test3_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test3_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['BM_One', '-0.1000', '+0.1000', '10', '9', '100', '110'],
+            ['BM_Two', '+0.1111', '-0.0111', '9', '10', '90', '89'],
+            ['BM_Two', '-0.1250', '-0.1628', '8', '7', '86', '72'],
+            ['BM_Two_pvalue',
+             '1.0000',
+             '0.6667',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '2.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['short', '-0.1250', '-0.0625', '8', '7', '80', '75'],
+            ['short', '-0.4325', '-0.1351', '8', '5', '77', '67'],
+            ['short_pvalue',
+             '0.7671',
+             '0.2000',
+             'U',
+             'Test,',
+             'Repetitions:',
+             '2',
+             'vs',
+             '3.',
+             'WARNING:',
+             'Results',
+             'unreliable!',
+             '9+',
+             'repetitions',
+             'recommended.'],
+            ['medium', '-0.3750', '-0.3375', '8', '5', '80', '53'],
+            ['OVERALL_GEOMEAN', '+1.6405', '-0.6985', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
+            utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'BM_One',
+                'measurements': [
+                    {'time': -0.1,
+                     'cpu': 0.1,
+                     'real_time': 10,
+                     'real_time_other': 9,
+                     'cpu_time': 100,
+                     'cpu_time_other': 110}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            },
+            {
+                'name': u'BM_Two',
+                'measurements': [
+                    {'time': 0.1111111111111111,
+                     'cpu': -0.011111111111111112,
+                     'real_time': 9,
+                     'real_time_other': 10,
+                     'cpu_time': 90,
+                     'cpu_time_other': 89},
+                    {'time': -0.125, 'cpu': -0.16279069767441862, 'real_time': 8,
+                        'real_time_other': 7, 'cpu_time': 86, 'cpu_time_other': 72}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.6666666666666666, 'time_pvalue': 1.0
+                }
+            },
+            {
+                'name': u'short',
+                'measurements': [
+                    {'time': -0.125,
+                     'cpu': -0.0625,
+                     'real_time': 8,
+                     'real_time_other': 7,
+                     'cpu_time': 80,
+                     'cpu_time_other': 75},
+                    {'time': -0.4325,
+                     'cpu': -0.13506493506493514,
+                     'real_time': 8,
+                     'real_time_other': 4.54,
+                     'cpu_time': 77,
+                     'cpu_time_other': 66.6}
+                ],
+                'time_unit': 'ns',
+                'utest': {
+                    'have_optimal_repetitions': False, 'cpu_pvalue': 0.2, 'time_pvalue': 0.7670968684102772
+                }
+            },
+            {
+                'name': u'medium',
+                'measurements': [
+                    {'real_time_other': 5,
+                     'cpu_time': 80,
+                     'time': -0.375,
+                     'real_time': 8,
+                     'cpu_time_other': 53,
+                     'cpu': -0.3375
+                     }
+                ],
+                'utest': {},
+                'time_unit': u'ns',
+                'aggregate_name': ''
+            },
+            {
+                'name': 'OVERALL_GEOMEAN',
+                'measurements': [{'real_time': 8.48528137423858e-09, 'cpu_time': 8.441336246629233e-08,
+                                  'real_time_other': 2.2405267593145244e-08, 'cpu_time_other': 2.5453661413660466e-08,
+                                  'time': 1.6404861082353634, 'cpu': -0.6984640740519662}],
+                'time_unit': 's',
+                'run_type': 'aggregate',
+                'aggregate_name': 'geomean',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportDifferenceForPercentageAggregates(
+        unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_results():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput1 = os.path.join(testInputs, 'test4_run0.json')
+            testOutput2 = os.path.join(testInputs, 'test4_run1.json')
+            with open(testOutput1, 'r') as f:
+                json1 = json.load(f)
+            with open(testOutput2, 'r') as f:
+                json2 = json.load(f)
+            return json1, json2
+
+        json1, json2 = load_results()
+        cls.json_diff_report = get_difference_report(
+            json1, json2, utest=True)
+
+    def test_json_diff_report_pretty_printing(self):
+        expect_lines = [
+            ['whocares', '-0.5000', '+0.5000', '0', '0', '0', '0']
+        ]
+        output_lines_with_header = print_difference_report(
+            self.json_diff_report,
+            utest=True, utest_alpha=0.05, use_color=False)
+        output_lines = output_lines_with_header[2:]
+        print("\n")
+        print("\n".join(output_lines_with_header))
+        self.assertEqual(len(output_lines), len(expect_lines))
+        for i in range(0, len(output_lines)):
+            parts = [x for x in output_lines[i].split(' ') if x]
+            self.assertEqual(expect_lines[i], parts)
+
+    def test_json_diff_report(self):
+        expected_output = [
+            {
+                'name': u'whocares',
+                'measurements': [
+                    {'time': -0.5,
+                     'cpu': 0.5,
+                     'real_time': 0.01,
+                     'real_time_other': 0.005,
+                     'cpu_time': 0.10,
+                     'cpu_time_other': 0.15}
+                ],
+                'time_unit': 'ns',
+                'utest': {}
+            }
+        ]
+        self.assertEqual(len(self.json_diff_report), len(expected_output))
+        for out, expected in zip(
+                self.json_diff_report, expected_output):
+            self.assertEqual(out['name'], expected['name'])
+            self.assertEqual(out['time_unit'], expected['time_unit'])
+            assert_utest(self, out, expected)
+            assert_measurements(self, out, expected)
+
+
+class TestReportSorting(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        def load_result():
+            import json
+            testInputs = os.path.join(
+                os.path.dirname(
+                    os.path.realpath(__file__)),
+                'Inputs')
+            testOutput = os.path.join(testInputs, 'test4_run.json')
+            with open(testOutput, 'r') as f:
+                json = json.load(f)
+            return json
+
+        cls.json = load_result()
+
+    def test_json_diff_report_pretty_printing(self):
+        import util
+
+        expected_names = [
+            "99 family 0 instance 0 repetition 0",
+            "98 family 0 instance 0 repetition 1",
+            "97 family 0 instance 0 aggregate",
+            "96 family 0 instance 1 repetition 0",
+            "95 family 0 instance 1 repetition 1",
+            "94 family 0 instance 1 aggregate",
+            "93 family 1 instance 0 repetition 0",
+            "92 family 1 instance 0 repetition 1",
+            "91 family 1 instance 0 aggregate",
+            "90 family 1 instance 1 repetition 0",
+            "89 family 1 instance 1 repetition 1",
+            "88 family 1 instance 1 aggregate"
+        ]
+
+        for n in range(len(self.json['benchmarks']) ** 2):
+            random.shuffle(self.json['benchmarks'])
+            sorted_benchmarks = util.sort_benchmark_results(self.json)[
+                'benchmarks']
+            self.assertEqual(len(expected_names), len(sorted_benchmarks))
+            for out, expected in zip(sorted_benchmarks, expected_names):
+                self.assertEqual(out['name'], expected)
+
+
+def assert_utest(unittest_instance, lhs, rhs):
+    if lhs['utest']:
+        unittest_instance.assertAlmostEqual(
+            lhs['utest']['cpu_pvalue'],
+            rhs['utest']['cpu_pvalue'])
+        unittest_instance.assertAlmostEqual(
+            lhs['utest']['time_pvalue'],
+            rhs['utest']['time_pvalue'])
+        unittest_instance.assertEqual(
+            lhs['utest']['have_optimal_repetitions'],
+            rhs['utest']['have_optimal_repetitions'])
+    else:
+        # lhs is empty. assert if rhs is not.
+        unittest_instance.assertEqual(lhs['utest'], rhs['utest'])
+
+
+def assert_measurements(unittest_instance, lhs, rhs):
+    for m1, m2 in zip(lhs['measurements'], rhs['measurements']):
+        unittest_instance.assertEqual(m1['real_time'], m2['real_time'])
+        unittest_instance.assertEqual(m1['cpu_time'], m2['cpu_time'])
+        # m1['time'] and m1['cpu'] hold values which are being calculated,
+        # and therefore we must use almost-equal pattern.
+        unittest_instance.assertAlmostEqual(m1['time'], m2['time'], places=4)
+        unittest_instance.assertAlmostEqual(m1['cpu'], m2['cpu'], places=4)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
+# vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
+# kate: tab-width: 4; replace-tabs on; indent-width 4; tab-indents: off;
+# kate: indent-mode python; remove-trailing-spaces modified;
diff --git a/tools/compare_commits/gbench/util.py b/tools/compare_commits/gbench/util.py
new file mode 100644
index 00000000..5d0012c0
--- /dev/null
+++ b/tools/compare_commits/gbench/util.py
@@ -0,0 +1,181 @@
+"""util.py - General utilities for running, loading, and processing benchmarks
+"""
+import json
+import os
+import tempfile
+import subprocess
+import sys
+import functools
+
+# Input file type enumeration
+IT_Invalid = 0
+IT_JSON = 1
+IT_Executable = 2
+
+_num_magic_bytes = 2 if sys.platform.startswith('win') else 4
+
+
+def is_executable_file(filename):
+    """
+    Return 'True' if 'filename' names a valid file which is likely
+    an executable. A file is considered an executable if it starts with the
+    magic bytes for a EXE, Mach O, or ELF file.
+    """
+    if not os.path.isfile(filename):
+        return False
+    with open(filename, mode='rb') as f:
+        magic_bytes = f.read(_num_magic_bytes)
+    if sys.platform == 'darwin':
+        return magic_bytes in [
+            b'\xfe\xed\xfa\xce',  # MH_MAGIC
+            b'\xce\xfa\xed\xfe',  # MH_CIGAM
+            b'\xfe\xed\xfa\xcf',  # MH_MAGIC_64
+            b'\xcf\xfa\xed\xfe',  # MH_CIGAM_64
+            b'\xca\xfe\xba\xbe',  # FAT_MAGIC
+            b'\xbe\xba\xfe\xca'   # FAT_CIGAM
+        ]
+    elif sys.platform.startswith('win'):
+        return magic_bytes == b'MZ'
+    else:
+        return magic_bytes == b'\x7FELF'
+
+
+def is_json_file(filename):
+    """
+    Returns 'True' if 'filename' names a valid JSON output file.
+    'False' otherwise.
+    """
+    try:
+        with open(filename, 'r') as f:
+            json.load(f)
+        return True
+    except BaseException:
+        pass
+    return False
+
+
+def classify_input_file(filename):
+    """
+    Return a tuple (type, msg) where 'type' specifies the classified type
+    of 'filename'. If 'type' is 'IT_Invalid' then 'msg' is a human readable
+    string represeting the error.
+    """
+    ftype = IT_Invalid
+    err_msg = None
+    if not os.path.exists(filename):
+        err_msg = "'%s' does not exist" % filename
+    elif not os.path.isfile(filename):
+        err_msg = "'%s' does not name a file" % filename
+    elif is_executable_file(filename):
+        ftype = IT_Executable
+    elif is_json_file(filename):
+        ftype = IT_JSON
+    else:
+        err_msg = "'%s' does not name a valid benchmark executable or JSON file" % filename
+    return ftype, err_msg
+
+
+def check_input_file(filename):
+    """
+    Classify the file named by 'filename' and return the classification.
+    If the file is classified as 'IT_Invalid' print an error message and exit
+    the program.
+    """
+    ftype, msg = classify_input_file(filename)
+    if ftype == IT_Invalid:
+        print("Invalid input file: %s" % msg)
+        sys.exit(1)
+    return ftype
+
+
+def find_benchmark_flag(prefix, benchmark_flags):
+    """
+    Search the specified list of flags for a flag matching `<prefix><arg>` and
+    if it is found return the arg it specifies. If specified more than once the
+    last value is returned. If the flag is not found None is returned.
+    """
+    assert prefix.startswith('--') and prefix.endswith('=')
+    result = None
+    for f in benchmark_flags:
+        if f.startswith(prefix):
+            result = f[len(prefix):]
+    return result
+
+
+def remove_benchmark_flags(prefix, benchmark_flags):
+    """
+    Return a new list containing the specified benchmark_flags except those
+    with the specified prefix.
+    """
+    assert prefix.startswith('--') and prefix.endswith('=')
+    return [f for f in benchmark_flags if not f.startswith(prefix)]
+
+
+def load_benchmark_results(fname):
+    """
+    Read benchmark output from a file and return the JSON object.
+    REQUIRES: 'fname' names a file containing JSON benchmark output.
+    """
+    with open(fname, 'r') as f:
+        return json.load(f)
+
+
+def sort_benchmark_results(result):
+    benchmarks = result['benchmarks']
+
+    # From inner key to the outer key!
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['repetition_index'] if 'repetition_index' in benchmark else -1)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: 1 if 'run_type' in benchmark and benchmark['run_type'] == "aggregate" else 0)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['per_family_instance_index'] if 'per_family_instance_index' in benchmark else -1)
+    benchmarks = sorted(
+        benchmarks, key=lambda benchmark: benchmark['family_index'] if 'family_index' in benchmark else -1)
+
+    result['benchmarks'] = benchmarks
+    return result
+
+
+def run_benchmark(exe_name, benchmark_flags):
+    """
+    Run a benchmark specified by 'exe_name' with the specified
+    'benchmark_flags'. The benchmark is run directly as a subprocess to preserve
+    real time console output.
+    RETURNS: A JSON object representing the benchmark output
+    """
+    output_name = find_benchmark_flag('--benchmark_out=',
+                                      benchmark_flags)
+    is_temp_output = False
+    if output_name is None:
+        is_temp_output = True
+        thandle, output_name = tempfile.mkstemp()
+        os.close(thandle)
+        benchmark_flags = list(benchmark_flags) + \
+            ['--benchmark_out=%s' % output_name]
+
+    cmd = [exe_name] + benchmark_flags
+    print("RUNNING: %s" % ' '.join(cmd))
+    exitCode = subprocess.call(cmd)
+    if exitCode != 0:
+        print('TEST FAILED...')
+        sys.exit(exitCode)
+    json_res = load_benchmark_results(output_name)
+    if is_temp_output:
+        os.unlink(output_name)
+    return json_res
+
+
+def run_or_load_benchmark(filename, benchmark_flags):
+    """
+    Get the results for a specified benchmark. If 'filename' specifies
+    an executable benchmark then the results are generated by running the
+    benchmark. Otherwise 'filename' must name a valid JSON output file,
+    which is loaded and the result returned.
+    """
+    ftype = check_input_file(filename)
+    if ftype == IT_JSON:
+        return load_benchmark_results(filename)
+    if ftype == IT_Executable:
+        return run_benchmark(filename, benchmark_flags)
+    raise ValueError('Unknown file type %s' % ftype)
diff --git a/tools/compare_commits/requirements.txt b/tools/compare_commits/requirements.txt
new file mode 100644
index 00000000..3b3331b5
--- /dev/null
+++ b/tools/compare_commits/requirements.txt
@@ -0,0 +1 @@
+scipy>=1.5.0
\ No newline at end of file
diff --git a/tools/compare_commits/rootbench_compare.sh b/tools/compare_commits/rootbench_compare.sh
new file mode 100755
index 00000000..c2e0125a
--- /dev/null
+++ b/tools/compare_commits/rootbench_compare.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+
+: '
+This is a small script to compare performance between 2 different patches/commits.
+The script will take care of building 2 ROOT versions and running the desired benchmarks.
+And finally https://github.com/google/benchmark/blob/main/tools/compare.py compares both.
+See https://github.com/google/benchmark/blob/main/docs/tools.md.
+User needs to specify 2 commits and test regex to run on them:
+1. "r" - owner of the baseline (default is `root-project`)
+2. "b" - branch name of the baseline (default is `master`)
+3. "c" - commit hash of the baseline (if none provided, the latest commit is picked)
+4. "R" - owner of the baseline (default is `root-project`)
+5. "B" - branch name of the baseline (default is `master`)
+6. "C" - commit hash of the baseline (if none provided, the latest commit is picked)
+7. "t" - regex to be passed to rootbench ctest (default empty string, hence all tests)
+8. "n" - cores to use (default empty string, hence all cores of the machine)
+
+Example usage:
+```
+sh rootbench_compare.sh -b latest-stable -c 9e7933c9bddba3f6381ba2b8c11539c46e8cc2f4 \
+-R eguiraud -B df-bulk -t "(RDF)"
+```
+```
+sh rootbench_compare.sh -B latest-stable -t "(OpenDataBenchmarks|df102_NanoAODDimuonAnalysis)"
+```
+'
+
+root_owner_baseline="root-project"
+branch_baseline="master"
+commit_baseline=""
+
+root_owner_compared="root-project"
+branch_compared="master"
+commit_compared=""
+
+testregex="" # same for both builds
+
+while getopts r:b:c:R:B:C:t:n: flag
+do
+	case "${flag}" in
+		r) root_owner_baseline=${OPTARG};;
+		b) branch_baseline=${OPTARG};;
+		c) commit_baseline=${OPTARG};;
+		R) root_owner_compared=${OPTARG};;
+		B) branch_compared=${OPTARG};;
+		C) commit_compared=${OPTARG};;
+		t) testregex=${OPTARG};;
+		n) ncores=${OPTARG};;
+	esac
+done
+
+git clone https://github.com/${root_owner_baseline}/root --branch ${branch_baseline} baseline_root
+cd baseline_root
+if [${commit_baseline} != ""]
+then
+    git reset --hard ${commit_baseline}
+fi
+mkdir rbuild && cd rbuild
+cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS="-O2 -g -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" ..
+cmake --build . -- -j${ncores}
+cd ../..
+
+git clone https://github.com/${root_owner_compared}/root --branch ${branch_compared} compared_root
+cd compared_root
+if [${commit_compared} != ""]
+then
+    git reset --hard ${commit_compared}
+fi
+mkdir rbuild && cd rbuild
+cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_CXX_FLAGS="-O2 -g -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" ..
+cmake --build . -- -j${ncores}
+cd ../..
+
+source baseline_root/rbuild/bin/thisroot.sh
+mkdir baseline_bench && cd baseline_bench
+cmake -Drootbench-datafiles=ON -Drootbench-out-format-json=ON ../../..
+cmake --build . -- -j${ncores}
+RB_TEMP_FS="." ctest -R ${testregex} -j${ncores} --verbose
+cd ..
+
+source compared_root/rbuild/bin/thisroot.sh
+mkdir compared_bench && cd compared_bench
+cmake -Drootbench-datafiles=OFF -Drootbench-out-format-json=ON ../../..
+cmake --build . -- -j${ncores}
+RB_TEMP_FS="../baseline_bench" ctest -R ${testregex} -j${ncores} --verbose
+cd ..
+
+for file in $(find baseline_bench/root -name "*.json")
+do
+    python3 compare.py benchmarks $file compared_bench/root/${file#baseline_bench/root/} --display_aggregates_only
+done