Skip to content

Commit

Permalink
add diff-pairs tool
Browse files Browse the repository at this point in the history
This takes the output of `diff-tree -z --raw` and feeds it
back to the later stages of the diff machinery to produce
diffs in other formats. Because the interim format contains
any whole-tree copy/rename information, you can safely feed
segments of the tree diff to get progressive patch-format
diffs. So something like:

  git diff-tree -r -z $a $b |
  git diff-pairs -p

should give you the same output that `git diff-tree -p`
would have.  Likewise, feeding each pair individually works,
too:

  git diff-tree -r -z -M $a $b |
  perl -0ne '
	my $meta = $_;
	my $path = <>;
	# only renames and copies have an extra path
	my $path2 = <> if $meta =~ /[RC]\d+/;

	print STDERR "feeding one diff\n";
	open(my $fh, "|git diff-pairs -p");
	print $fh $meta, $path, $path2;
  '

The renames will still be shown just as if the diff had been
done in one process.

Signed-off-by: Jeff King <[email protected]>
  • Loading branch information
peff committed Nov 25, 2024
1 parent 6ea2d9d commit 119bdc3
Show file tree
Hide file tree
Showing 8 changed files with 327 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
/git-diff
/git-diff-files
/git-diff-index
/git-diff-pairs
/git-diff-tree
/git-difftool
/git-difftool--helper
Expand Down
66 changes: 66 additions & 0 deletions Documentation/git-diff-pairs.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
git-diff-pairs(1)
=================

NAME
----
git-diff-pairs - Compare blob pairs generated by `diff-tree --raw`

SYNOPSIS
--------
[verse]
'git diff-pairs' [diff-options]

DESCRIPTION
-----------

Given the output of `diff-tree -z` on its stdin, `diff-pairs` will
reformat that output into whatever format is requested on its command
line. For example:

-----------------------------
git diff-tree -z -M $a $b |
git diff-pairs -p
-----------------------------

will compute the tree diff in one step (including renames), and then
`diff-pairs` will compute and format the blob-level diffs for each pair.
This can be used to modify the raw diff in the middle (without having to
parse or re-create more complicated formats like `--patch`), or to
compute diffs progressively over the course of multiple invocations of
`diff-pairs`.

Each blob pair is fed to the diff machinery individually and the output
is flushed immediately, meaning it is safe to interactively write to and
read from `diff-pairs`.

OPTIONS
-------

All diff options below are accepted, but note that tree-wide options
like `-M` are effectively no-ops, as we consider only one pair at a time.

include::diff-options.txt[]

include::diff-generate-patch.txt[]

BUGS
----

`diff-pairs` should handle any input generated by `diff-tree --raw -z`.
It may choke or otherwise misbehave on output from `diff-files`, etc.

Here's an incomplete list of things that `diff-pairs` could do, but
doesn't (mostly in the name of simplicity):

- Only `-z` input is accepted, not normal `--raw` input.

- Abbreviated sha1s are rejected in the input from `diff-tree`; if you
want to abbreviate the output, you can pass `--abbrev` to
`diff-pairs`.

- Pathspecs are not handled by `diff-pairs`; you can limit the diff via
the initial `diff-tree` invocation.

GIT
---
Part of the linkgit:git[1] suite
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,7 @@ BUILTIN_OBJS += builtin/describe.o
BUILTIN_OBJS += builtin/diagnose.o
BUILTIN_OBJS += builtin/diff-files.o
BUILTIN_OBJS += builtin/diff-index.o
BUILTIN_OBJS += builtin/diff-pairs.o
BUILTIN_OBJS += builtin/diff-tree.o
BUILTIN_OBJS += builtin/diff.o
BUILTIN_OBJS += builtin/difftool.o
Expand Down
1 change: 1 addition & 0 deletions builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ int cmd_diagnose(int argc, const char **argv, const char *prefix, struct reposit
int cmd_diff_files(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_diff_index(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_diff(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_diff_pairs(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_diff_tree(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_difftool(int argc, const char **argv, const char *prefix, struct repository *repo);
int cmd_env__helper(int argc, const char **argv, const char *prefix, struct repository *repo);
Expand Down
174 changes: 174 additions & 0 deletions builtin/diff-pairs.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
#include "builtin.h"
#include "commit.h"
#include "diff.h"
#include "diffcore.h"
#include "revision.h"
#include "config.h"
#include "builtin.h"
#include "hex.h"

/*
 * Usage text printed for "-h" and when leftover arguments are found.
 *
 * Every NUL separator in the record syntax is spelled "\\0" so that it
 * prints as a literal backslash-zero. A bare "\0" escape would embed a
 * real NUL byte in the string and cut the message short at that point.
 */
static const char diff_pairs_usage[] =
"git diff-pairs [diff-options]\n"
"\n"
"Reads pairs of blobs from stdin in 'diff-tree -z' syntax:\n"
"\n"
" :<mode_a> <mode_b> <sha1_a> <sha1_b> <type>\\0<path>\\0[path2\\0]\n"
"\n"
"and outputs the diff for each a/b pair to stdout.";

/*
 * Parse a file mode from the start of "mode" via parse_mode() and return
 * it. "*endp" receives whatever parse_mode() returned; a NULL result is
 * treated as a parse failure and is fatal.
 */
static unsigned parse_mode_or_die(const char *mode, const char **endp)
{
	uint16_t parsed;
	const char *rest = parse_mode(mode, &parsed);

	*endp = rest;
	if (rest == NULL)
		die("unable to parse mode: %s", mode);
	return parsed;
}

/*
 * Parse a hex object id for hash algorithm "algop" from "p" into "oid".
 * The id must be followed by a single space; on success "*endp" points
 * just past that space. Any failure is fatal.
 */
static void parse_oid(const char *p, struct object_id *oid, const char **endp,
		      const struct git_hash_algo *algop)
{
	int failed = parse_oid_hex_algop(p, oid, endp, algop);

	if (!failed) {
		/* Consume the separator, then verify it was a space. */
		char sep = **endp;

		(*endp)++;
		failed = (sep != ' ');
	}

	if (failed)
		die("unable to parse object id: %s", p);
}

/*
 * Parse the similarity percentage that follows an R/C status letter
 * (e.g. the "100" in "R100") and scale it by MAX_SCORE / 100, so that
 * 100 maps to the top of the internal score range.
 *
 * Dies if "score" is empty, has trailing characters, is out of range for
 * strtoul(), or is larger than 100.
 *
 * The percentage is validated *before* scaling. strtoul() silently
 * negates input such as "-1" into a huge value, and scaling such a value
 * first could overflow (or wrap into something that looks valid) before
 * the checks get a chance to reject it.
 */
static unsigned short parse_score(const char *score)
{
	unsigned long ret;
	char *endp;

	errno = 0;
	ret = strtoul(score, &endp, 10);
	if (errno || endp == score || *endp || ret > 100)
		die("unable to parse rename/copy score: %s", score);

	ret *= MAX_SCORE / 100;
	/* Safety net: the scaled value must still fit the return type. */
	if ((unsigned short)ret != ret)
		die("unable to parse rename/copy score: %s", score);
	return ret;
}

/*
* The pair-creation is mostly done by diff_change and diff_addremove,
* which queue the filepair without returning it. So we have to resort
* to pulling it out of the global diff queue.
*/
static void set_pair_status(char status)
{
	/*
	 * An empty queue means the pair was not considered worth queueing.
	 * That normally should not happen (part of the diff gets dropped),
	 * but options such as --ignore-submodules can cause it. In that
	 * case there is nothing to tag, so quietly go along with it.
	 */
	if (diff_queued_diff.nr)
		diff_queued_diff.queue[0]->status = status;
}

/*
 * Entry point for "git diff-pairs".
 *
 * Reads NUL-terminated raw diff records from stdin, queues each record as
 * a filepair and flushes it immediately through the diff machinery, using
 * the output format selected on the command line (raw if none was given).
 * Returns 0 once stdin is exhausted; malformed input is fatal.
 */
int cmd_diff_pairs(int argc, const char **argv, const char *prefix,
		   struct repository *repo)
{
	struct strbuf meta = STRBUF_INIT;
	struct strbuf path = STRBUF_INIT;
	struct strbuf path_dst = STRBUF_INIT;
	struct rev_info revs;

	if (argc > 1 && !strcmp(argv[1], "-h"))
		usage(diff_pairs_usage);

	repo_init_revisions(repo, &revs, prefix);
	repo_config(repo, git_diff_basic_config, NULL);
	revs.disable_stdin = 1;
	argc = setup_revisions(argc, argv, &revs, NULL);

	/* Pathspecs are not supported, so nothing may be left over. */
	if (argc > 1)
		usage(diff_pairs_usage);

	if (!revs.diffopt.output_format)
		revs.diffopt.output_format = DIFF_FORMAT_RAW;

	/* One iteration per record; EOF between records ends the run. */
	while (strbuf_getline_nul(&meta, stdin) != EOF) {
		struct object_id oid_a, oid_b;
		unsigned mode_a, mode_b;
		const char *cursor = meta.buf;
		char status;

		/* The leading ':' is accepted but not required. */
		if (*cursor == ':')
			cursor++;

		mode_a = parse_mode_or_die(cursor, &cursor);
		mode_b = parse_mode_or_die(cursor, &cursor);

		parse_oid(cursor, &oid_a, &cursor, repo->hash_algo);
		parse_oid(cursor, &oid_b, &cursor, repo->hash_algo);

		/* Status letter; for R/C the score text follows it. */
		status = *cursor;
		cursor++;

		if (strbuf_getline_nul(&path, stdin) == EOF)
			die("got EOF while reading path");

		if (status == DIFF_STATUS_ADDED) {
			diff_addremove(&revs.diffopt, '+',
				       mode_b, &oid_b,
				       1, path.buf, 0);
			set_pair_status(status);
		} else if (status == DIFF_STATUS_DELETED) {
			diff_addremove(&revs.diffopt, '-',
				       mode_a, &oid_a,
				       1, path.buf, 0);
			set_pair_status(status);
		} else if (status == DIFF_STATUS_TYPE_CHANGED ||
			   status == DIFF_STATUS_MODIFIED) {
			diff_change(&revs.diffopt,
				    mode_a, mode_b,
				    &oid_a, &oid_b,
				    1, 1, path.buf, 0, 0);
			set_pair_status(status);
		} else if (status == DIFF_STATUS_RENAMED ||
			   status == DIFF_STATUS_COPIED) {
			struct diff_filespec *src, *dst;
			struct diff_filepair *pair;

			/* Renames and copies carry a second path record. */
			if (strbuf_getline_nul(&path_dst, stdin) == EOF)
				die("got EOF while reading secondary path");

			src = alloc_filespec(path.buf);
			dst = alloc_filespec(path_dst.buf);
			fill_filespec(src, &oid_a, 1, mode_a);
			fill_filespec(dst, &oid_b, 1, mode_b);

			/* Queue the pair by hand so it can be tagged directly. */
			pair = diff_queue(&diff_queued_diff, src, dst);
			pair->status = status;
			pair->score = parse_score(cursor);
			pair->renamed_pair = 1;
		} else {
			die("unknown diff status: %c", status);
		}

		/* Emit this pair right away rather than batching. */
		diff_flush(&revs.diffopt);
	}

	strbuf_release(&meta);
	strbuf_release(&path);
	strbuf_release(&path_dst);
	release_revisions(&revs);

	return 0;
}
1 change: 1 addition & 0 deletions command-list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ git-diagnose ancillaryinterrogators
git-diff mainporcelain info
git-diff-files plumbinginterrogators
git-diff-index plumbinginterrogators
git-diff-pairs plumbinginterrogators
git-diff-tree plumbinginterrogators
git-difftool ancillaryinterrogators complete
git-fast-export ancillarymanipulators
Expand Down
1 change: 1 addition & 0 deletions git.c
Original file line number Diff line number Diff line change
Expand Up @@ -544,6 +544,7 @@ static struct cmd_struct commands[] = {
{ "diff", cmd_diff, NO_PARSEOPT },
{ "diff-files", cmd_diff_files, RUN_SETUP | NEED_WORK_TREE | NO_PARSEOPT },
{ "diff-index", cmd_diff_index, RUN_SETUP | NO_PARSEOPT },
{ "diff-pairs", cmd_diff_pairs, RUN_SETUP | NO_PARSEOPT },
{ "diff-tree", cmd_diff_tree, RUN_SETUP | NO_PARSEOPT },
{ "difftool", cmd_difftool, RUN_SETUP_GENTLY },
{ "fast-export", cmd_fast_export, RUN_SETUP },
Expand Down
82 changes: 82 additions & 0 deletions t/t4070-diff-pairs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
#!/bin/sh

test_description='basic diff-pairs tests'
. ./test-lib.sh

# Build two tagged commits, "base" and "new", whose diff contains added,
# modified, deleted, renamed, copied, and typechange entries. That
# includes one entry in a subdirectory (for the non-recursive test), a
# rename with unchanged content ("two-hundred" -> "renamed") and a copy
# with one line changed ("five-hundred" -> "copied"), so both exact and
# inexact similarity scores show up. Later tests diff these two tags.
test_expect_success 'create commit with various diffs' '
echo to-be-gone >deleted &&
echo original >modified &&
echo now-a-file >symlink &&
test_seq 200 >two-hundred &&
test_seq 201 500 >five-hundred &&
git add . &&
test_tick &&
git commit -m base &&
git tag base &&
echo now-here >added &&
echo new >modified &&
rm deleted &&
mkdir subdir &&
echo content >subdir/file &&
mv two-hundred renamed &&
test_seq 201 500 | sed s/300/modified/ >copied &&
rm symlink &&
git add -A . &&
test_ln_s_add dest symlink &&
test_tick &&
git commit -m new &&
git tag new
'

# Round trip in raw format: feeding "diff-tree -z" output through
# diff-pairs (which defaults to raw output) must reproduce what
# diff-tree prints directly without -z.
test_expect_success 'diff-pairs recreates --raw' '
git diff-tree -r -M -C -C base new >expect &&
# note that diff-pairs uses the default abbrev,
# so we must tweak that for identical output
git diff-tree -r -M -C -C -z base new |
git diff-pairs --no-abbrev >actual &&
test_cmp expect actual
'

# Patch output generated by diff-pairs from the raw -z records must match
# the patch that diff-tree produces on its own for the same two commits.
test_expect_success 'diff-pairs can create -p output' '
git diff-tree -p -M -C -C base new >expect &&
git diff-tree -r -M -C -C -z base new |
git diff-pairs -p >actual &&
test_cmp expect actual
'

# Same raw round trip, but without -r. Presumably the "subdir" directory
# then shows up as a single tree entry (see the test title); diff-pairs
# must pass that record through unchanged.
test_expect_success 'non-recursive --raw retains tree entry' '
git diff-tree base new >expect &&
git diff-tree -z base new |
git diff-pairs --no-abbrev >actual &&
test_cmp expect actual
'

# Split the raw -z diff into one file per pair, run a separate diff-pairs
# process on each file, and check that the concatenated patches equal the
# patch produced by a single diff-tree run (renames and copies included).
test_expect_success 'split input across multiple diff-pairs' '
	write_script split-raw-diff "$PERL_PATH" <<-\EOF &&
	# Records are NUL-terminated.
	$/ = "\0";
	while (<>) {
		my $meta = $_;
		my $path = <>;
		# Renames and copies have an extra path. Declare the
		# variable unconditionally: "my ... if ..." is documented
		# by perlsyn as undefined behavior.
		my $path2 = "";
		$path2 = <> if $meta =~ /[RC]\d+/;
		# $. is the number of records read so far, so each pair
		# gets its own zero-padded file name in input order.
		open(my $fh, ">", sprintf "diff%03d", $.);
		print $fh $meta, $path, $path2;
	}
	EOF
	git diff-tree -p -M -C -C base new >expect &&
	git diff-tree -r -z -M -C -C base new |
	./split-raw-diff &&
	for i in diff*; do
		git diff-pairs -p <$i || return 1
	done >actual &&
	test_cmp expect actual
'

test_done

0 comments on commit 119bdc3

Please sign in to comment.