7 use Algorithm::Diff ();
8 use vars qw{$VERSION @ISA @EXPORT};
15 ## Hunks are made of ops. An op is the starting index for each
16 ## sequence and the opcode:
17 use constant A => 0; # Array index before match/discard
19 use constant OPCODE => 2; # "-", " ", "+"
20 use constant FLAG => 3; # What to display if not OPCODE "!"
22 my %internal_styles = (
26 Table => undef, ## "internal", but in another module
30 my @seqs = ( shift, shift );
31 my $options = shift || {};
33 for my $i ( 0 .. 1 ) {
37 while ( $type eq "CODE" ) {
38 $seqs[$i] = $seq = $seq->( $options );
42 my $AorB = !$i ? "A" : "B";
44 if ( $type eq "ARRAY" ) {
45 ## This is most efficient :)
46 $options->{"OFFSET_$AorB"} = 0
47 unless defined $options->{"OFFSET_$AorB"};
49 elsif ( $type eq "SCALAR" ) {
50 $seqs[$i] = [split( /^/m, $$seq )];
51 $options->{"OFFSET_$AorB"} = 1
52 unless defined $options->{"OFFSET_$AorB"};
55 $options->{"OFFSET_$AorB"} = 1
56 unless defined $options->{"OFFSET_$AorB"};
57 $options->{"FILENAME_$AorB"} = $seq
58 unless defined $options->{"FILENAME_$AorB"};
59 $options->{"MTIME_$AorB"} = (stat($seq))[9]
60 unless defined $options->{"MTIME_$AorB"};
63 open F, "<$seq" or carp "$!: $seq";
68 elsif ( $type eq "GLOB" || UNIVERSAL::isa( $seq, "IO::Handle" ) ) {
69 $options->{"OFFSET_$AorB"} = 1
70 unless defined $options->{"OFFSET_$AorB"};
75 confess "Can't handle input of type ", ref;
81 my $output_handler = $options->{OUTPUT};
82 my $type = ref $output_handler ;
83 if ( ! defined $output_handler ) {
85 $output_handler = sub { $output .= shift };
87 elsif ( $type eq "CODE" ) {
90 elsif ( $type eq "SCALAR" ) {
91 my $out_ref = $output_handler;
92 $output_handler = sub { $$out_ref .= shift };
94 elsif ( $type eq "ARRAY" ) {
95 my $out_ref = $output_handler;
96 $output_handler = sub { push @$out_ref, shift };
98 elsif ( $type eq "GLOB" || UNIVERSAL::isa $output_handler, "IO::Handle" ) {
99 my $output_handle = $output_handler;
100 $output_handler = sub { print $output_handle shift };
103 croak "Unrecognized output type: $type";
106 my $style = $options->{STYLE};
107 $style = "Unified" unless defined $options->{STYLE};
108 $style = "Text::Diff::$style" if exists $internal_styles{$style};
110 if ( ! $style->can( "hunk" ) ) {
111 eval "require $style; 1" or die $@;
115 if ! ref $style && $style->can( "new" );
117 my $ctx_lines = $options->{CONTEXT};
118 $ctx_lines = 3 unless defined $ctx_lines;
119 $ctx_lines = 0 if $style->isa( "Text::Diff::OldStyle" );
121 my @keygen_args = $options->{KEYGEN_ARGS}
122 ? @{$options->{KEYGEN_ARGS}}
126 my $diffs = 0; ## Number of discards this hunk
127 my $ctx = 0; ## Number of " " (ctx_lines) ops pushed after last diff.
128 my @ops; ## ops (" ", +, -) in this hunk
129 my $hunks = 0; ## Number of hunks
132 $output_handler->( $style->file_header( @seqs, $options ) )
134 $output_handler->( $style->hunk_header( @seqs, @_, $options ) );
135 $output_handler->( $style->hunk ( @seqs, @_, $options ) );
136 $output_handler->( $style->hunk_footer( @seqs, @_, $options ) );
139 ## We keep 2*ctx_lines so that if a diff occurs
140 ## at 2*ctx_lines we continue to grow the hunk instead
141 ## of emitting diffs and context as we go. We
142 ## need to know the total length of both of the two
143 ## subsequences so the line count can be printed in the
145 my $dis_a = sub {push @ops, [@_[0,1],"-"]; ++$diffs ; $ctx = 0 };
146 my $dis_b = sub {push @ops, [@_[0,1],"+"]; ++$diffs ; $ctx = 0 };
148 Algorithm::Diff::traverse_sequences(
152 push @ops, [@_[0,1]," "];
154 if ( $diffs && ++$ctx > $ctx_lines * 2 ) {
155 $emit_ops->( [ splice @ops, 0, $#ops - $ctx_lines ] );
159 ## throw away context lines that aren't needed any more
160 shift @ops if ! $diffs && @ops > $ctx_lines;
165 $options->{KEYGEN}, # pass in user arguments for key gen function
170 $#ops -= $ctx - $ctx_lines if $ctx > $ctx_lines;
171 $emit_ops->( \@ops );
174 $output_handler->( $style->file_footer( @seqs, $options ) ) if $hunks;
176 return defined $output ? $output : $hunks;
181 my ( $p1, $fn1, $t1, $p2, $fn2, $t2 ) = @{$h}{
190 ## remember to change Text::Diff::Table if this logic is tweaked.
191 return "" unless defined $fn1 && defined $fn2;
194 $p1, " ", $fn1, defined $t1 ? "\t" . localtime $t1 : (), "\n",
195 $p2, " ", $fn2, defined $t2 ? "\t" . localtime $t2 : (), "\n",
199 ## _range encapsulates the building of, well, ranges. Turns out there are
202 my ( $ops, $a_or_b, $format ) = @_;
204 my $start = $ops->[ 0]->[$a_or_b];
205 my $after = $ops->[-1]->[$a_or_b];
207 ## The sequence indexes in the lines are from *before* the OPCODE is
208 ## executed, so we bump the last index up unless the OP indicates
211 unless $ops->[-1]->[OPCODE] eq ( $a_or_b == A ? "+" : "-" );
213 ## convert from 0..n index to 1..(n+1) line number. The unless modifier
214 ## handles diffs with no context, where only one file is affected. In this
215 ## case $start == $after indicates an empty range, and the $start must
216 ## not be incremented.
217 my $empty_range = $start == $after;
218 ++$start unless $empty_range;
222 ? $format eq "unified" && $empty_range
225 : $format eq "unified"
226 ? "$start,".($after-$start+1)
231 my ( $seqs, $op, $a_or_b, $op_prefixes ) = @_;
233 my $opcode = $op->[OPCODE];
234 return () unless defined $op_prefixes->{$opcode};
236 my $op_sym = defined $op->[FLAG] ? $op->[FLAG] : $opcode;
237 $op_sym = $op_prefixes->{$op_sym};
238 return () unless defined $op_sym;
240 $a_or_b = $op->[OPCODE] ne "+" ? 0 : 1 unless defined $a_or_b;
241 return ( $op_sym, $seqs->[$a_or_b][$op->[$a_or_b]] );
245 package Text::Diff::Base;
249 return bless { @_ }, ref $proto || $proto;
252 sub file_header { return "" }
254 sub hunk_header { return "" }
256 sub hunk { return "" }
258 sub hunk_footer { return "" }
260 sub file_footer { return "" }
263 @Text::Diff::Unified::ISA = qw( Text::Diff::Base );
265 sub Text::Diff::Unified::file_header {
266 shift; ## No instance data
270 { FILENAME_PREFIX_A => "---", FILENAME_PREFIX_B => "+++", %$options }
274 sub Text::Diff::Unified::hunk_header {
275 shift; ## No instance data
276 pop; ## Ignore options
281 _range( $ops, A, "unified" ),
283 _range( $ops, B, "unified" ),
288 sub Text::Diff::Unified::hunk {
289 shift; ## No instance data
290 pop; ## Ignore options
293 my $prefixes = { "+" => "+", " " => " ", "-" => "-" };
295 return join "", map _op_to_line( \@_, $_, undef, $prefixes ), @$ops
298 @Text::Diff::Context::ISA = qw( Text::Diff::Base );
300 sub Text::Diff::Context::file_header {
301 _header { FILENAME_PREFIX_A=>"***", FILENAME_PREFIX_B=>"---", %{$_[-1]} };
304 sub Text::Diff::Context::hunk_header {
305 return "***************\n";
308 sub Text::Diff::Context::hunk {
309 shift; ## No instance data
310 pop; ## Ignore options
312 ## Leave the sequences in @_[0,1]
314 my $a_range = _range( $ops, A, "" );
315 my $b_range = _range( $ops, B, "" );
317 ## Sigh. Gotta make sure that differences that aren't adds/deletions
318 ## get prefixed with "!", and that the old opcodes are removed.
320 for ( my $start = 0; $start <= $#$ops ; $start = $after ) {
321 ## Scan until next difference
323 my $opcode = $ops->[$start]->[OPCODE];
324 next if $opcode eq " ";
327 while ( $after <= $#$ops && $ops->[$after]->[OPCODE] ne " " ) {
328 $bang_it ||= $ops->[$after]->[OPCODE] ne $opcode;
333 for my $i ( $start..($after-1) ) {
334 $ops->[$i]->[FLAG] = "!";
339 my $b_prefixes = { "+" => "+ ", " " => " ", "-" => undef, "!" => "! " };
340 my $a_prefixes = { "+" => undef, " " => " ", "-" => "- ", "!" => "! " };
343 "*** ", $a_range, " ****\n",
344 map( _op_to_line( \@_, $_, A, $a_prefixes ), @$ops ),
345 "--- ", $b_range, " ----\n",
346 map( _op_to_line( \@_, $_, B, $b_prefixes ), @$ops ),
350 @Text::Diff::OldStyle::ISA = qw( Text::Diff::Base );
354 my $op = $ops->[0]->[OPCODE];
355 $op = "c" if grep $_->[OPCODE] ne $op, @$ops;
356 $op = "a" if $op eq "+";
357 $op = "d" if $op eq "-";
361 sub Text::Diff::OldStyle::hunk_header {
362 shift; ## No instance data
363 pop; ## ignore options
368 return join "", _range( $ops, A, "" ), $op, _range( $ops, B, "" ), "\n";
371 sub Text::Diff::OldStyle::hunk {
372 shift; ## No instance data
373 pop; ## ignore options
375 ## Leave the sequences in @_[0,1]
377 my $a_prefixes = { "+" => undef, " " => undef, "-" => "< " };
378 my $b_prefixes = { "+" => "> ", " " => undef, "-" => undef };
383 map( _op_to_line( \@_, $_, A, $a_prefixes ), @$ops ),
384 $op eq "c" ? "---\n" : (),
385 map( _op_to_line( \@_, $_, B, $b_prefixes ), @$ops ),
395 Text::Diff - Perform diffs on files and record sets
401 ## Mix and match filenames, strings, file handles, producer subs,
402 ## or arrays of records; returns diff in a string.
403 ## WARNING: can return B<large> diffs for large files.
404 my $diff = diff "file1.txt", "file2.txt", { STYLE => "Context" };
405 my $diff = diff \$string1, \$string2, \%options;
406 my $diff = diff \*FH1, \*FH2;
407 my $diff = diff \&reader1, \&reader2;
408 my $diff = diff \@records1, \@records2;
410 ## May also mix input types:
411 my $diff = diff \@records1, "file_B.txt";
415 C<diff()> provides a basic set of services akin to the GNU C<diff> utility. It
416 is not anywhere near as feature complete as GNU C<diff>, but it is better
417 integrated with Perl and available on all platforms. It is often faster than
418 shelling out to a system's C<diff> executable for small files, and generally
419 slower on larger files.
421 Relies on L<Algorithm::Diff> for, well, the algorithm. This may not produce
422 the same exact diff as a system's local C<diff> executable, but it will be a
423 valid diff and comprehensible by C<patch>. We haven't seen any differences
424 between Algorithm::Diff's logic and GNU diff's, but we have not examined them
425 to make sure they are indeed identical.
427 B<Note>: If you don't want to import the C<diff> function, do one of the
434 That's a pretty rare occurence, so C<diff()> is exported by default.
437 diff() takes two parameters from which to draw input and a set of
438 options to control it's output. The options are:
442 =item FILENAME_A, MTIME_A, FILENAME_B, MTIME_B
444 The name of the file and the modification time "files"
446 These are filled in automatically for each file when diff() is passed a
447 filename, unless a defined value is passed in.
449 If a filename is not passed in and FILENAME_A and FILENAME_B are not provided
450 or C<undef>, the header will not be printed.
452 Unused on C<OldStyle> diffs.
454 =item OFFSET_A, OFFSET_B
456 The index of the first line / element. These default to 1 for all
457 parameter types except ARRAY references, for which the default is 0. This
458 is because ARRAY references are presumed to be data structures, while the
459 others are line oriented text.
463 "Unified", "Context", "OldStyle", or an object or class reference for a class
464 providing C<file_header()>, C<hunk_header()>, C<hunk()>, C<hunk_footer()> and
465 C<file_footer()> methods. The two footer() methods are provided for
466 overloading only; none of the formats provide them.
468 Defaults to "Unified" (unlike standard C<diff>, but Unified is what's most
469 often used in submitting patches and is the most human readable of the three.
471 If the package indicated by the STYLE has no hunk() method, c<diff()> will
472 load it automatically (lazy loading). Since all such packages should inherit
473 from Text::Diff::Base, this should be marvy.
475 Styles may be specified as class names (C<STYLE => "Foo"), in which case they
476 will be C<new()>ed with no parameters, or as objects (C<STYLE => Foo->new>).
480 How many lines before and after each diff to display. Ignored on old-style
481 diffs. Defaults to 3.
485 Examples and their equivalent subroutines:
487 OUTPUT => \*FOOHANDLE, # like: sub { print FOOHANDLE shift() }
488 OUTPUT => \$output, # like: sub { $output .= shift }
489 OUTPUT => \@output, # like: sub { push @output, shift }
490 OUTPUT => sub { $output .= shift },
492 If no C<OUTPUT> is supplied, returns the diffs in a string. If
493 C<OUTPUT> is a C<CODE> ref, it will be called once with the (optional)
494 file header, and once for each hunk body with the text to emit. If
495 C<OUTPUT> is an L<IO::Handle>, output will be emitted to that handle.
497 =item FILENAME_PREFIX_A, FILENAME_PREFIX_B
499 The string to print before the filename in the header. Unused on C<OldStyle>
500 diffs. Defaults are C<"---">, C<"+++"> for Unified and C<"***">, C<"+++"> for
503 =item KEYGEN, KEYGEN_ARGS
505 These are passed to L<Algorithm::Diff/traverse_sequences>.
509 B<Note>: if neither C<FILENAME_> option is defined, the header will not be
510 printed. If at one is present, the other and both MTIME_ options must be
511 present or "Use of undefined variable" warnings will be generated (except
512 on C<OldStyle> diffs, which ignores these options).
514 =head1 Formatting Classes
516 These functions implement the output formats. They are grouped in to classes
517 so diff() can use class names to call the correct set of output routines and so
518 that you may inherit from them easily. There are no constructors or instance
519 methods for these classes, though subclasses may provide them if need be.
521 Each class has file_header(), hunk_header(), hunk(), and footer() methods
522 identical to those documented in the Text::Diff::Unified section. header() is
523 called before the hunk() is first called, footer() afterwards. The default
524 footer function is an empty method provided for overloading:
526 sub footer { return "End of patch\n" }
528 Some output formats are provided by external modules (which are loaded
529 automatically), such as L<Text::Diff::Table>. These are
530 are documented here to keep the documentation simple.
532 =head2 Text::Diff::Base
534 Returns "" for all methods (other than C<new()>).
536 =head2 Text::Diff::Unified
538 --- A Mon Nov 12 23:49:30 2001
539 +++ B Mon Nov 12 23:49:30 2001
561 $s = Text::Diff::Unified->file_header( $options );
563 Returns a string containing a unified header. The sole parameter is the
564 options hash passed in to diff(), containing at least:
573 FILENAME_PREFIX_A => "---",
574 FILENAME_PREFIX_B => "+++",
576 to override the default prefixes (default values shown).
580 Text::Diff::Unified->hunk_header( \@ops, $options );
582 Returns a string containing the output of one hunk of unified diff.
584 =item Text::Diff::Unified::hunk
586 Text::Diff::Unified->hunk( \@seq_a, \@seq_b, \@ops, $options );
588 Returns a string containing the output of one hunk of unified diff.
592 =head2 Text::Diff::Table
594 +--+----------------------------------+--+------------------------------+
595 | |../Test-Differences-0.2/MANIFEST | |../Test-Differences/MANIFEST |
596 | |Thu Dec 13 15:38:49 2001 | |Sat Dec 15 02:09:44 2001 |
597 +--+----------------------------------+--+------------------------------+
599 | 1|Differences.pm | 2|Differences.pm |
600 | 2|MANIFEST | 3|MANIFEST |
601 | | * 4|MANIFEST.SKIP *
602 | 3|Makefile.PL | 5|Makefile.PL |
603 | | * 6|t/00escape.t *
604 | 4|t/00flatten.t | 7|t/00flatten.t |
605 | 5|t/01text_vs_data.t | 8|t/01text_vs_data.t |
606 | 6|t/10test.t | 9|t/10test.t |
607 +--+----------------------------------+--+------------------------------+
609 This format also goes to some pains to highlight "invisible" characters on
610 differing elements by selectively escaping whitespace:
612 +--+--------------------------+--------------------------+
613 | |demo_ws_A.txt |demo_ws_B.txt |
614 | |Fri Dec 21 08:36:32 2001 |Fri Dec 21 08:36:50 2001 |
615 +--+--------------------------+--------------------------+
616 | 1|identical |identical |
617 * 2| spaced in | also spaced in *
618 * 3|embedded space |embedded tab *
619 | 4|identical |identical |
620 * 5| spaced in |\ttabbed in *
621 * 6|trailing spaces\s\s\n |trailing tabs\t\t\n *
622 | 7|identical |identical |
623 * 8|lf line\n |crlf line\r\n *
624 * 9|embedded ws |embedded\tws *
625 +--+--------------------------+--------------------------+
627 See L</Text::Diff::Table> for more details, including how the whitespace
630 =head2 Text::Diff::Context
632 *** A Mon Nov 12 23:49:30 2001
633 --- B Mon Nov 12 23:49:30 2001
664 Note: hunk_header() returns only "***************\n".
666 =head2 Text::Diff::OldStyle
677 Note: no file_header().
681 Must suck both input files entirely in to memory and store them with a normal
682 amount of Perlish overhead (one array location) per record. This is implied by
683 the implementation of Algorithm::Diff, which takes two arrays. If
684 Algorithm::Diff ever offers an incremental mode, this can be changed (contact
685 the maintainers of Algorithm::Diff and Text::Diff if you need this; it
686 shouldn't be too terribly hard to tie arrays in this fashion).
688 Does not provide most of the more refined GNU diff options: recursive directory
689 tree scanning, ignoring blank lines / whitespace, etc., etc. These can all be
690 added as time permits and need arises, many are rather easy; patches quite
693 Uses closures internally, this may lead to leaks on C<perl> versions 5.6.1 and
694 prior if used many times over a process' life time.
698 Adam Kennedy E<lt>adamk@cpan.orgE<gt>
700 Barrie Slaymaker E<lt>barries@slaysys.comE<gt>
704 Some parts copyright 2009 Adam Kennedy.
706 Copyright 2001 Barrie Slaymaker. All Rights Reserved.
708 You may use this under the terms of either the Artistic License or GNU Public
709 License v 2.0 or greater.