Commit | Line | Data |
3fea05b9 |
1 | # This is a version of Algorithm::Diff that uses only a comparison function, |
2 | # like versions <= 0.59 used to. |
3 | # $Revision: 1.3 $ |
4 | |
5 | package Algorithm::DiffOld; |
6 | use strict; |
7 | use vars qw($VERSION @EXPORT_OK @ISA @EXPORT); |
8 | use integer; # see below in _replaceNextLargerWith() for mod to make |
9 | # if you don't use this |
10 | require Exporter; |
11 | @ISA = qw(Exporter); |
12 | @EXPORT = qw(); |
13 | @EXPORT_OK = qw(LCS diff traverse_sequences); |
14 | $VERSION = 1.10; # manually tracking Algorithm::Diff |
15 | |
16 | # McIlroy-Hunt diff algorithm |
17 | # Adapted from the Smalltalk code of Mario I. Wolczko, <mario@wolczko.com> |
18 | # by Ned Konz, perl@bike-nomad.com |
19 | |
20 | =head1 NAME |
21 | |
22 | Algorithm::DiffOld - Compute `intelligent' differences between two files / lists |
23 | but use the old (<=0.59) interface. |
24 | |
25 | =head1 NOTE |
26 | |
27 | This has been provided as part of the Algorithm::Diff package by Ned Konz. |
28 | This particular module is B<ONLY> for people who B<HAVE> to have the old |
29 | interface, which uses a comparison function rather than a key generating |
30 | function. |
31 | |
32 | Because each of the lines in one array have to be compared with each |
33 | of the lines in the other array, this does M*N comparisions. This can |
34 | be very slow. I clocked it at taking 18 times as long as the stock |
35 | version of Algorithm::Diff for a 4000-line file. It will get worse |
36 | quadratically as array sizes increase. |
37 | |
38 | =head1 SYNOPSIS |
39 | |
40 | use Algorithm::DiffOld qw(diff LCS traverse_sequences); |
41 | |
42 | @lcs = LCS( \@seq1, \@seq2, $comparison_function ); |
43 | |
44 | $lcsref = LCS( \@seq1, \@seq2, $comparison_function ); |
45 | |
46 | @diffs = diff( \@seq1, \@seq2, $comparison_function ); |
47 | |
48 | traverse_sequences( \@seq1, \@seq2, |
49 | { MATCH => $callback, |
50 | DISCARD_A => $callback, |
51 | DISCARD_B => $callback, |
52 | }, |
53 | $comparison_function ); |
54 | |
55 | =head1 COMPARISON FUNCTIONS |
56 | |
57 | Each of the main routines should be passed a comparison function. If you |
58 | aren't passing one in, B<use Algorithm::Diff instead>. |
59 | |
60 | These functions should return a true value when two items should compare |
61 | as equal. |
62 | |
63 | For instance, |
64 | |
65 | @lcs = LCS( \@seq1, \@seq2, sub { my ($a, $b) = @_; $a eq $b } ); |
66 | |
67 | but if that is all you're doing with your comparison function, just use |
68 | Algorithm::Diff and let it do this (this is its default). |
69 | |
70 | Or: |
71 | |
72 | sub someFunkyComparisonFunction |
73 | { |
74 | my ($a, $b) = @_; |
75 | $a =~ m{$b}; |
76 | } |
77 | |
78 | @diffs = diff( \@lines, \@patterns, \&someFunkyComparisonFunction ); |
79 | |
80 | which would allow you to diff an array @lines which consists of text |
81 | lines with an array @patterns which consists of regular expressions. |
82 | |
83 | This is actually the reason I wrote this version -- there is no way |
84 | to do this with a key generation function as in the stock Algorithm::Diff. |
85 | |
86 | =cut |
87 | |
88 | # Find the place at which aValue would normally be inserted into the array. If |
89 | # that place is already occupied by aValue, do nothing, and return undef. If |
90 | # the place does not exist (i.e., it is off the end of the array), add it to |
91 | # the end, otherwise replace the element at that point with aValue. |
92 | # It is assumed that the array's values are numeric. |
93 | # This is where the bulk (75%) of the time is spent in this module, so try to |
94 | # make it fast! |
95 | |
96 | sub _replaceNextLargerWith |
97 | { |
98 | my ( $array, $aValue, $high ) = @_; |
99 | $high ||= $#$array; |
100 | |
101 | # off the end? |
102 | if ( $high == -1 || $aValue > $array->[ -1 ] ) |
103 | { |
104 | push( @$array, $aValue ); |
105 | return $high + 1; |
106 | } |
107 | |
108 | # binary search for insertion point... |
109 | my $low = 0; |
110 | my $index; |
111 | my $found; |
112 | while ( $low <= $high ) |
113 | { |
114 | $index = ( $high + $low ) / 2; |
115 | # $index = int(( $high + $low ) / 2); # without 'use integer' |
116 | $found = $array->[ $index ]; |
117 | |
118 | if ( $aValue == $found ) |
119 | { |
120 | return undef; |
121 | } |
122 | elsif ( $aValue > $found ) |
123 | { |
124 | $low = $index + 1; |
125 | } |
126 | else |
127 | { |
128 | $high = $index - 1; |
129 | } |
130 | } |
131 | |
132 | # now insertion point is in $low. |
133 | $array->[ $low ] = $aValue; # overwrite next larger |
134 | return $low; |
135 | } |
136 | |
137 | # This method computes the longest common subsequence in $a and $b. |
138 | |
139 | # Result is array or ref, whose contents is such that |
140 | # $a->[ $i ] == $b->[ $result[ $i ] ] |
141 | # foreach $i in ( 0 .. $#result ) if $result[ $i ] is defined. |
142 | |
143 | # An additional argument may be passed; this is a CODE ref to a comparison |
144 | # routine. By default, comparisons will use "eq" . |
145 | # Note that this routine will be called as many as M*N times, so make it fast! |
146 | |
147 | # Additional parameters, if any, will be passed to the key generation routine. |
148 | |
149 | sub _longestCommonSubsequence |
150 | { |
151 | my $a = shift; # array ref |
152 | my $b = shift; # array ref |
153 | my $compare = shift || sub { my $a = shift; my $b = shift; $a eq $b }; |
154 | |
155 | my $aStart = 0; |
156 | my $aFinish = $#$a; |
157 | my $bStart = 0; |
158 | my $bFinish = $#$b; |
159 | my $matchVector = []; |
160 | |
161 | # First we prune off any common elements at the beginning |
162 | while ( $aStart <= $aFinish |
163 | and $bStart <= $bFinish |
164 | and &$compare( $a->[ $aStart ], $b->[ $bStart ], @_ ) ) |
165 | { |
166 | $matchVector->[ $aStart++ ] = $bStart++; |
167 | } |
168 | |
169 | # now the end |
170 | while ( $aStart <= $aFinish |
171 | and $bStart <= $bFinish |
172 | and &$compare( $a->[ $aFinish ], $b->[ $bFinish ], @_ ) ) |
173 | { |
174 | $matchVector->[ $aFinish-- ] = $bFinish--; |
175 | } |
176 | |
177 | my $thresh = []; |
178 | my $links = []; |
179 | |
180 | my ( $i, $ai, $j, $k ); |
181 | for ( $i = $aStart; $i <= $aFinish; $i++ ) |
182 | { |
183 | $k = 0; |
184 | # look for each element of @b between $bStart and $bFinish |
185 | # that matches $a->[ $i ], in reverse order |
186 | for ($j = $bFinish; $j >= $bStart; $j--) |
187 | { |
188 | next if ! &$compare( $a->[$i], $b->[$j], @_ ); |
189 | # optimization: most of the time this will be true |
190 | if ( $k |
191 | and $thresh->[ $k ] > $j |
192 | and $thresh->[ $k - 1 ] < $j ) |
193 | { |
194 | $thresh->[ $k ] = $j; |
195 | } |
196 | else |
197 | { |
198 | $k = _replaceNextLargerWith( $thresh, $j, $k ); |
199 | } |
200 | |
201 | # oddly, it's faster to always test this (CPU cache?). |
202 | if ( defined( $k ) ) |
203 | { |
204 | $links->[ $k ] = |
205 | [ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ]; |
206 | } |
207 | } |
208 | } |
209 | |
210 | if ( @$thresh ) |
211 | { |
212 | for ( my $link = $links->[ $#$thresh ]; $link; $link = $link->[ 0 ] ) |
213 | { |
214 | $matchVector->[ $link->[ 1 ] ] = $link->[ 2 ]; |
215 | } |
216 | } |
217 | |
218 | return wantarray ? @$matchVector : $matchVector; |
219 | } |
220 | |
221 | sub traverse_sequences |
222 | { |
223 | my $a = shift; # array ref |
224 | my $b = shift; # array ref |
225 | my $callbacks = shift || { }; |
226 | my $compare = shift; |
227 | my $matchCallback = $callbacks->{'MATCH'} || sub { }; |
228 | my $discardACallback = $callbacks->{'DISCARD_A'} || sub { }; |
229 | my $finishedACallback = $callbacks->{'A_FINISHED'}; |
230 | my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { }; |
231 | my $finishedBCallback = $callbacks->{'B_FINISHED'}; |
232 | my $matchVector = _longestCommonSubsequence( $a, $b, $compare, @_ ); |
233 | # Process all the lines in match vector |
234 | my $lastA = $#$a; |
235 | my $lastB = $#$b; |
236 | my $bi = 0; |
237 | my $ai; |
238 | for ( $ai = 0; $ai <= $#$matchVector; $ai++ ) |
239 | { |
240 | my $bLine = $matchVector->[ $ai ]; |
241 | if ( defined( $bLine ) ) # matched |
242 | { |
243 | &$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine; |
244 | &$matchCallback( $ai, $bi++, @_ ); |
245 | } |
246 | else |
247 | { |
248 | &$discardACallback( $ai, $bi, @_ ); |
249 | } |
250 | } |
251 | # the last entry (if any) processed was a match. |
252 | |
253 | if ( defined( $finishedBCallback ) && $ai <= $lastA ) |
254 | { |
255 | &$finishedBCallback( $bi, @_ ); |
256 | } |
257 | else |
258 | { |
259 | &$discardACallback( $ai++, $bi, @_ ) while ( $ai <= $lastA ); |
260 | } |
261 | |
262 | if ( defined( $finishedACallback ) && $bi <= $lastB ) |
263 | { |
264 | &$finishedACallback( $ai, @_ ); |
265 | } |
266 | else |
267 | { |
268 | &$discardBCallback( $ai, $bi++, @_ ) while ( $bi <= $lastB ); |
269 | } |
270 | return 1; |
271 | } |
272 | |
273 | sub LCS |
274 | { |
275 | my $a = shift; # array ref |
276 | my $matchVector = _longestCommonSubsequence( $a, @_ ); |
277 | my @retval; |
278 | my $i; |
279 | for ( $i = 0; $i <= $#$matchVector; $i++ ) |
280 | { |
281 | if ( defined( $matchVector->[ $i ] ) ) |
282 | { |
283 | push( @retval, $a->[ $i ] ); |
284 | } |
285 | } |
286 | return wantarray ? @retval : \@retval; |
287 | } |
288 | |
289 | sub diff |
290 | { |
291 | my $a = shift; # array ref |
292 | my $b = shift; # array ref |
293 | my $retval = []; |
294 | my $hunk = []; |
295 | my $discard = sub { push( @$hunk, [ '-', $_[ 0 ], $a->[ $_[ 0 ] ] ] ) }; |
296 | my $add = sub { push( @$hunk, [ '+', $_[ 1 ], $b->[ $_[ 1 ] ] ] ) }; |
297 | my $match = sub { push( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] }; |
298 | traverse_sequences( $a, $b, |
299 | { MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, |
300 | @_ ); |
301 | &$match(); |
302 | return wantarray ? @$retval : $retval; |
303 | } |
304 | |
305 | 1; |