Commit | Line | Data |
910a0a6d |
1 | package Text::Tradition::Parser::Util; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Algorithm::Diff; |
6 | use Exporter 'import'; |
7 | use vars qw/ @EXPORT_OK /; |
8 | @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /; |
9 | |
25331c49 |
10 | =head1 NAME |
910a0a6d |
11 | |
25331c49 |
12 | Text::Tradition::Parser::Util |
13 | |
14 | =head1 DESCRIPTION |
15 | |
16 | A collection of utilities used by multiple Text::Tradition parsers. |
17 | Probably not of external interest. |
18 | |
19 | =head1 METHODS |
20 | |
21 | =head2 B<collate_variants>( $collation, @reading_ranges ) |
910a0a6d |
22 | |
23 | Given a set of readings in the form |
24 | ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) |
walks through each range to identify those readings that are identical.
The collation is a Text::Tradition::Collation object; the elements of
@reading_ranges are Text::Tradition::Collation::Reading objects that
appear on the collation graph.
29 | |
910a0a6d |
30 | =cut |
31 | |
# Collate a lemma reading set against any number of variant reading sets.
# Takes the collation object followed by the reading sets (arrayrefs of
# Reading objects); the first set is treated as the lemma.  Dispatches to
# the linear or nonlinear collation strategy depending on the collation's
# 'linear' attribute, and returns that strategy's substitution hashref.
sub collate_variants {
    my ( $collation, @reading_sets ) = @_;

    # Make sure the reading sets are unique, but keep the lemma first.
    # Sets are keyed by their stringified arrayref identity, so two
    # entries are "the same" only when they are the same arrayref.
    my $lemma = shift @reading_sets;
    my %unique_sets;
    foreach my $set ( @reading_sets ) {
        $unique_sets{$set} = $set;
    }
    delete $unique_sets{$lemma};
    my @sets = ( $lemma, values %unique_sets );

    # Two different ways to do this, depending on whether we want
    # transposed reading nodes to be merged into one (producing a
    # nonlinear, bidirectional graph) or not (producing a relatively
    # linear, unidirectional graph.)
    return $collation->linear
        ? _collate_linearly( $collation, @sets )
        : _collate_nonlinearly( $collation, @sets );
}
51 | |
027d819c |
# Collate variant sets against a growing list of unique readings using
# Algorithm::Diff.  Takes the collation, the lemma set (arrayref) and any
# number of variant sets; returns a hashref mapping the id of each
# merged-away reading to the Reading object it was merged into.
sub _collate_linearly {
    my( $collation, $lemma_set, @variant_sets ) = @_;

    my @unique;
    my $substitutions = {};
    # Seed the unique-reading list with the lemma readings.
    push( @unique, @$lemma_set );
    while( @variant_sets ) {
        my $variant_set = shift @variant_sets;
        # Use diff to do this job.  keyGen hashes each reading through
        # _collation_hash, so readings compare by (normalized) text
        # rather than by object identity.
        my $diff = Algorithm::Diff->new( \@unique, $variant_set,
                                         {'keyGen' => \&_collation_hash} );
        my @new_unique;
        my %merged;    # lemma-reading ids already merged within this variant set
        while( $diff->Next ) {
            if( $diff->Same ) {
                # merge the nodes
                my @l = $diff->Items( 1 );    # readings from @unique
                my @v = $diff->Items( 2 );    # matching readings from the variant set
                foreach my $i ( 0 .. $#l ) {
                    if( !$merged{$l[$i]->id} ) {
                        # Skip self-merges: the variant entry may already
                        # be the very same reading object.
                        next if $v[$i] eq $l[$i];
                        # print STDERR sprintf( "Merging %s into %s\n",
                        #                       $v[$i]->id,
                        #                       $l[$i]->id );
                        $collation->merge_readings( $l[$i], $v[$i] );
                        $merged{$l[$i]->id} = 1;
                        $substitutions->{$v[$i]->id} = $l[$i];
                    } else {
                        # A reading should be merged at most once per
                        # variant set; warn rather than merge again.
                        print STDERR "Would have double merged " . $l[$i]->id . "\n";
                    }
                }
                # splice the lemma nodes into the variant set, so later
                # passes see the canonical (merged) reading objects.
                # 'min2' is the start index of this hunk in sequence 2.
                my( $offset ) = $diff->Get( 'min2' );
                splice( @$variant_set, $offset, scalar( @l ), @l );
                push( @new_unique, @l );
            } else {
                # Keep the old unique readings
                push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
                # Add the new readings to the 'unique' list
                push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
            }
        }
        @unique = @new_unique;
    }
    return $substitutions;
}
98 | |
027d819c |
# Collate variant sets against a growing pool of unique readings by
# simple text matching (no diff), allowing transposed readings to merge.
# Takes the collation, the lemma set (arrayref) and any number of variant
# sets; returns a hashref mapping the id of each merged-away reading to
# the Reading object it was merged into.
sub _collate_nonlinearly {
    my( $collation, $lemma_set, @variant_sets ) = @_;

    my @unique;
    my $substitutions = {};
    push( @unique, @$lemma_set );
    while( @variant_sets ) {
        my $variant_set = shift @variant_sets;
        # Simply match the first reading that carries the same word, so
        # long as that reading has not yet been used to match another
        # word in this variant. That way lies loopy madness.
        my @distinct;
        my %merged;    # unique-reading ids consumed by this variant set
        foreach my $idx ( 0 .. $#{$variant_set} ) {
            my $vw = $variant_set->[$idx];
            my @same = grep { cmp_str( $_ ) eq $vw->text } @unique;
            my $matched;
            foreach my $candidate ( @same ) {
                next if $merged{$candidate->id};
                #print STDERR sprintf( "Merging %s into %s\n",
                #                      $vw->id, $candidate->id );
                $collation->merge_readings( $candidate, $vw );
                $merged{$candidate->id} = 1;
                $variant_set->[$idx] = $candidate;
                $substitutions->{$vw->id} = $candidate;
                $matched = 1;
                # BUGFIX: stop after the first successful merge.  The
                # original loop kept going and merged $vw into every
                # remaining unmerged same-text reading, clobbering the
                # recorded substitution and wrongly consuming readings
                # that later words in this variant might need.
                last;
            }
            # No match found (or all matches already consumed): this is
            # a genuinely new reading.
            push( @distinct, $vw ) unless $matched;
        }
        push( @unique, @distinct );
    }
    return $substitutions;
}
138 | |
# Key generator handed to Algorithm::Diff: readings hash to their
# (possibly normalized) text via cmp_str, so diff compares by word.
sub _collation_hash {
    return cmp_str( shift );
}
143 | |
027d819c |
144 | =head2 B<cmp_str> |
145 | |
146 | Don't use this. Really. |
147 | |
148 | =cut |
149 | |
910a0a6d |
# Return the comparison form of a reading's text.  For most traditions
# this is just the text itself; for the tradition whose name matches
# /158/ it applies a set of hard-coded orthographic normalizations
# (presumably Latin spelling variants — see the BUGS section).
sub cmp_str {
    my $reading = shift;
    my $word = $reading->text();
    return $word
        unless $reading->collation->tradition->name =~ /158/;
    $word = lc( $word );
    $word =~ s/\W//g;              # strip all non-word characters
    $word =~ tr/vj/ui/;            # v -> u, j -> i
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}
163 | |
25331c49 |
164 | =head2 B<check_for_repeated>( @readings ) |
910a0a6d |
165 | |
166 | Given an array of items, returns any items that appear in the array more |
167 | than once. |
168 | |
169 | =cut |
170 | |
# Given a list of objects responding to ->id, return the ids of those
# that occur more than once.  An id is reported once per extra
# occurrence (an item seen three times contributes its id twice).
sub check_for_repeated {
    my %seen;
    my @repeated;
    foreach my $item ( @_ ) {
        my $id = $item->id;
        # Post-increment: false (0) on first sight, true afterwards.
        push( @repeated, $id ) if $seen{$id}++;
    }
    return @repeated;
}
184 | |
027d819c |
185 | =head2 B<add_hash_entry>( $hash, $key, $entry ) |
186 | |
187 | Very simple utility for adding $entry to the list at $hash->{$key}. |
188 | |
189 | =cut |
190 | |
910a0a6d |
# Append $entry to the list stored at $hash->{$key}, creating the list
# if the key is new.  Autovivification makes the exists/else branching
# of the original unnecessary.  Returns nothing meaningful (the original
# implicitly returned a different value per branch; callers should not
# rely on the return value).
sub add_hash_entry {
    my( $hash, $key, $entry ) = @_;
    push( @{ $hash->{$key} }, $entry );
    return;
}
199 | |
25331c49 |
200 | 1; |
201 | |
202 | =head1 BUGS / TODO |
203 | |
204 | =over |
205 | |
206 | =item * Get rid of abomination that is cmp_str. |
207 | |
208 | =back |
209 | |
210 | =head1 LICENSE |
211 | |
212 | This package is free software and is provided "as is" without express |
213 | or implied warranty. You can redistribute it and/or modify it under |
214 | the same terms as Perl itself. |
215 | |
216 | =head1 AUTHOR |
217 | |
218 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |