huge pile of pod updates
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Util.pm
CommitLineData
910a0a6d 1package Text::Tradition::Parser::Util;
2
3use strict;
4use warnings;
5use Algorithm::Diff;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
9
25331c49 10=head1 NAME
910a0a6d 11
25331c49 12Text::Tradition::Parser::Util
13
14=head1 DESCRIPTION
15
16A collection of utilities used by multiple Text::Tradition parsers.
17Probably not of external interest.
18
19=head1 METHODS
20
21=head2 B<collate_variants>( $collation, @reading_ranges )
910a0a6d 22
23Given a set of readings in the form
24( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
25walks through each to identify those readings that are identical. The
26collation is a Text::Tradition::Collation object; the elements of
27@readings are Text::Tradition::Collation::Reading objects that appear
28on the collation graph.
29
910a0a6d 30=cut
31
32sub collate_variants {
33 my( $collation, @reading_sets ) = @_;
34
35 # Two different ways to do this, depending on whether we want
36 # transposed reading nodes to be merged into one (producing a
37 # nonlinear, bidirectional graph) or not (producing a relatively
38 # linear, unidirectional graph.)
027d819c 39 return $collation->linear ? _collate_linearly( @_ )
40 : _collate_nonlinearly( @_ );
910a0a6d 41}
42
027d819c 43sub _collate_linearly {
910a0a6d 44 my( $collation, $lemma_set, @variant_sets ) = @_;
45
46 my @unique;
47 my $substitutions = {};
48 push( @unique, @$lemma_set );
49 while( @variant_sets ) {
50 my $variant_set = shift @variant_sets;
51 # Use diff to do this job
52 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
53 {'keyGen' => \&_collation_hash} );
54 my @new_unique;
55 my %merged;
56 while( $diff->Next ) {
57 if( $diff->Same ) {
58 # merge the nodes
59 my @l = $diff->Items( 1 );
60 my @v = $diff->Items( 2 );
61 foreach my $i ( 0 .. $#l ) {
e4b0f464 62 if( !$merged{$l[$i]->id} ) {
f6e19c7c 63 next if $v[$i] eq $l[$i];
0068967c 64# print STDERR sprintf( "Merging %s into %s\n",
65# $v[$i]->id,
66# $l[$i]->id );
910a0a6d 67 $collation->merge_readings( $l[$i], $v[$i] );
e4b0f464 68 $merged{$l[$i]->id} = 1;
69 $substitutions->{$v[$i]->id} = $l[$i];
910a0a6d 70 } else {
e4b0f464 71 print STDERR "Would have double merged " . $l[$i]->id . "\n";
910a0a6d 72 }
73 }
74 # splice the lemma nodes into the variant set
75 my( $offset ) = $diff->Get( 'min2' );
76 splice( @$variant_set, $offset, scalar( @l ), @l );
77 push( @new_unique, @l );
78 } else {
79 # Keep the old unique readings
80 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
81 # Add the new readings to the 'unique' list
82 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
83 }
84 }
85 @unique = @new_unique;
86 }
87 return $substitutions;
88}
89
027d819c 90sub _collate_nonlinearly {
910a0a6d 91 my( $collation, $lemma_set, @variant_sets ) = @_;
92
93 my @unique;
94 my $substitutions = {};
95 push( @unique, @$lemma_set );
96 while( @variant_sets ) {
97 my $variant_set = shift @variant_sets;
98 # Simply match the first reading that carries the same word, so
99 # long as that reading has not yet been used to match another
100 # word in this variant. That way lies loopy madness.
101 my @distinct;
102 my %merged;
103 foreach my $idx ( 0 .. $#{$variant_set} ) {
104 my $vw = $variant_set->[$idx];
e4b0f464 105 my @same = grep { cmp_str( $_ ) eq $vw->text } @unique;
910a0a6d 106 my $matched;
107 if( @same ) {
108 foreach my $i ( 0 .. $#same ) {
e4b0f464 109 unless( $merged{$same[$i]->id} ) {
910a0a6d 110 #print STDERR sprintf( "Merging %s into %s\n",
e4b0f464 111 # $vw->id,
112 # $same[$i]->id );
910a0a6d 113 $collation->merge_readings( $same[$i], $vw );
e4b0f464 114 $merged{$same[$i]->id} = 1;
910a0a6d 115 $matched = $i;
116 $variant_set->[$idx] = $same[$i];
e4b0f464 117 $substitutions->{$vw->id} = $same[$i];
910a0a6d 118 }
119 }
120 }
121 unless( @same && defined($matched) ) {
122 push( @distinct, $vw );
123 }
124 }
125 push( @unique, @distinct );
126 }
127 return $substitutions;
128}
129
130sub _collation_hash {
131 my $node = shift;
132 return cmp_str( $node );
133}
134
027d819c 135=head2 B<cmp_str>
136
137Don't use this. Really.
138
139=cut
140
910a0a6d 141sub cmp_str {
142 my( $reading ) = @_;
e4b0f464 143 my $word = $reading->text();
910a0a6d 144 $word = lc( $word );
145 $word =~ s/\W//g;
146 $word =~ s/v/u/g;
147 $word =~ s/j/i/g;
148 $word =~ s/cha/ca/g;
149 $word =~ s/quatuor/quattuor/g;
150 $word =~ s/ioannes/iohannes/g;
151 return $word;
152}
153
25331c49 154=head2 B<check_for_repeated>( @readings )
910a0a6d 155
156Given an array of items, returns any items that appear in the array more
157than once.
158
159=cut
160
161sub check_for_repeated {
162 my @seq = @_;
163 my %unique;
164 my @repeated;
165 foreach ( @seq ) {
e4b0f464 166 if( exists $unique{$_->id} ) {
167 push( @repeated, $_->id );
910a0a6d 168 } else {
e4b0f464 169 $unique{$_->id} = 1;
910a0a6d 170 }
171 }
172 return @repeated;
173}
174
027d819c 175=head2 B<add_hash_entry>( $hash, $key, $entry )
176
177Very simple utility for adding $entry to the list at $hash->{$key}.
178
179=cut
180
910a0a6d 181sub add_hash_entry {
182 my( $hash, $key, $entry ) = @_;
183 if( exists $hash->{$key} ) {
184 push( @{$hash->{$key}}, $entry );
185 } else {
186 $hash->{$key} = [ $entry ];
187 }
188}
189
25331c49 1901;
191
192=head1 BUGS / TODO
193
194=over
195
196=item * Get rid of abomination that is cmp_str.
197
198=back
199
200=head1 LICENSE
201
202This package is free software and is provided "as is" without express
203or implied warranty. You can redistribute it and/or modify it under
204the same terms as Perl itself.
205
206=head1 AUTHOR
207
208Tara L Andrews E<lt>aurum@cpan.orgE<gt>