fix perldoc
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Util.pm
CommitLineData
910a0a6d 1package Text::Tradition::Parser::Util;
2
3use strict;
4use warnings;
5use Algorithm::Diff;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
9
25331c49 10=head1 NAME
910a0a6d 11
25331c49 12Text::Tradition::Parser::Util
13
14=head1 DESCRIPTION
15
16A collection of utilities used by multiple Text::Tradition parsers.
17Probably not of external interest.
18
19=head1 METHODS
20
21=head2 B<collate_variants>( $collation, @reading_ranges )
910a0a6d 22
23Given a set of readings in the form
24( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
25walks through each to identify those readings that are identical. The
26collation is a Text::Tradition::Collation object; the elements of
27@readings are Text::Tradition::Collation::Reading objects that appear
28on the collation graph.
29
30TODO: Handle collapsed and non-collapsed transpositions.
31
32=cut
33
34sub collate_variants {
35 my( $collation, @reading_sets ) = @_;
36
37 # Two different ways to do this, depending on whether we want
38 # transposed reading nodes to be merged into one (producing a
39 # nonlinear, bidirectional graph) or not (producing a relatively
40 # linear, unidirectional graph.)
41 return $collation->linear ? collate_linearly( @_ )
42 : collate_nonlinearly( @_ );
43}
44
45sub collate_linearly {
46 my( $collation, $lemma_set, @variant_sets ) = @_;
47
48 my @unique;
49 my $substitutions = {};
50 push( @unique, @$lemma_set );
51 while( @variant_sets ) {
52 my $variant_set = shift @variant_sets;
53 # Use diff to do this job
54 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
55 {'keyGen' => \&_collation_hash} );
56 my @new_unique;
57 my %merged;
58 while( $diff->Next ) {
59 if( $diff->Same ) {
60 # merge the nodes
61 my @l = $diff->Items( 1 );
62 my @v = $diff->Items( 2 );
63 foreach my $i ( 0 .. $#l ) {
64 if( !$merged{$l[$i]->name} ) {
65 print STDERR sprintf( "Merging %s into %s\n",
66 $v[$i]->name,
67 $l[$i]->name );
68 $collation->merge_readings( $l[$i], $v[$i] );
69 $merged{$l[$i]->name} = 1;
70 $substitutions->{$v[$i]->name} = $l[$i];
71 } else {
72 print STDERR "Would have double merged " . $l[$i]->name . "\n";
73 }
74 }
75 # splice the lemma nodes into the variant set
76 my( $offset ) = $diff->Get( 'min2' );
77 splice( @$variant_set, $offset, scalar( @l ), @l );
78 push( @new_unique, @l );
79 } else {
80 # Keep the old unique readings
81 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
82 # Add the new readings to the 'unique' list
83 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
84 }
85 }
86 @unique = @new_unique;
87 }
88 return $substitutions;
89}
90
91sub collate_nonlinearly {
92 my( $collation, $lemma_set, @variant_sets ) = @_;
93
94 my @unique;
95 my $substitutions = {};
96 push( @unique, @$lemma_set );
97 while( @variant_sets ) {
98 my $variant_set = shift @variant_sets;
99 # Simply match the first reading that carries the same word, so
100 # long as that reading has not yet been used to match another
101 # word in this variant. That way lies loopy madness.
102 my @distinct;
103 my %merged;
104 foreach my $idx ( 0 .. $#{$variant_set} ) {
105 my $vw = $variant_set->[$idx];
106 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
107 my $matched;
108 if( @same ) {
109 foreach my $i ( 0 .. $#same ) {
110 unless( $merged{$same[$i]->name} ) {
111 #print STDERR sprintf( "Merging %s into %s\n",
112 # $vw->name,
113 # $same[$i]->name );
114 $collation->merge_readings( $same[$i], $vw );
115 $merged{$same[$i]->name} = 1;
116 $matched = $i;
117 $variant_set->[$idx] = $same[$i];
118 $substitutions->{$vw->name} = $same[$i];
119 }
120 }
121 }
122 unless( @same && defined($matched) ) {
123 push( @distinct, $vw );
124 }
125 }
126 push( @unique, @distinct );
127 }
128 return $substitutions;
129}
130
131sub _collation_hash {
132 my $node = shift;
133 return cmp_str( $node );
134}
135
910a0a6d 136sub cmp_str {
137 my( $reading ) = @_;
138 my $word = $reading->label();
139 $word = lc( $word );
140 $word =~ s/\W//g;
141 $word =~ s/v/u/g;
142 $word =~ s/j/i/g;
143 $word =~ s/cha/ca/g;
144 $word =~ s/quatuor/quattuor/g;
145 $word =~ s/ioannes/iohannes/g;
146 return $word;
147}
148
25331c49 149=head2 B<check_for_repeated>( @readings )
910a0a6d 150
151Given an array of items, returns any items that appear in the array more
152than once.
153
154=cut
155
156sub check_for_repeated {
157 my @seq = @_;
158 my %unique;
159 my @repeated;
160 foreach ( @seq ) {
161 if( exists $unique{$_->name} ) {
162 push( @repeated, $_->name );
163 } else {
164 $unique{$_->name} = 1;
165 }
166 }
167 return @repeated;
168}
169
170sub add_hash_entry {
171 my( $hash, $key, $entry ) = @_;
172 if( exists $hash->{$key} ) {
173 push( @{$hash->{$key}}, $entry );
174 } else {
175 $hash->{$key} = [ $entry ];
176 }
177}
178
179sub is_monotonic {
180 my( @readings ) = @_;
181 my( $common, $min, $max ) = ( -1, -1, -1 );
182 foreach my $rdg ( @readings ) {
183# print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - "
184# . $rdg->position->reference ."\n";
185 return 0 if $rdg->position->common < $common;
186 if( $rdg->position->common == $common ) {
187 return 0 if $rdg->position->min <= $min;
188 return 0 if $rdg->position->max <= $max;
189 }
190 $common = $rdg->position->common;
191 $min = $rdg->position->min;
192 $max = $rdg->position->max;
193 }
194 return 1;
25331c49 195}
196
1971;
198
199=head1 BUGS / TODO
200
201=over
202
203=item * Get rid of abomination that is cmp_str.
204
205=back
206
207=head1 LICENSE
208
209This package is free software and is provided "as is" without express
210or implied warranty. You can redistribute it and/or modify it under
211the same terms as Perl itself.
212
213=head1 AUTHOR
214
215Tara L Andrews E<lt>aurum@cpan.orgE<gt>