Merge branch 'master' of github.com:tla/stemmatology
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Util.pm
CommitLineData
910a0a6d 1package Text::Tradition::Parser::Util;
2
3use strict;
4use warnings;
5use Algorithm::Diff;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
9
25331c49 10=head1 NAME
910a0a6d 11
25331c49 12Text::Tradition::Parser::Util
13
14=head1 DESCRIPTION
15
16A collection of utilities used by multiple Text::Tradition parsers.
17Probably not of external interest.
18
19=head1 METHODS
20
21=head2 B<collate_variants>( $collation, @reading_ranges )
910a0a6d 22
23Given a set of readings in the form
24( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
25walks through each to identify those readings that are identical. The
26collation is a Text::Tradition::Collation object; the elements of
27@readings are Text::Tradition::Collation::Reading objects that appear
28on the collation graph.
29
910a0a6d 30=cut
31
32sub collate_variants {
33 my( $collation, @reading_sets ) = @_;
f8b07f32 34
35 # Make sure the reading sets are unique, but keep
36 # the lemma first.
37 my $lemma = shift @reading_sets;
38 my %unique_sets;
39 map { $unique_sets{$_} = $_ } @reading_sets;
40 delete $unique_sets{$lemma};
41 my @sets = values %unique_sets;
42 unshift( @sets, $lemma );
910a0a6d 43
44 # Two different ways to do this, depending on whether we want
45 # transposed reading nodes to be merged into one (producing a
46 # nonlinear, bidirectional graph) or not (producing a relatively
47 # linear, unidirectional graph.)
f8b07f32 48 return $collation->linear ? _collate_linearly( $collation, @sets )
49 : _collate_nonlinearly( $collation, @sets );
910a0a6d 50}
51
027d819c 52sub _collate_linearly {
910a0a6d 53 my( $collation, $lemma_set, @variant_sets ) = @_;
54
55 my @unique;
56 my $substitutions = {};
57 push( @unique, @$lemma_set );
58 while( @variant_sets ) {
59 my $variant_set = shift @variant_sets;
60 # Use diff to do this job
61 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
62 {'keyGen' => \&_collation_hash} );
63 my @new_unique;
64 my %merged;
65 while( $diff->Next ) {
66 if( $diff->Same ) {
67 # merge the nodes
68 my @l = $diff->Items( 1 );
69 my @v = $diff->Items( 2 );
70 foreach my $i ( 0 .. $#l ) {
e4b0f464 71 if( !$merged{$l[$i]->id} ) {
f6e19c7c 72 next if $v[$i] eq $l[$i];
0068967c 73# print STDERR sprintf( "Merging %s into %s\n",
74# $v[$i]->id,
75# $l[$i]->id );
910a0a6d 76 $collation->merge_readings( $l[$i], $v[$i] );
e4b0f464 77 $merged{$l[$i]->id} = 1;
78 $substitutions->{$v[$i]->id} = $l[$i];
910a0a6d 79 } else {
e4b0f464 80 print STDERR "Would have double merged " . $l[$i]->id . "\n";
910a0a6d 81 }
82 }
83 # splice the lemma nodes into the variant set
84 my( $offset ) = $diff->Get( 'min2' );
85 splice( @$variant_set, $offset, scalar( @l ), @l );
86 push( @new_unique, @l );
87 } else {
88 # Keep the old unique readings
89 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
90 # Add the new readings to the 'unique' list
91 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
92 }
93 }
94 @unique = @new_unique;
95 }
96 return $substitutions;
97}
98
027d819c 99sub _collate_nonlinearly {
910a0a6d 100 my( $collation, $lemma_set, @variant_sets ) = @_;
101
102 my @unique;
103 my $substitutions = {};
104 push( @unique, @$lemma_set );
105 while( @variant_sets ) {
106 my $variant_set = shift @variant_sets;
107 # Simply match the first reading that carries the same word, so
108 # long as that reading has not yet been used to match another
109 # word in this variant. That way lies loopy madness.
110 my @distinct;
111 my %merged;
112 foreach my $idx ( 0 .. $#{$variant_set} ) {
113 my $vw = $variant_set->[$idx];
e4b0f464 114 my @same = grep { cmp_str( $_ ) eq $vw->text } @unique;
910a0a6d 115 my $matched;
116 if( @same ) {
117 foreach my $i ( 0 .. $#same ) {
e4b0f464 118 unless( $merged{$same[$i]->id} ) {
910a0a6d 119 #print STDERR sprintf( "Merging %s into %s\n",
e4b0f464 120 # $vw->id,
121 # $same[$i]->id );
910a0a6d 122 $collation->merge_readings( $same[$i], $vw );
e4b0f464 123 $merged{$same[$i]->id} = 1;
910a0a6d 124 $matched = $i;
125 $variant_set->[$idx] = $same[$i];
e4b0f464 126 $substitutions->{$vw->id} = $same[$i];
910a0a6d 127 }
128 }
129 }
130 unless( @same && defined($matched) ) {
131 push( @distinct, $vw );
132 }
133 }
134 push( @unique, @distinct );
135 }
136 return $substitutions;
137}
138
139sub _collation_hash {
140 my $node = shift;
141 return cmp_str( $node );
142}
143
027d819c 144=head2 B<cmp_str>
145
146Don't use this. Really.
147
148=cut
149
910a0a6d 150sub cmp_str {
151 my( $reading ) = @_;
e4b0f464 152 my $word = $reading->text();
44a6e7af 153 return $word unless $reading->collation->tradition->name =~ /158/;
910a0a6d 154 $word = lc( $word );
155 $word =~ s/\W//g;
156 $word =~ s/v/u/g;
157 $word =~ s/j/i/g;
158 $word =~ s/cha/ca/g;
159 $word =~ s/quatuor/quattuor/g;
160 $word =~ s/ioannes/iohannes/g;
161 return $word;
162}
163
25331c49 164=head2 B<check_for_repeated>( @readings )
910a0a6d 165
166Given an array of items, returns any items that appear in the array more
167than once.
168
169=cut
170
171sub check_for_repeated {
172 my @seq = @_;
173 my %unique;
174 my @repeated;
175 foreach ( @seq ) {
e4b0f464 176 if( exists $unique{$_->id} ) {
177 push( @repeated, $_->id );
910a0a6d 178 } else {
e4b0f464 179 $unique{$_->id} = 1;
910a0a6d 180 }
181 }
182 return @repeated;
183}
184
027d819c 185=head2 B<add_hash_entry>( $hash, $key, $entry )
186
187Very simple utility for adding $entry to the list at $hash->{$key}.
188
189=cut
190
910a0a6d 191sub add_hash_entry {
192 my( $hash, $key, $entry ) = @_;
193 if( exists $hash->{$key} ) {
194 push( @{$hash->{$key}}, $entry );
195 } else {
196 $hash->{$key} = [ $entry ];
197 }
198}
199
25331c49 2001;
201
202=head1 BUGS / TODO
203
204=over
205
206=item * Get rid of abomination that is cmp_str.
207
208=back
209
210=head1 LICENSE
211
212This package is free software and is provided "as is" without express
213or implied warranty. You can redistribute it and/or modify it under
214the same terms as Perl itself.
215
216=head1 AUTHOR
217
218Tara L Andrews E<lt>aurum@cpan.orgE<gt>