changed interface for Tradition init
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Util.pm
CommitLineData
910a0a6d 1package Text::Tradition::Parser::Util;
2
3use strict;
4use warnings;
5use Algorithm::Diff;
6use Exporter 'import';
7use vars qw/ @EXPORT_OK /;
8@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
9
10=item B<collate_variants>
11
12collate_variants( $collation, @reading_ranges )
13
14Given a set of readings in the form
15( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
16walks through each to identify those readings that are identical. The
17collation is a Text::Tradition::Collation object; the elements of
18@readings are Text::Tradition::Collation::Reading objects that appear
19on the collation graph.
20
21TODO: Handle collapsed and non-collapsed transpositions.
22
23=cut
24
25sub collate_variants {
26 my( $collation, @reading_sets ) = @_;
27
28 # Two different ways to do this, depending on whether we want
29 # transposed reading nodes to be merged into one (producing a
30 # nonlinear, bidirectional graph) or not (producing a relatively
31 # linear, unidirectional graph.)
32 return $collation->linear ? collate_linearly( @_ )
33 : collate_nonlinearly( @_ );
34}
35
36sub collate_linearly {
37 my( $collation, $lemma_set, @variant_sets ) = @_;
38
39 my @unique;
40 my $substitutions = {};
41 push( @unique, @$lemma_set );
42 while( @variant_sets ) {
43 my $variant_set = shift @variant_sets;
44 # Use diff to do this job
45 my $diff = Algorithm::Diff->new( \@unique, $variant_set,
46 {'keyGen' => \&_collation_hash} );
47 my @new_unique;
48 my %merged;
49 while( $diff->Next ) {
50 if( $diff->Same ) {
51 # merge the nodes
52 my @l = $diff->Items( 1 );
53 my @v = $diff->Items( 2 );
54 foreach my $i ( 0 .. $#l ) {
55 if( !$merged{$l[$i]->name} ) {
56 print STDERR sprintf( "Merging %s into %s\n",
57 $v[$i]->name,
58 $l[$i]->name );
59 $collation->merge_readings( $l[$i], $v[$i] );
60 $merged{$l[$i]->name} = 1;
61 $substitutions->{$v[$i]->name} = $l[$i];
62 } else {
63 print STDERR "Would have double merged " . $l[$i]->name . "\n";
64 }
65 }
66 # splice the lemma nodes into the variant set
67 my( $offset ) = $diff->Get( 'min2' );
68 splice( @$variant_set, $offset, scalar( @l ), @l );
69 push( @new_unique, @l );
70 } else {
71 # Keep the old unique readings
72 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
73 # Add the new readings to the 'unique' list
74 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
75 }
76 }
77 @unique = @new_unique;
78 }
79 return $substitutions;
80}
81
82sub collate_nonlinearly {
83 my( $collation, $lemma_set, @variant_sets ) = @_;
84
85 my @unique;
86 my $substitutions = {};
87 push( @unique, @$lemma_set );
88 while( @variant_sets ) {
89 my $variant_set = shift @variant_sets;
90 # Simply match the first reading that carries the same word, so
91 # long as that reading has not yet been used to match another
92 # word in this variant. That way lies loopy madness.
93 my @distinct;
94 my %merged;
95 foreach my $idx ( 0 .. $#{$variant_set} ) {
96 my $vw = $variant_set->[$idx];
97 my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
98 my $matched;
99 if( @same ) {
100 foreach my $i ( 0 .. $#same ) {
101 unless( $merged{$same[$i]->name} ) {
102 #print STDERR sprintf( "Merging %s into %s\n",
103 # $vw->name,
104 # $same[$i]->name );
105 $collation->merge_readings( $same[$i], $vw );
106 $merged{$same[$i]->name} = 1;
107 $matched = $i;
108 $variant_set->[$idx] = $same[$i];
109 $substitutions->{$vw->name} = $same[$i];
110 }
111 }
112 }
113 unless( @same && defined($matched) ) {
114 push( @distinct, $vw );
115 }
116 }
117 push( @unique, @distinct );
118 }
119 return $substitutions;
120}
121
122sub _collation_hash {
123 my $node = shift;
124 return cmp_str( $node );
125}
126
127=item B<cmp_str>
128
129Pretend you never saw this method. Really it needs to not be hardcoded.
130
131=cut
132
133sub cmp_str {
134 my( $reading ) = @_;
135 my $word = $reading->label();
136 $word = lc( $word );
137 $word =~ s/\W//g;
138 $word =~ s/v/u/g;
139 $word =~ s/j/i/g;
140 $word =~ s/cha/ca/g;
141 $word =~ s/quatuor/quattuor/g;
142 $word =~ s/ioannes/iohannes/g;
143 return $word;
144}
145
146=item B<collate_variants>
147
148my @rep = check_for_repeated( @readings )
149
150Given an array of items, returns any items that appear in the array more
151than once.
152
153=cut
154
155sub check_for_repeated {
156 my @seq = @_;
157 my %unique;
158 my @repeated;
159 foreach ( @seq ) {
160 if( exists $unique{$_->name} ) {
161 push( @repeated, $_->name );
162 } else {
163 $unique{$_->name} = 1;
164 }
165 }
166 return @repeated;
167}
168
169sub add_hash_entry {
170 my( $hash, $key, $entry ) = @_;
171 if( exists $hash->{$key} ) {
172 push( @{$hash->{$key}}, $entry );
173 } else {
174 $hash->{$key} = [ $entry ];
175 }
176}
177
178sub is_monotonic {
179 my( @readings ) = @_;
180 my( $common, $min, $max ) = ( -1, -1, -1 );
181 foreach my $rdg ( @readings ) {
182# print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - "
183# . $rdg->position->reference ."\n";
184 return 0 if $rdg->position->common < $common;
185 if( $rdg->position->common == $common ) {
186 return 0 if $rdg->position->min <= $min;
187 return 0 if $rdg->position->max <= $max;
188 }
189 $common = $rdg->position->common;
190 $min = $rdg->position->min;
191 $max = $rdg->position->max;
192 }
193 return 1;
194}