Commit | Line | Data |
910a0a6d |
1 | package Text::Tradition::Parser::Util; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Algorithm::Diff; |
6 | use Exporter 'import'; |
7 | use vars qw/ @EXPORT_OK /; |
8 | @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /; |
9 | |
10 | =item B<collate_variants> |
11 | |
12 | collate_variants( $collation, @reading_ranges ) |
13 | |
14 | Given a set of readings in the form |
15 | ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) |
16 | walks through each to identify those readings that are identical. The |
17 | collation is a Text::Tradition::Collation object; the elements of |
18 | @readings are Text::Tradition::Collation::Reading objects that appear |
19 | on the collation graph. |
20 | |
21 | TODO: Handle collapsed and non-collapsed transpositions. |
22 | |
23 | =cut |
24 | |
25 | sub collate_variants { |
26 | my( $collation, @reading_sets ) = @_; |
27 | |
28 | # Two different ways to do this, depending on whether we want |
29 | # transposed reading nodes to be merged into one (producing a |
30 | # nonlinear, bidirectional graph) or not (producing a relatively |
31 | # linear, unidirectional graph.) |
32 | return $collation->linear ? collate_linearly( @_ ) |
33 | : collate_nonlinearly( @_ ); |
34 | } |
35 | |
36 | sub collate_linearly { |
37 | my( $collation, $lemma_set, @variant_sets ) = @_; |
38 | |
39 | my @unique; |
40 | my $substitutions = {}; |
41 | push( @unique, @$lemma_set ); |
42 | while( @variant_sets ) { |
43 | my $variant_set = shift @variant_sets; |
44 | # Use diff to do this job |
45 | my $diff = Algorithm::Diff->new( \@unique, $variant_set, |
46 | {'keyGen' => \&_collation_hash} ); |
47 | my @new_unique; |
48 | my %merged; |
49 | while( $diff->Next ) { |
50 | if( $diff->Same ) { |
51 | # merge the nodes |
52 | my @l = $diff->Items( 1 ); |
53 | my @v = $diff->Items( 2 ); |
54 | foreach my $i ( 0 .. $#l ) { |
55 | if( !$merged{$l[$i]->name} ) { |
56 | print STDERR sprintf( "Merging %s into %s\n", |
57 | $v[$i]->name, |
58 | $l[$i]->name ); |
59 | $collation->merge_readings( $l[$i], $v[$i] ); |
60 | $merged{$l[$i]->name} = 1; |
61 | $substitutions->{$v[$i]->name} = $l[$i]; |
62 | } else { |
63 | print STDERR "Would have double merged " . $l[$i]->name . "\n"; |
64 | } |
65 | } |
66 | # splice the lemma nodes into the variant set |
67 | my( $offset ) = $diff->Get( 'min2' ); |
68 | splice( @$variant_set, $offset, scalar( @l ), @l ); |
69 | push( @new_unique, @l ); |
70 | } else { |
71 | # Keep the old unique readings |
72 | push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 ); |
73 | # Add the new readings to the 'unique' list |
74 | push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 ); |
75 | } |
76 | } |
77 | @unique = @new_unique; |
78 | } |
79 | return $substitutions; |
80 | } |
81 | |
82 | sub collate_nonlinearly { |
83 | my( $collation, $lemma_set, @variant_sets ) = @_; |
84 | |
85 | my @unique; |
86 | my $substitutions = {}; |
87 | push( @unique, @$lemma_set ); |
88 | while( @variant_sets ) { |
89 | my $variant_set = shift @variant_sets; |
90 | # Simply match the first reading that carries the same word, so |
91 | # long as that reading has not yet been used to match another |
92 | # word in this variant. That way lies loopy madness. |
93 | my @distinct; |
94 | my %merged; |
95 | foreach my $idx ( 0 .. $#{$variant_set} ) { |
96 | my $vw = $variant_set->[$idx]; |
97 | my @same = grep { cmp_str( $_ ) eq $vw->label } @unique; |
98 | my $matched; |
99 | if( @same ) { |
100 | foreach my $i ( 0 .. $#same ) { |
101 | unless( $merged{$same[$i]->name} ) { |
102 | #print STDERR sprintf( "Merging %s into %s\n", |
103 | # $vw->name, |
104 | # $same[$i]->name ); |
105 | $collation->merge_readings( $same[$i], $vw ); |
106 | $merged{$same[$i]->name} = 1; |
107 | $matched = $i; |
108 | $variant_set->[$idx] = $same[$i]; |
109 | $substitutions->{$vw->name} = $same[$i]; |
110 | } |
111 | } |
112 | } |
113 | unless( @same && defined($matched) ) { |
114 | push( @distinct, $vw ); |
115 | } |
116 | } |
117 | push( @unique, @distinct ); |
118 | } |
119 | return $substitutions; |
120 | } |
121 | |
122 | sub _collation_hash { |
123 | my $node = shift; |
124 | return cmp_str( $node ); |
125 | } |
126 | |
127 | =item B<cmp_str> |
128 | |
129 | Pretend you never saw this method. Really it needs to not be hardcoded. |
130 | |
131 | =cut |
132 | |
133 | sub cmp_str { |
134 | my( $reading ) = @_; |
135 | my $word = $reading->label(); |
136 | $word = lc( $word ); |
137 | $word =~ s/\W//g; |
138 | $word =~ s/v/u/g; |
139 | $word =~ s/j/i/g; |
140 | $word =~ s/cha/ca/g; |
141 | $word =~ s/quatuor/quattuor/g; |
142 | $word =~ s/ioannes/iohannes/g; |
143 | return $word; |
144 | } |
145 | |
146 | =item B<collate_variants> |
147 | |
148 | my @rep = check_for_repeated( @readings ) |
149 | |
150 | Given an array of items, returns any items that appear in the array more |
151 | than once. |
152 | |
153 | =cut |
154 | |
155 | sub check_for_repeated { |
156 | my @seq = @_; |
157 | my %unique; |
158 | my @repeated; |
159 | foreach ( @seq ) { |
160 | if( exists $unique{$_->name} ) { |
161 | push( @repeated, $_->name ); |
162 | } else { |
163 | $unique{$_->name} = 1; |
164 | } |
165 | } |
166 | return @repeated; |
167 | } |
168 | |
169 | sub add_hash_entry { |
170 | my( $hash, $key, $entry ) = @_; |
171 | if( exists $hash->{$key} ) { |
172 | push( @{$hash->{$key}}, $entry ); |
173 | } else { |
174 | $hash->{$key} = [ $entry ]; |
175 | } |
176 | } |
177 | |
178 | sub is_monotonic { |
179 | my( @readings ) = @_; |
180 | my( $common, $min, $max ) = ( -1, -1, -1 ); |
181 | foreach my $rdg ( @readings ) { |
182 | # print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - " |
183 | # . $rdg->position->reference ."\n"; |
184 | return 0 if $rdg->position->common < $common; |
185 | if( $rdg->position->common == $common ) { |
186 | return 0 if $rdg->position->min <= $min; |
187 | return 0 if $rdg->position->max <= $max; |
188 | } |
189 | $common = $rdg->position->common; |
190 | $min = $rdg->position->min; |
191 | $max = $rdg->position->max; |
192 | } |
193 | return 1; |
194 | } |