lib/Text/Tradition/Parser/Util.pm

   1 package Text::Tradition::Parser::Util;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::Util
  13
  14 =head1 DESCRIPTION
  15
  16 A collection of utilities used by multiple Text::Tradition parsers.
  17 Probably not of external interest.
  18
  19 =head1 METHODS
  20
  21 =head2 B<collate_variants>( $collation, @reading_ranges )
  22
  23 Given a set of readings in the form
  24 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
  25 walks through each to identify those readings that are identical.  The
  26 collation is a Text::Tradition::Collation object; the elements of
  27 @readings are Text::Tradition::Collation::Reading objects that appear
  28 on the collation graph.
  29
  30 =cut
  31
  32 sub collate_variants {
  33     my( $collation, @reading_sets ) = @_;
  34
  35     # Make sure the reading sets are unique, but retain their ordering.
  36     my %unique_sets;
  37     my @sets;
  38     foreach( @reading_sets ) {
  39         push( @sets, $_ ) unless $unique_sets{$_};
  40         $unique_sets{$_} = $_;
  41     }
  42
  43     # Two different ways to do this, depending on whether we want
  44     # transposed reading nodes to be merged into one (producing a
  45     # nonlinear, bidirectional graph) or not (producing a relatively
  46     # linear, unidirectional graph.)
  47     return $collation->linear ? _collate_linearly( $collation, @sets )
  48         : _collate_nonlinearly( $collation, @sets );
  49 }
  50
  51 sub _collate_linearly {
  52     my( $collation, $lemma_set, @variant_sets ) = @_;
  53
  54     my @unique;
  55     my $substitutions = {};
  56     push( @unique, @$lemma_set );
  57     while( @variant_sets ) {
  58         my $variant_set = shift @variant_sets;
  59         # Use diff to do this job
  60         my $diff = Algorithm::Diff->new( \@unique, $variant_set,
  61                                          {'keyGen' => \&_collation_hash} );
  62         my @new_unique;
  63         my %merged;
  64         while( $diff->Next ) {
  65             if( $diff->Same ) {
  66                 # merge the nodes
  67                 my @l = $diff->Items( 1 );
  68                 my @v = $diff->Items( 2 );
  69                 foreach my $i ( 0 .. $#l ) {
  70                     if( !$merged{$l[$i]->id} ) {
  71                         next if $v[$i] eq $l[$i];
  72 #                         print STDERR sprintf( "Merging %s into %s\n",
  73 #                                              $v[$i]->id,
  74 #                                              $l[$i]->id );
  75                         $collation->merge_readings( $l[$i], $v[$i] );
  76                         $merged{$l[$i]->id} = 1;
  77                         $substitutions->{$v[$i]->id} = $l[$i];
  78                     } else {
  79                         print STDERR "Would have double merged " . $l[$i]->id . "\n";
  80                     }
  81                 }
  82                 # splice the lemma nodes into the variant set
  83                 my( $offset ) = $diff->Get( 'min2' );
  84                 splice( @$variant_set, $offset, scalar( @l ), @l );
  85                 push( @new_unique, @l );
  86             } else {
  87                 # Keep the old unique readings
  88                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
  89                 # Add the new readings to the 'unique' list
  90                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
  91             }
  92         }
  93         @unique = @new_unique;
  94     }
  95     return $substitutions;
  96 }
  97
  98 sub _collate_nonlinearly {
  99     my( $collation, $lemma_set, @variant_sets ) = @_;
 100
 101     my @unique;
 102     my $substitutions = {};
 103     push( @unique, @$lemma_set );
 104     while( @variant_sets ) {
 105         my $variant_set = shift @variant_sets;
 106         # Simply match the first reading that carries the same word, so
 107         # long as that reading has not yet been used to match another
 108         # word in this variant. That way lies loopy madness.
 109         my @distinct;
 110         my %merged;
 111         foreach my $idx ( 0 .. $#{$variant_set} ) {
 112             my $vw = $variant_set->[$idx];
 113             my @same = grep { cmp_str( $_ ) eq $vw->text } @unique;
 114             my $matched;
 115             if( @same ) {
 116                 foreach my $i ( 0 .. $#same ) {
 117                     unless( $merged{$same[$i]->id} ) {
 118                         #print STDERR sprintf( "Merging %s into %s\n",
 119                         #                     $vw->id,
 120                         #                     $same[$i]->id );
 121                         $collation->merge_readings( $same[$i], $vw );
 122                         $merged{$same[$i]->id} = 1;
 123                         $matched = $i;
 124                         $variant_set->[$idx] = $same[$i];
 125                         $substitutions->{$vw->id} = $same[$i];
 126                     }
 127                 }
 128             }
 129             unless( @same && defined($matched) ) {
 130                 push( @distinct, $vw );
 131             }
 132         }
 133         push( @unique, @distinct );
 134     }
 135     return $substitutions;
 136 }
 137
 138 sub _collation_hash {
 139     my $node = shift;
 140     return cmp_str( $node );
 141 }
 142
 143 =head2 B<cmp_str>
 144
 145 Don't use this. Really.
 146
 147 =cut
 148
 149 sub cmp_str {
 150     my( $reading ) = @_;
 151     my $word = $reading->text();
 152     return $word unless $reading->collation->tradition->name =~ /158/;
 153     $word = lc( $word );
 154     $word =~ s/\W//g;
 155     $word =~ s/v/u/g;
 156     $word =~ s/j/i/g;
 157     $word =~ s/cha/ca/g;
 158     $word =~ s/quatuor/quattuor/g;
 159     $word =~ s/ioannes/iohannes/g;
 160     return $word;
 161 }
 162
 163 =head2 B<check_for_repeated>( @readings )
 164
 165 Given an array of items, returns any items that appear in the array more
 166 than once.
 167
 168 =cut
 169
 170 sub check_for_repeated {
 171     my @seq = @_;
 172     my %unique;
 173     my @repeated;
 174     foreach ( @seq ) {
 175         if( exists $unique{$_->id} ) {
 176             push( @repeated, $_->id );
 177         } else {
 178             $unique{$_->id} = 1;
 179         }
 180     }
 181     return @repeated;
 182 }
 183
 184 =head2 B<add_hash_entry>( $hash, $key, $entry )
 185
 186 Very simple utility for adding $entry to the list at $hash->{$key}.
 187
 188 =cut
 189
 190 sub add_hash_entry {
 191     my( $hash, $key, $entry ) = @_;
 192     if( exists $hash->{$key} ) {
 193         push( @{$hash->{$key}}, $entry );
 194     } else {
 195         $hash->{$key} = [ $entry ];
 196     }
 197 }
 198
 199 1;
 200
 201 =head1 BUGS / TODO
 202
 203 =over
 204
 205 =item * Get rid of abomination that is cmp_str.
 206
 207 =back
 208
 209 =head1 LICENSE
 210
 211 This package is free software and is provided "as is" without express
 212 or implied warranty.  You can redistribute it and/or modify it under
 213 the same terms as Perl itself.
 214
 215 =head1 AUTHOR
 216
 217 Tara L Andrews E<lt>aurum@cpan.orgE<gt>