lib/Text/Tradition/Parser/Util.pm

   1 package Text::Tradition::Parser::Util;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::Util
  13
  14 =head1 DESCRIPTION
  15
  16 A collection of utilities used by multiple Text::Tradition parsers.
  17 Probably not of external interest.
  18
  19 =head1 METHODS
  20
  21 =head2 B<collate_variants>( $collation, @reading_ranges )
  22
  23 Given a set of readings in the form
  24 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
  25 walks through each to identify those readings that are identical.  The
  26 collation is a Text::Tradition::Collation object; the elements of
  27 @readings are Text::Tradition::Collation::Reading objects that appear
  28 on the collation graph.
  29
  30 =cut
  31
  32 sub collate_variants {
  33     my( $collation, @reading_sets ) = @_;
  34
  35     # Make sure the reading sets are unique, but keep
  36     # the lemma first.
  37     my $lemma = shift @reading_sets;
  38     my %unique_sets;
  39     map { $unique_sets{$_} = $_ } @reading_sets;
  40         delete $unique_sets{$lemma};
  41         my @sets = values %unique_sets;
  42         unshift( @sets, $lemma );
  43
  44     # Two different ways to do this, depending on whether we want
  45     # transposed reading nodes to be merged into one (producing a
  46     # nonlinear, bidirectional graph) or not (producing a relatively
  47     # linear, unidirectional graph.)
  48     return $collation->linear ? _collate_linearly( $collation, @sets )
  49         : _collate_nonlinearly( $collation, @sets );
  50 }
  51
  52 sub _collate_linearly {
  53     my( $collation, $lemma_set, @variant_sets ) = @_;
  54
  55     my @unique;
  56     my $substitutions = {};
  57     push( @unique, @$lemma_set );
  58     while( @variant_sets ) {
  59         my $variant_set = shift @variant_sets;
  60         # Use diff to do this job
  61         my $diff = Algorithm::Diff->new( \@unique, $variant_set,
  62                                          {'keyGen' => \&_collation_hash} );
  63         my @new_unique;
  64         my %merged;
  65         while( $diff->Next ) {
  66             if( $diff->Same ) {
  67                 # merge the nodes
  68                 my @l = $diff->Items( 1 );
  69                 my @v = $diff->Items( 2 );
  70                 foreach my $i ( 0 .. $#l ) {
  71                     if( !$merged{$l[$i]->id} ) {
  72                         next if $v[$i] eq $l[$i];
  73 #                         print STDERR sprintf( "Merging %s into %s\n",
  74 #                                              $v[$i]->id,
  75 #                                              $l[$i]->id );
  76                         $collation->merge_readings( $l[$i], $v[$i] );
  77                         $merged{$l[$i]->id} = 1;
  78                         $substitutions->{$v[$i]->id} = $l[$i];
  79                     } else {
  80                         print STDERR "Would have double merged " . $l[$i]->id . "\n";
  81                     }
  82                 }
  83                 # splice the lemma nodes into the variant set
  84                 my( $offset ) = $diff->Get( 'min2' );
  85                 splice( @$variant_set, $offset, scalar( @l ), @l );
  86                 push( @new_unique, @l );
  87             } else {
  88                 # Keep the old unique readings
  89                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
  90                 # Add the new readings to the 'unique' list
  91                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
  92             }
  93         }
  94         @unique = @new_unique;
  95     }
  96     return $substitutions;
  97 }
  98
  99 sub _collate_nonlinearly {
 100     my( $collation, $lemma_set, @variant_sets ) = @_;
 101
 102     my @unique;
 103     my $substitutions = {};
 104     push( @unique, @$lemma_set );
 105     while( @variant_sets ) {
 106         my $variant_set = shift @variant_sets;
 107         # Simply match the first reading that carries the same word, so
 108         # long as that reading has not yet been used to match another
 109         # word in this variant. That way lies loopy madness.
 110         my @distinct;
 111         my %merged;
 112         foreach my $idx ( 0 .. $#{$variant_set} ) {
 113             my $vw = $variant_set->[$idx];
 114             my @same = grep { cmp_str( $_ ) eq $vw->text } @unique;
 115             my $matched;
 116             if( @same ) {
 117                 foreach my $i ( 0 .. $#same ) {
 118                     unless( $merged{$same[$i]->id} ) {
 119                         #print STDERR sprintf( "Merging %s into %s\n",
 120                         #                     $vw->id,
 121                         #                     $same[$i]->id );
 122                         $collation->merge_readings( $same[$i], $vw );
 123                         $merged{$same[$i]->id} = 1;
 124                         $matched = $i;
 125                         $variant_set->[$idx] = $same[$i];
 126                         $substitutions->{$vw->id} = $same[$i];
 127                     }
 128                 }
 129             }
 130             unless( @same && defined($matched) ) {
 131                 push( @distinct, $vw );
 132             }
 133         }
 134         push( @unique, @distinct );
 135     }
 136     return $substitutions;
 137 }
 138
 139 sub _collation_hash {
 140     my $node = shift;
 141     return cmp_str( $node );
 142 }
 143
 144 =head2 B<cmp_str>
 145
 146 Don't use this. Really.
 147
 148 =cut
 149
 150 sub cmp_str {
 151     my( $reading ) = @_;
 152     my $word = $reading->text();
 153     return $word unless $reading->collation->tradition->name =~ /158/;
 154     $word = lc( $word );
 155     $word =~ s/\W//g;
 156     $word =~ s/v/u/g;
 157     $word =~ s/j/i/g;
 158     $word =~ s/cha/ca/g;
 159     $word =~ s/quatuor/quattuor/g;
 160     $word =~ s/ioannes/iohannes/g;
 161     return $word;
 162 }
 163
 164 =head2 B<check_for_repeated>( @readings )
 165
 166 Given an array of items, returns any items that appear in the array more
 167 than once.
 168
 169 =cut
 170
 171 sub check_for_repeated {
 172     my @seq = @_;
 173     my %unique;
 174     my @repeated;
 175     foreach ( @seq ) {
 176         if( exists $unique{$_->id} ) {
 177             push( @repeated, $_->id );
 178         } else {
 179             $unique{$_->id} = 1;
 180         }
 181     }
 182     return @repeated;
 183 }
 184
 185 =head2 B<add_hash_entry>( $hash, $key, $entry )
 186
 187 Very simple utility for adding $entry to the list at $hash->{$key}.
 188
 189 =cut
 190
 191 sub add_hash_entry {
 192     my( $hash, $key, $entry ) = @_;
 193     if( exists $hash->{$key} ) {
 194         push( @{$hash->{$key}}, $entry );
 195     } else {
 196         $hash->{$key} = [ $entry ];
 197     }
 198 }
 199
 200 1;
 201
 202 =head1 BUGS / TODO
 203
 204 =over
 205
 206 =item * Get rid of abomination that is cmp_str.
 207
 208 =back
 209
 210 =head1 LICENSE
 211
 212 This package is free software and is provided "as is" without express
 213 or implied warranty.  You can redistribute it and/or modify it under
 214 the same terms as Perl itself.
 215
 216 =head1 AUTHOR
 217
 218 Tara L Andrews E<lt>aurum@cpan.orgE<gt>