lib/Text/Tradition/Parser/Util.pm

   1 package Text::Tradition::Parser::Util;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
   9
  10 =head1 NAME
  11
  12 Text::Tradition::Parser::Util
  13
  14 =head1 DESCRIPTION
  15
  16 A collection of utilities used by multiple Text::Tradition parsers.
  17 Probably not of external interest.
  18
  19 =head1 METHODS
  20
  21 =head2 B<collate_variants>( $collation, @reading_ranges )
  22
  23 Given a set of readings in the form
  24 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
  25 walks through each to identify those readings that are identical.  The
  26 collation is a Text::Tradition::Collation object; the elements of
  27 @readings are Text::Tradition::Collation::Reading objects that appear
  28 on the collation graph.
  29
  30 TODO: Handle collapsed and non-collapsed transpositions.
  31
  32 =cut
  33
  34 sub collate_variants {
  35     my( $collation, @reading_sets ) = @_;
  36
  37     # Two different ways to do this, depending on whether we want
  38     # transposed reading nodes to be merged into one (producing a
  39     # nonlinear, bidirectional graph) or not (producing a relatively
  40     # linear, unidirectional graph.)
  41     return $collation->linear ? collate_linearly( @_ )
  42         : collate_nonlinearly( @_ );
  43 }
  44
  45 sub collate_linearly {
  46     my( $collation, $lemma_set, @variant_sets ) = @_;
  47
  48     my @unique;
  49     my $substitutions = {};
  50     push( @unique, @$lemma_set );
  51     while( @variant_sets ) {
  52         my $variant_set = shift @variant_sets;
  53         # Use diff to do this job
  54         my $diff = Algorithm::Diff->new( \@unique, $variant_set,
  55                                          {'keyGen' => \&_collation_hash} );
  56         my @new_unique;
  57         my %merged;
  58         while( $diff->Next ) {
  59             if( $diff->Same ) {
  60                 # merge the nodes
  61                 my @l = $diff->Items( 1 );
  62                 my @v = $diff->Items( 2 );
  63                 foreach my $i ( 0 .. $#l ) {
  64                     if( !$merged{$l[$i]->name} ) {
  65                         print STDERR sprintf( "Merging %s into %s\n",
  66                                              $v[$i]->name,
  67                                              $l[$i]->name );
  68                         $collation->merge_readings( $l[$i], $v[$i] );
  69                         $merged{$l[$i]->name} = 1;
  70                         $substitutions->{$v[$i]->name} = $l[$i];
  71                     } else {
  72                         print STDERR "Would have double merged " . $l[$i]->name . "\n";
  73                     }
  74                 }
  75                 # splice the lemma nodes into the variant set
  76                 my( $offset ) = $diff->Get( 'min2' );
  77                 splice( @$variant_set, $offset, scalar( @l ), @l );
  78                 push( @new_unique, @l );
  79             } else {
  80                 # Keep the old unique readings
  81                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
  82                 # Add the new readings to the 'unique' list
  83                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
  84             }
  85         }
  86         @unique = @new_unique;
  87     }
  88     return $substitutions;
  89 }
  90
  91 sub collate_nonlinearly {
  92     my( $collation, $lemma_set, @variant_sets ) = @_;
  93
  94     my @unique;
  95     my $substitutions = {};
  96     push( @unique, @$lemma_set );
  97     while( @variant_sets ) {
  98         my $variant_set = shift @variant_sets;
  99         # Simply match the first reading that carries the same word, so
 100         # long as that reading has not yet been used to match another
 101         # word in this variant. That way lies loopy madness.
 102         my @distinct;
 103         my %merged;
 104         foreach my $idx ( 0 .. $#{$variant_set} ) {
 105             my $vw = $variant_set->[$idx];
 106             my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
 107             my $matched;
 108             if( @same ) {
 109                 foreach my $i ( 0 .. $#same ) {
 110                     unless( $merged{$same[$i]->name} ) {
 111                         #print STDERR sprintf( "Merging %s into %s\n",
 112                         #                     $vw->name,
 113                         #                     $same[$i]->name );
 114                         $collation->merge_readings( $same[$i], $vw );
 115                         $merged{$same[$i]->name} = 1;
 116                         $matched = $i;
 117                         $variant_set->[$idx] = $same[$i];
 118                         $substitutions->{$vw->name} = $same[$i];
 119                     }
 120                 }
 121             }
 122             unless( @same && defined($matched) ) {
 123                 push( @distinct, $vw );
 124             }
 125         }
 126         push( @unique, @distinct );
 127     }
 128     return $substitutions;
 129 }
 130
 131 sub _collation_hash {
 132     my $node = shift;
 133     return cmp_str( $node );
 134 }
 135
 136 sub cmp_str {
 137     my( $reading ) = @_;
 138     my $word = $reading->label();
 139     $word = lc( $word );
 140     $word =~ s/\W//g;
 141     $word =~ s/v/u/g;
 142     $word =~ s/j/i/g;
 143     $word =~ s/cha/ca/g;
 144     $word =~ s/quatuor/quattuor/g;
 145     $word =~ s/ioannes/iohannes/g;
 146     return $word;
 147 }
 148
 149 =head2 B<check_for_repeated>( @readings )
 150
 151 Given an array of items, returns any items that appear in the array more
 152 than once.
 153
 154 =cut
 155
 156 sub check_for_repeated {
 157     my @seq = @_;
 158     my %unique;
 159     my @repeated;
 160     foreach ( @seq ) {
 161         if( exists $unique{$_->name} ) {
 162             push( @repeated, $_->name );
 163         } else {
 164             $unique{$_->name} = 1;
 165         }
 166     }
 167     return @repeated;
 168 }
 169
 170 sub add_hash_entry {
 171     my( $hash, $key, $entry ) = @_;
 172     if( exists $hash->{$key} ) {
 173         push( @{$hash->{$key}}, $entry );
 174     } else {
 175         $hash->{$key} = [ $entry ];
 176     }
 177 }
 178
 179 sub is_monotonic {
 180     my( @readings ) = @_;
 181     my( $common, $min, $max ) = ( -1, -1, -1 );
 182     foreach my $rdg ( @readings ) {
 183 #         print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - "
 184 #         . $rdg->position->reference ."\n";
 185         return 0 if $rdg->position->common < $common;
 186         if( $rdg->position->common == $common ) {
 187             return 0 if $rdg->position->min <= $min;
 188             return 0 if $rdg->position->max <= $max;
 189         }
 190         $common = $rdg->position->common;
 191         $min = $rdg->position->min;
 192         $max = $rdg->position->max;
 193     }
 194     return 1;
 195 }
 196
 197 1;
 198
 199 =head1 BUGS / TODO
 200
 201 =over
 202
 203 =item * Get rid of abomination that is cmp_str.
 204
 205 =back
 206
 207 =head1 LICENSE
 208
 209 This package is free software and is provided "as is" without express
 210 or implied warranty.  You can redistribute it and/or modify it under
 211 the same terms as Perl itself.
 212
 213 =head1 AUTHOR
 214
 215 Tara L Andrews E<lt>aurum@cpan.orgE<gt>