lib/Text/Tradition/Parser/Util.pm

   1 package Text::Tradition::Parser::Util;
   2
   3 use strict;
   4 use warnings;
   5 use Algorithm::Diff;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
   9
  10 =item B<collate_variants>
  11
  12 collate_variants( $collation, @reading_ranges )
  13
  14 Given a set of readings in the form
  15 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
  16 walks through each to identify those readings that are identical.  The
  17 collation is a Text::Tradition::Collation object; the elements of
  18 @readings are Text::Tradition::Collation::Reading objects that appear
  19 on the collation graph.
  20
  21 TODO: Handle collapsed and non-collapsed transpositions.
  22
  23 =cut
  24
  25 sub collate_variants {
  26     my( $collation, @reading_sets ) = @_;
  27
  28     # Two different ways to do this, depending on whether we want
  29     # transposed reading nodes to be merged into one (producing a
  30     # nonlinear, bidirectional graph) or not (producing a relatively
  31     # linear, unidirectional graph.)
  32     return $collation->linear ? collate_linearly( @_ )
  33         : collate_nonlinearly( @_ );
  34 }
  35
  36 sub collate_linearly {
  37     my( $collation, $lemma_set, @variant_sets ) = @_;
  38
  39     my @unique;
  40     my $substitutions = {};
  41     push( @unique, @$lemma_set );
  42     while( @variant_sets ) {
  43         my $variant_set = shift @variant_sets;
  44         # Use diff to do this job
  45         my $diff = Algorithm::Diff->new( \@unique, $variant_set,
  46                                          {'keyGen' => \&_collation_hash} );
  47         my @new_unique;
  48         my %merged;
  49         while( $diff->Next ) {
  50             if( $diff->Same ) {
  51                 # merge the nodes
  52                 my @l = $diff->Items( 1 );
  53                 my @v = $diff->Items( 2 );
  54                 foreach my $i ( 0 .. $#l ) {
  55                     if( !$merged{$l[$i]->name} ) {
  56                         print STDERR sprintf( "Merging %s into %s\n",
  57                                              $v[$i]->name,
  58                                              $l[$i]->name );
  59                         $collation->merge_readings( $l[$i], $v[$i] );
  60                         $merged{$l[$i]->name} = 1;
  61                         $substitutions->{$v[$i]->name} = $l[$i];
  62                     } else {
  63                         print STDERR "Would have double merged " . $l[$i]->name . "\n";
  64                     }
  65                 }
  66                 # splice the lemma nodes into the variant set
  67                 my( $offset ) = $diff->Get( 'min2' );
  68                 splice( @$variant_set, $offset, scalar( @l ), @l );
  69                 push( @new_unique, @l );
  70             } else {
  71                 # Keep the old unique readings
  72                 push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
  73                 # Add the new readings to the 'unique' list
  74                 push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
  75             }
  76         }
  77         @unique = @new_unique;
  78     }
  79     return $substitutions;
  80 }
  81
  82 sub collate_nonlinearly {
  83     my( $collation, $lemma_set, @variant_sets ) = @_;
  84
  85     my @unique;
  86     my $substitutions = {};
  87     push( @unique, @$lemma_set );
  88     while( @variant_sets ) {
  89         my $variant_set = shift @variant_sets;
  90         # Simply match the first reading that carries the same word, so
  91         # long as that reading has not yet been used to match another
  92         # word in this variant. That way lies loopy madness.
  93         my @distinct;
  94         my %merged;
  95         foreach my $idx ( 0 .. $#{$variant_set} ) {
  96             my $vw = $variant_set->[$idx];
  97             my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
  98             my $matched;
  99             if( @same ) {
 100                 foreach my $i ( 0 .. $#same ) {
 101                     unless( $merged{$same[$i]->name} ) {
 102                         #print STDERR sprintf( "Merging %s into %s\n",
 103                         #                     $vw->name,
 104                         #                     $same[$i]->name );
 105                         $collation->merge_readings( $same[$i], $vw );
 106                         $merged{$same[$i]->name} = 1;
 107                         $matched = $i;
 108                         $variant_set->[$idx] = $same[$i];
 109                         $substitutions->{$vw->name} = $same[$i];
 110                     }
 111                 }
 112             }
 113             unless( @same && defined($matched) ) {
 114                 push( @distinct, $vw );
 115             }
 116         }
 117         push( @unique, @distinct );
 118     }
 119     return $substitutions;
 120 }
 121
 122 sub _collation_hash {
 123     my $node = shift;
 124     return cmp_str( $node );
 125 }
 126
 127 =item B<cmp_str>
 128
 129 Pretend you never saw this method.  Really it needs to not be hardcoded.
 130
 131 =cut
 132
 133 sub cmp_str {
 134     my( $reading ) = @_;
 135     my $word = $reading->label();
 136     $word = lc( $word );
 137     $word =~ s/\W//g;
 138     $word =~ s/v/u/g;
 139     $word =~ s/j/i/g;
 140     $word =~ s/cha/ca/g;
 141     $word =~ s/quatuor/quattuor/g;
 142     $word =~ s/ioannes/iohannes/g;
 143     return $word;
 144 }
 145
 146 =item B<collate_variants>
 147
 148 my @rep = check_for_repeated( @readings )
 149
 150 Given an array of items, returns any items that appear in the array more
 151 than once.
 152
 153 =cut
 154
 155 sub check_for_repeated {
 156     my @seq = @_;
 157     my %unique;
 158     my @repeated;
 159     foreach ( @seq ) {
 160         if( exists $unique{$_->name} ) {
 161             push( @repeated, $_->name );
 162         } else {
 163             $unique{$_->name} = 1;
 164         }
 165     }
 166     return @repeated;
 167 }
 168
 169 sub add_hash_entry {
 170     my( $hash, $key, $entry ) = @_;
 171     if( exists $hash->{$key} ) {
 172         push( @{$hash->{$key}}, $entry );
 173     } else {
 174         $hash->{$key} = [ $entry ];
 175     }
 176 }
 177
 178 sub is_monotonic {
 179     my( @readings ) = @_;
 180     my( $common, $min, $max ) = ( -1, -1, -1 );
 181     foreach my $rdg ( @readings ) {
 182 #         print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - "
 183 #         . $rdg->position->reference ."\n";
 184         return 0 if $rdg->position->common < $common;
 185         if( $rdg->position->common == $common ) {
 186             return 0 if $rdg->position->min <= $min;
 187             return 0 if $rdg->position->max <= $max;
 188         }
 189         $common = $rdg->position->common;
 190         $min = $rdg->position->min;
 191         $max = $rdg->position->max;
 192     }
 193     return 1;
 194 }