new position logic for the lemmatizer and toggler; still need non-linear positions

[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm

index ae5235c..eedaed9 100644 (file)
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -40,14 +40,12 @@ Takes an initialized graph and a set of options, which must include:
 
 =cut
 
-my $DETRANSPOSE = 0;
 sub parse {
     my( $tradition, %opts ) = @_;
 
     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
     load( $format_mod );
     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
-    $DETRANSPOSE = 1 if $opts{'linear'};
     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
 }
 
@@ -78,7 +76,7 @@ underscore in its name.
 
 =cut
 
-my $SHORT = 25;  # Debug var - set this to limit the number of lines parsed
+my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
 
 my %base_text_index;
 my $edits_required = {};
@@ -94,7 +92,7 @@ sub merge_base {
     foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
-       last if $SHORT && $line > $SHORT;
+       last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
        my $scrutinize = '';
        my $first_line_reading = $base_line_starts[ $line ];
@@ -178,6 +176,9 @@ sub merge_base {
        my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
            my @mss = grep { $app->{$_} eq $k } keys( %$app );
+
+           # Keep track of lemma nodes that don't actually appear in
+           # any MSS; we will want to remove them from the collation.
            push( @unwitnessed_lemma_nodes, @lemma_set )
                if !@mss && $k eq 'rdg_0';
 
@@ -191,11 +192,11 @@ sub merge_base {
            }
            next if $k eq 'rdg_0';
 
+           # Parse the variant into reading tokens.
            # TODO don't hardcode the reading split operation
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            
-           # Make the variant into a set of readings.
            my @variant_readings;
            my $ctr = 0;
            foreach my $vw ( @variant ) {
@@ -213,16 +214,18 @@ sub merge_base {
 
        # Now collate and collapse the identical readings within the
        # collated sets.  Modifies the reading sets that were passed.
-       $DB::single = 1 if "$line.$num" eq '16.2';
        collate_variants( $collation, @reading_sets );
 
+       # Record any stated relationships between the nodes and the lemma.
+       set_relationships( $collation, $app, \@lemma_set, $variant_objects );
+
        # Now create the splice-edit objects that will be used
        # to reconstruct each witness.
 
        foreach my $rkey ( keys %$variant_objects ) {
            # Object is argument list for splice, so:
            # offset, length, replacements
-           my $edit_object = [ $base_text_index{$lemma_start->name},
+           my $edit_object = [ $lemma_start->name,
                                scalar( @lemma_set ),
                                $variant_objects->{$rkey}->{reading} ];
            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
@@ -247,25 +250,32 @@ sub merge_base {
 
     # Now make the witness objects, and create their text sequences
     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
+       print STDERR "Creating witness $w\n";
        my $witness_obj = $collation->tradition->add_witness( sigil => $w );
-       my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
-       my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
+       my $debug; #  = $w eq 'Vb11';
+       my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
+       my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
            if exists( $edits_required->{$w."_post"} );
 
+       my @repeated = _check_for_repeated( @ante_corr_seq );
+       warn "Repeated elements @repeated in $w a.c."
+           if @repeated;
+       @repeated = _check_for_repeated( @post_corr_seq );
+       warn "Repeated elements @repeated in $w p.c."
+           if @repeated;
+
        # Now save these paths in my witness object
        if( @post_corr_seq ) {
            $witness_obj->path( \@post_corr_seq );
-           my @ante_corr = make_witness_uncorrections( \@post_corr_seq,
-                                                       \@ante_corr_seq );
-           $witness_obj->ante_corr( \@ante_corr );
+           $witness_obj->uncorrected_path( \@ante_corr_seq );
        } else {
            $witness_obj->path( \@ante_corr_seq );
        }
     }
 
     # Now remove our 'base text' edges, which is to say, the only
-    # ones we have created so far.  Also remove any nodes that didn't
-    # appear in any witnesses.
+    # ones we have created so far.  Also remove any unwitnessed
+    # lemma nodes (TODO unless we are treating base as witness)
     foreach ( $collation->paths() ) {
        $collation->del_path( $_ );
     }
@@ -273,12 +283,62 @@ sub merge_base {
        $collation->del_reading( $_ );
     }
 
+    ### HACKY HACKY Do some one-off path corrections here.
+    if( $collation->linear ) {
+       my $c = $collation;
+       my $end = $SHORTEND ? $SHORTEND : 155;
+       my $path = $c->tradition->witness('Vb11')->path;
+       if( $end > 16 ) {
+           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
+       }
+       # What else?
+    } else {
+       my $c = $collation;
+       my $end = $SHORTEND ? $SHORTEND : 155;
+       # Vb5:
+       my $path = $c->tradition->witness('Vb5')->path;
+       splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
+       # Vb11: 
+       $path = $c->tradition->witness('Vb11')->path;
+       if( $end > 16 ) {
+           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
+       }
+       # Vb12 a.c.:
+       $path = $c->tradition->witness('Vb12')->uncorrected_path;
+       splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137;
+       # Vb13:
+       $path = $c->tradition->witness('Vb13')->path;
+       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+       # Vb20 a.c.: 
+       $path = $c->tradition->witness('Vb20')->uncorrected_path;
+       splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94;
+       # Vb26: 
+       $path = $c->tradition->witness('Vb26')->path;
+       splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
+    }
+
     # Now walk paths and calculate positions.
     my @common_readings = 
        $collation->make_witness_paths();
     $collation->calculate_positions( @common_readings );
 }
 
+sub _check_for_repeated {
+    my @seq = @_;
+    my %unique;
+    my @repeated;
+    foreach ( @seq ) {
+       if( exists $unique{$_->name} ) {
+           push( @repeated, $_->name );
+       } else {
+           $unique{$_->name} = 1;
+       }
+    }
+    return @repeated;
+}
+
 =item B<read_base>
 
 my @line_beginnings = read_base( 'reference.txt', $collation );
@@ -312,7 +372,7 @@ sub read_base {
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
-       last if $SHORT && $lineref > $SHORT;
+       last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( $readingref );
@@ -360,100 +420,211 @@ TODO: Handle collapsed and non-collapsed transpositions.
 sub collate_variants {
     my( $collation, @reading_sets ) = @_;
 
-    # Merge the nodes across the sets so that there is only one node
-    # for any given reading.  Use diff to identify the 'same' nodes.
+    # Two different ways to do this, depending on whether we want
+    # transposed reading nodes to be merged into one (producing a
+    # nonlinear, bidirectional graph) or not (producing a relatively
+    # linear, unidirectional graph.)
+    return $collation->linear ? collate_linearly( @_ )
+       : collate_nonlinearly( @_ );
+}
 
-    my $lemma_set = shift @reading_sets;
+sub collate_linearly {
+    my( $collation, $lemma_set, @variant_sets ) = @_;
 
     my @unique;
     push( @unique, @$lemma_set );
-
-    while( @reading_sets ) {
-       my $variant_set = shift @reading_sets;
-       if( $DETRANSPOSE ) {
-           # Use diff to do this job
-           my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
-                                            {'keyGen' => \&_collation_hash} );
-           my @new_unique;
-           while( $diff->Next ) {
-               if( $diff->Same ) {
-                   # merge the nodes
-                   my @l = $diff->Items( 1 );
-                   my @v = $diff->Items( 2 );
-                   foreach my $i ( 0 .. $#l ) {
+    while( @variant_sets ) {
+       my $variant_set = shift @variant_sets;
+       # Use diff to do this job
+       my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
+                                        {'keyGen' => \&_collation_hash} );
+       my @new_unique;
+       my %merged;
+       while( $diff->Next ) {
+           if( $diff->Same ) {
+               # merge the nodes
+               my @l = $diff->Items( 1 );
+               my @v = $diff->Items( 2 );
+               foreach my $i ( 0 .. $#l ) {
+                   if( !$merged{$l[$i]->name} ) {
                        $collation->merge_readings( $l[$i], $v[$i] );
+                       $merged{$l[$i]->name} = 1;
+                   } else {
+                       print STDERR "Would have double merged " . $l[$i]->name . "\n";
                    }
-                   # splice the lemma nodes into the variant set
-                   my( $offset ) = $diff->Get( 'min2' );
-                   splice( @$variant_set, $offset, scalar( @l ), @l );
-                   push( @new_unique, @l );
-               } else {
-                   # Keep the old unique readings
-                   push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
-                   # Add the new readings to the 'unique' list
-                   push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
                }
+               # splice the lemma nodes into the variant set
+               my( $offset ) = $diff->Get( 'min2' );
+               splice( @$variant_set, $offset, scalar( @l ), @l );
+               push( @new_unique, @l );
+           } else {
+               # Keep the old unique readings
+               push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+               # Add the new readings to the 'unique' list
+               push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
            }
-           @unique = @new_unique;
-       } else {
-           # It becomes a much simpler job
-           $DB::single = 1;
-           my @distinct;
-           foreach my $idx ( 0 .. $#{$variant_set} ) {
-               my $vw = $variant_set->[$idx];
-               my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
-               if( @same ) {
-                   $collation->merge_readings( $same[0], $vw );
-                   $variant_set->[$idx] = $same[0];
-               } else {
-                   push( @distinct, $vw );
+       }
+       @unique = @new_unique;
+    }
+}
+
+sub collate_nonlinearly {
+    my( $collation, $lemma_set, @variant_sets ) = @_;
+    
+    my @unique;
+    push( @unique, @$lemma_set );
+    while( @variant_sets ) {
+       my $variant_set = shift @variant_sets;
+       # Simply match the first reading that carries the same word, so
+       # long as that reading has not yet been used to match another
+       # word in this variant. That way lies loopy madness.
+       my @distinct;
+       my %merged;
+       foreach my $idx ( 0 .. $#{$variant_set} ) {
+           my $vw = $variant_set->[$idx];
+           my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
+           my $matched;
+           if( @same ) {
+               foreach my $i ( 0 .. $#same ) {
+                   unless( $merged{$same[$i]->name} ) {
+                       #print STDERR sprintf( "Merging %s into %s\n", 
+                       #                     $vw->name,
+                       #                     $same[$i]->name );
+                       $collation->merge_readings( $same[$i], $vw );
+                       $merged{$same[$i]->name} = 1;
+                       $matched = $i;
+                       $variant_set->[$idx] = $same[$i];
+                   }
                }
            }
-           push( @unique, @distinct );
+           unless( @same && defined($matched) ) {
+               push( @distinct, $vw );
+           }
        }
+       push( @unique, @distinct );
     }
-
-    return;
 }
 
+
     
 sub _collation_hash {
     my $node = shift;
     return cmp_str( $node );
 }
 
-sub apply_edits {
-    my( $collation, $edit_sequence ) = @_;
-    my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
-        keys %base_text_index;
-    my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
+sub set_relationships {
+    my( $collation, $app, $lemma, $variants ) = @_;
+    foreach my $rkey ( keys %$variants ) {
+       my $var = $variants->{$rkey}->{'reading'};
+       my $type = $app->{sprintf( "_%s_type", $rkey )};
+       my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
+       my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
+       
+       my %rel_options = ();
+       $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
+       $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
+       
+       if( $type =~ /^(inv|tr|rep)$/i ) {
+           # Transposition or repetition: look for nodes with the
+           # same label but different IDs and mark them.
+           $type = 'repetition' if $type =~ /^rep/i;
+           $rel_options{'type'} = $type;
+           my %labels;
+           foreach my $r ( @$lemma ) {
+               $labels{cmp_str( $r )} = $r;
+           }
+           foreach my $r( @$var ) {
+               if( exists $labels{$r->label} &&
+                   $r->name ne $labels{$r->label}->name ) {
+                   if( $type eq 'repetition' ) {
+                       # Repetition
+                       $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+                   } else {
+                       # Transposition
+                       $r->set_identical( $labels{$r->label} );
+                   }
+               }
+           }
+       } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) {
+
+           # Grammar/spelling/lexical: this can be a one-to-one or
+           # one-to-many mapping.  We should think about merging
+           # readings if it is one-to-many.
+
+           $type = 'grammatical' if $type =~ /gr/i;
+           $type = 'spelling' if $type =~ /sp/i;
+           $type = 'repetition' if $type =~ /rep/i;
+           $type = 'lexical' if $type =~ /lex/i;
+           $rel_options{'type'} = $type;
+           if( @$lemma == @$var ) {
+               foreach my $i ( 0 .. $#{$lemma} ) {
+                   $collation->add_relationship( $var->[$i], $lemma->[$i],
+                       \%rel_options );
+               } 
+           } else {
+               # An uneven many-to-many mapping.  Make a segment out of
+               # whatever we have.
+               my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
+               my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
+               $collation->add_relationship( $varseg, $lemseg, \%rel_options );
+           }
+       } elsif( $type !~ /^(add|om)$/i ) {
+           warn "Unrecognized type $type";
+       }
+    }
+}
+       
 
+
+sub apply_edits {
+    my( $collation, $edit_sequence, $debug ) = @_;
+    my @lemma_text = $collation->reading_sequence( $collation->start,
+                                          $collation->reading( '#END#' ) );
     my $drift = 0;
     foreach my $correction ( @$edit_sequence ) {
-       my( $offset, $length, $items ) = @$correction;
+       my( $lemma_start, $length, $items ) = @$correction;
+       my $offset = $base_text_index{$lemma_start};
        my $realoffset = $offset + $drift;
+       if( $debug ||
+           $lemma_text[$realoffset]->name ne $lemma_start ) {
+           my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
+           my @base_phrase;
+           my $i = $realoffset;
+           my $l = $collation->reading( $lemma_start );
+           while( $i < $realoffset+$length ) {
+               push( @base_phrase, $l );
+               $l = $collation->next_reading( $l );
+               $i++;
+           }
+           
+           print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
+                                 "with %s (%s) with drift %d\n",
+                                 join( ' ', map {$_->label} @base_phrase ),
+                                 join( ' ', map {$_->name} @base_phrase ),
+                                 $realoffset,
+                                 join( ' ', map {$_->label} @$items ),
+                                 join( ' ', map {$_->name} @$items ),
+                                 $drift,
+                                 ) if $debug;
+                                 
+           if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+               warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
+                              "but %s (%s) is there instead", 
+                              join( ' ', map {$_->label} @base_phrase ),
+                              join( ' ', map {$_->name} @base_phrase ),
+                              join( ' ', map {$_->label} @$items ),
+                              join( ' ', map {$_->name} @$items ),
+                              join( ' ', map {$_->label} @this_phrase ),
+                              join( ' ', map {$_->name} @this_phrase ),
+                     ) );
+               # next;
+           }
+       }
        splice( @lemma_text, $realoffset, $length, @$items );
        $drift += @$items - $length;
     }
     return @lemma_text;
 }
-
-sub make_witness_uncorrections {
-    my( $path, $uncorr_path ) = @_;
-    my $diff = Algorithm::Diff->new( $path, $uncorr_path, 
-                                    { 'keyGen' => \&_collation_hash } );
-    # We basically just want to make a bunch of splice arguments that
-    # will reconstruct the ante-corr text from the post-corr.
-    my @diff_list;
-    while( $diff->Next ) {
-       next if $diff->Same;
-       my( $offset ) = $diff->Get( 'min1' );
-       my $length = scalar( $diff->Items( 1 ) );
-       my $items = []; push( @$items, $diff->Items( 2 ) );
-       push( @diff_list, [ $offset, $length, $items ] );
-    }
-    return @diff_list;
-}
        
 
 # Helper function. Given a witness sigil, if it is a post-correctione