new position logic for the lemmatizer and toggler; still need non-linear positions

[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm

index 07e9d57..eedaed9 100644 (file)
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -3,6 +3,7 @@ package Text::Tradition::Parser::BaseText;
 use strict;
 use warnings;
 use Module::Load;
+use Algorithm::Diff;
 
 =head1 NAME
 
@@ -40,12 +41,12 @@ Takes an initialized graph and a set of options, which must include:
 =cut
 
 sub parse {
-    my( $graph, %opts ) = @_;
+    my( $tradition, %opts ) = @_;
 
     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
     load( $format_mod );
     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
-    merge_base( $graph, $opts{'base'}, @apparatus_entries );
+    merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
 }
 
 =item B<merge_base>
@@ -75,18 +76,26 @@ underscore in its name.
 
 =cut
 
+my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
+
+my %base_text_index;
+my $edits_required = {};
+
+# edits_required -> wit -> [ { start_idx, end_idx, items } ]
+
 sub merge_base {
-    my( $graph, $base_file, @app_entries ) = @_;
-    my @base_line_starts = read_base( $base_file, $graph );
+    my( $collation, $base_file, @app_entries ) = @_;
+    my @base_line_starts = read_base( $base_file, $collation );
 
     my %all_witnesses;
+    my @unwitnessed_lemma_nodes;
     foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
-       # last if $line > 2;
+       last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
-       my $scrutinize = "";
-       my $first_line_node = $base_line_starts[ $line ];
+       my $scrutinize = '';
+       my $first_line_reading = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];
        
        my $lemma = $app->{rdg_0};
@@ -98,7 +107,7 @@ sub merge_base {
        my @lemma_words = split( /\s+/, $lemma );
        
        # Now search for the lemma words within this line.
-       my $lemma_start = $first_line_node;
+       my $lemma_start = $first_line_reading;
        my $lemma_end;
        my %seen;
        while( $lemma_start ne $too_far ) {
@@ -120,17 +129,19 @@ sub merge_base {
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
-                       my $next_node = $graph->next_word( $lemma_start );
+                       my $next_reading = 
+                           $collation->next_reading( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n", 
-                                   cmp_str($next_node), $w
+                                   cmp_str($next_reading), $w
                                if "$line.$num" eq $scrutinize;
-                           if( $w ne cmp_str($next_node) ) {
+                           if( $w ne cmp_str($next_reading) ) {
                                $unmatch = 1;
                                last;
                            } else {
-                               $lemma_end = $next_node;
-                               $next_node = $graph->next_word( $lemma_end );
+                               $lemma_end = $next_reading;
+                               $next_reading = 
+                                   $collation->next_reading( $lemma_end );
                            }
                        }
                    } else {
@@ -142,278 +153,500 @@ sub merge_base {
            }
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
-           $lemma_start = $graph->next_word( $lemma_start );
+           $lemma_start = $collation->next_reading( $lemma_start );
        }
        
        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
-       } else {
-           # These are no longer common nodes; unmark them as such.
-           my @lemma_nodes = $graph->node_sequence( $lemma_start, 
-                                                    $lemma_end );
-           map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
        }
        
-       # Now we have our lemma nodes; we add the variant nodes to the graph.
-       
-       # Keep track of the start and end point of each reading for later
-       # node collapse.
-       my @readings = ( $lemma_start, $lemma_end );
-
-       # For each reading that is not rdg_0, we make a chain of nodes
-       # and connect them to the anchor.  Edges are named after the mss
-       # that are relevant.
+       # Now we have found the lemma; we will record an 'edit', in
+       # terms of a splice operation, for each subsequent reading.
+       # We also note which witnesses take the given edit.
+
+       my @lemma_set = $collation->reading_sequence( $lemma_start, 
+                                                     $lemma_end );
+       my @reading_sets = [ @lemma_set ];
+
+       # For each reading that is not rdg_0, we create the variant
+       # reading nodes, and store the range as an edit operation on
+       # the base text.
+       my $variant_objects;
+       my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
-           next if $k eq 'rdg_0'; # that's the lemma.
-           my @variant = split( /\s+/, $app->{$k} );
-           @variant = () if $app->{$k} eq '/'; # This is an omission.
            my @mss = grep { $app->{$_} eq $k } keys( %$app );
-           
-           unless( @mss ) {
-               print STDERR "Skipping '@variant' at $line.$num: no mss\n";
-               next;
-           }
-           
-           # Determine the label name for the edges here.
-           my $edge_name = join(', ', @mss );
+
+           # Keep track of lemma nodes that don't actually appear in
+           # any MSS; we will want to remove them from the collation.
+           push( @unwitnessed_lemma_nodes, @lemma_set )
+               if !@mss && $k eq 'rdg_0';
+
+           # Keep track of what witnesses we have seen.
            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
+           # Keep track of which witnesses bear corrected readings here.
+           foreach my $m ( @mss ) {
+               my $base = _is_post_corr( $m );
+               next unless $base;
+               $pc_seen{$base} = 1;
+           }
+           next if $k eq 'rdg_0';
+
+           # Parse the variant into reading tokens.
+           # TODO don't hardcode the reading split operation
+           my @variant = split( /\s+/, $app->{$k} );
+           @variant = () if $app->{$k} eq '/'; # This is an omission.
            
-           # Make the variant into a set of nodes.
+           my @variant_readings;
            my $ctr = 0;
-           my $last_node = $graph->prior_word( $lemma_start );
-           my $var_start;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
-               my $vwnode = $graph->add_node( $vwname );
-               $vwnode->set_attribute( 'label', $vw );
-               $vwnode->set_attribute( 'class', 'variant' );
-               $graph->add_edge( $last_node, $vwnode, $edge_name );
-               $var_start = $vwnode unless $var_start;
-               $last_node = $vwnode;
+               my $vwreading = $collation->add_reading( $vwname );
+               $vwreading->text( $vw );
+               push( @variant_readings, $vwreading );
            }
-           # Now hook it up at the end.
-           $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
-                                       $edge_name );
-           
-           if( $var_start ) { # if it wasn't an empty reading
-               push( @readings, $var_start, $last_node );
+
+           $variant_objects->{$k} = { 'mss' => \@mss,
+                                      'reading' => \@variant_readings,
+           };
+           push( @reading_sets, \@variant_readings );
+       }
+
+       # Now collate and collapse the identical readings within the
+       # collated sets.  Modifies the reading sets that were passed.
+       collate_variants( $collation, @reading_sets );
+
+       # Record any stated relationships between the nodes and the lemma.
+       set_relationships( $collation, $app, \@lemma_set, $variant_objects );
+
+       # Now create the splice-edit objects that will be used
+       # to reconstruct each witness.
+
+       foreach my $rkey ( keys %$variant_objects ) {
+           # Object is argument list for splice, so:
+           # offset, length, replacements
+           my $edit_object = [ $lemma_start->name,
+                               scalar( @lemma_set ),
+                               $variant_objects->{$rkey}->{reading} ];
+           foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
+               # Is this a p.c. entry?
+               my $base = _is_post_corr( $ms );
+               if( $base ) { # this is a post-corr witness
+                   my $pc_key = $base . "_post";
+                   _add_hash_entry( $edits_required, $pc_key, $edit_object );
+               } else { # this is an ante-corr witness
+                   my $pc_key = $ms . "_post";
+                   _add_hash_entry( $edits_required, $ms, $edit_object );
+                   unless( $pc_seen{$ms} ) {
+                       # If this witness carries no correction, add this 
+                       # same object to its post-corrected state.
+                       _add_hash_entry( $edits_required, $pc_key, 
+                                        $edit_object );
+                   }
+               }
            }
        }
+    } # Finished going through the apparatus entries
+
+    # Now make the witness objects, and create their text sequences
+    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
+       print STDERR "Creating witness $w\n";
+       my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+       my $debug; #  = $w eq 'Vb11';
+       my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
+       my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
+           if exists( $edits_required->{$w."_post"} );
+
+       my @repeated = _check_for_repeated( @ante_corr_seq );
+       warn "Repeated elements @repeated in $w a.c."
+           if @repeated;
+       @repeated = _check_for_repeated( @post_corr_seq );
+       warn "Repeated elements @repeated in $w p.c."
+           if @repeated;
+
+       # Now save these paths in my witness object
+       if( @post_corr_seq ) {
+           $witness_obj->path( \@post_corr_seq );
+           $witness_obj->uncorrected_path( \@ante_corr_seq );
+       } else {
+           $witness_obj->path( \@ante_corr_seq );
+       }
+    }
 
-       # Now collate and collapse the identical nodes within the graph.
-       collate_variants( $graph, @readings );
+    # Now remove our 'base text' edges, which is to say, the only
+    # ones we have created so far.  Also remove any unwitnessed
+    # lemma nodes (TODO unless we are treating base as witness)
+    foreach ( $collation->paths() ) {
+       $collation->del_path( $_ );
+    }
+    foreach( @unwitnessed_lemma_nodes ) {
+       $collation->del_reading( $_ );
     }
 
-    ## Now in theory I have a graph.  I want to make it a little easier to
-    ## read.  So I collapse nodes that have only one edge in and one edge
-    ## out, and I do this by looking at the edges.
-    
-#     foreach my $edge ( $graph->edges() ) {
-#      my @out_edges = $edge->from()->outgoing();
-#      my @in_edges = $edge->to()->incoming();
-       
-#      next if $edge->from() eq $graph->start();
-#      next if $edge->to()->name() eq '#END#';
-#      next unless scalar( @out_edges ) == 1;
-#      next unless scalar( @in_edges ) == 1;
-#      next unless $out_edges[0] eq $in_edges[0];
-#      # In theory if we've got this far, we're safe, but just to
-#      # double-check...
-#      next unless $out_edges[0] eq $edge;
-       
-#      $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
-#     }
-
-    # Now walk the path for each witness, so that we can do the
-    # position calculations.
-    my $paths = {};
-    foreach my $w ( keys %all_witnesses ) {
-       my $back = undef;
-       if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
-           $back = $1;
+    ### HACKY HACKY Do some one-off path corrections here.
+    if( $collation->linear ) {
+       my $c = $collation;
+       my $end = $SHORTEND ? $SHORTEND : 155;
+       my $path = $c->tradition->witness('Vb11')->path;
+       if( $end > 16 ) {
+           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
        }
-       my @wit_nodes = $graph->node_sequence( $graph->start, 
-                                              $graph->node( '#END#' ), 
-                                              $w, $back );
-       my @wn_names = map { $_->name() } @wit_nodes;
-       $paths->{$w} = \@wn_names;
+       # What else?
+    } else {
+       my $c = $collation;
+       my $end = $SHORTEND ? $SHORTEND : 155;
+       # Vb5:
+       my $path = $c->tradition->witness('Vb5')->path;
+       splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
+       # Vb11: 
+       $path = $c->tradition->witness('Vb11')->path;
+       if( $end > 16 ) {
+           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
+       }
+       # Vb12 a.c.:
+       $path = $c->tradition->witness('Vb12')->uncorrected_path;
+       splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137;
+       # Vb13:
+       $path = $c->tradition->witness('Vb13')->path;
+       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+       # Vb20 a.c.: 
+       $path = $c->tradition->witness('Vb20')->uncorrected_path;
+       splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94;
+       # Vb26: 
+       $path = $c->tradition->witness('Vb26')->path;
+       splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
     }
-    $DB::single = 1;
-    my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
-    $graph->make_positions( \@common_nodes, $paths );
+
+    # Now walk paths and calculate positions.
+    my @common_readings = 
+       $collation->make_witness_paths();
+    $collation->calculate_positions( @common_readings );
+}
+
+sub _check_for_repeated {
+    my @seq = @_;
+    my %unique;
+    my @repeated;
+    foreach ( @seq ) {
+       if( exists $unique{$_->name} ) {
+           push( @repeated, $_->name );
+       } else {
+           $unique{$_->name} = 1;
+       }
+    }
+    return @repeated;
 }
 
 =item B<read_base>
 
-my @line_beginnings = read_base( 'reference.txt', $graph );
+my @line_beginnings = read_base( 'reference.txt', $collation );
 
-Takes a text file and a (presumed empty) graph object, adds the words
-as simple linear nodes to the graph, and returns a list of nodes that
-represent the beginning of lines. This graph is now the starting point
-for application of apparatus entries in merge_base, e.g. from a CSV
-file or a Classical Text Editor file.
+Takes a text file and a (presumed empty) collation object, adds the
+words as simple linear readings to the collation, and returns a
+list of readings that represent the beginning of lines. This collation
+is now the starting point for application of apparatus entries in
+merge_base, e.g. from a CSV file or a Classical Text Editor file.
 
 =cut
 
 sub read_base {
-    my( $base_file, $graph ) = @_;
+    my( $base_file, $collation ) = @_;
     
-    # This array gives the first node for each line.  We put the
+    # This array gives the first reading for each line.  We put the
     # common starting point in line zero.
-    my $last_node = $graph->start();
-    my $lineref_array = [ $last_node ]; # There is no line zero.
+    my $last_reading = $collation->start();
+    $base_text_index{$last_reading->name} = 0;
+    my $lineref_array = [ $last_reading ]; # There is no line zero.
 
     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
+    my $i = 1;
     while(<BASE>) {
-       # Make the nodes, and connect them up for the base, but also
-       # save the first node of each line in an array for the purpose.
+       # Make the readings, and connect them up for the base, but
+       # also save the first reading of each line in an array for the
+       # purpose.
+       # TODO use configurable reading separator
        chomp;
        my @words = split;
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
+       last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
-           my $noderef = join( ',', $lineref, ++$wordref );
-           my $node = $graph->add_node( $noderef );
-           $node->set_attribute( 'label', $w );
-           $node->set_attribute( 'class', 'common' );
+           my $readingref = join( ',', $lineref, ++$wordref );
+           my $reading = $collation->add_reading( $readingref );
+           $reading->text( $w );
            unless( $started ) {
-               push( @$lineref_array, $node );
+               push( @$lineref_array, $reading );
                $started = 1;
            }
-           if( $last_node ) {
-               my $edge = $graph->add_edge( $last_node, $node, "base text" );
-               $edge->set_attribute( 'class', 'basetext' );
-               $last_node = $node;
-           } # TODO there should be no else here...
+           # Add edge paths in the graph, for easier tracking when
+           # we start applying corrections.  These paths will be
+           # removed when we're done.
+           my $path = $collation->add_path( $last_reading, $reading, 
+                                            $collation->baselabel );
+           $last_reading = $reading;
+
+           # Note an array index for the reading, for later correction splices.
+           $base_text_index{$readingref} = $i++;
        }
     }
     close BASE;
     # Ending point for all texts
-    my $endpoint = $graph->add_node( '#END#' );
-    $graph->add_edge( $last_node, $endpoint, "base text" );
+    my $endpoint = $collation->add_reading( '#END#' );
+    $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
     push( @$lineref_array, $endpoint );
+    $base_text_index{$endpoint->name} = $i;
 
     return( @$lineref_array );
 }
 
 =item B<collate_variants>
 
-collate_variants( $graph, @readings )
+collate_variants( $collation, @reading_ranges )
 
 Given a set of readings in the form 
 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
-walks through each to identify those nodes that are identical.  The
-graph is a Text::Tradition::Graph object; the elements of @readings are
-Graph::Easy::Node objects that appear on the graph.
+walks through each to identify those readings that are identical.  The
+collation is a Text::Tradition::Collation object; the elements of
+@readings are Text::Tradition::Collation::Reading objects that appear
+on the collation graph.
 
 TODO: Handle collapsed and non-collapsed transpositions.
 
 =cut
 
 sub collate_variants {
-    my( $graph, @readings ) = @_;
-    my $lemma_start = shift @readings;
-    my $lemma_end = shift @readings;
-    my $detranspose = 0;
-
-    # Start the list of distinct nodes with those nodes in the lemma.
-    my @distinct_nodes;
-    while( $lemma_start ne $lemma_end ) {
-       push( @distinct_nodes, [ $lemma_start, 'base text' ] );
-       $lemma_start = $graph->next_word( $lemma_start );
-    } 
-    push( @distinct_nodes, [ $lemma_end, 'base text' ] );
-    
-
-    while( scalar @readings ) {
-       my( $var_start, $var_end ) = splice( @readings, 0, 2 );
-
-       # I want to look at the nodes in the variant and lemma, and
-       # collapse nodes that are the same word.  This is mini-collation.
-       # Each word in the 'main' list can only be collapsed once with a
-       # word from the current reading.
-       my %collapsed = ();
-
-       # Get the label. There will only be one outgoing edge to start
-       # with, so this is safe.
-       my @out = $var_start->outgoing();
-       my $var_label = $out[0]->label();
+    my( $collation, @reading_sets ) = @_;
+
+    # Two different ways to do this, depending on whether we want
+    # transposed reading nodes to be merged into one (producing a
+    # nonlinear, bidirectional graph) or not (producing a relatively
+    # linear, unidirectional graph.)
+    return $collation->linear ? collate_linearly( @_ )
+       : collate_nonlinearly( @_ );
+}
 
-       my @variant_nodes;
-       while( $var_start ne $var_end ) {
-           push( @variant_nodes, $var_start );
-           $var_start = $graph->next_word( $var_start, $var_label );
+sub collate_linearly {
+    my( $collation, $lemma_set, @variant_sets ) = @_;
+
+    my @unique;
+    push( @unique, @$lemma_set );
+    while( @variant_sets ) {
+       my $variant_set = shift @variant_sets;
+       # Use diff to do this job
+       my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
+                                        {'keyGen' => \&_collation_hash} );
+       my @new_unique;
+       my %merged;
+       while( $diff->Next ) {
+           if( $diff->Same ) {
+               # merge the nodes
+               my @l = $diff->Items( 1 );
+               my @v = $diff->Items( 2 );
+               foreach my $i ( 0 .. $#l ) {
+                   if( !$merged{$l[$i]->name} ) {
+                       $collation->merge_readings( $l[$i], $v[$i] );
+                       $merged{$l[$i]->name} = 1;
+                   } else {
+                       print STDERR "Would have double merged " . $l[$i]->name . "\n";
+                   }
+               }
+               # splice the lemma nodes into the variant set
+               my( $offset ) = $diff->Get( 'min2' );
+               splice( @$variant_set, $offset, scalar( @l ), @l );
+               push( @new_unique, @l );
+           } else {
+               # Keep the old unique readings
+               push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+               # Add the new readings to the 'unique' list
+               push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
+           }
        }
-       push( @variant_nodes, $var_end );
-
-       # Go through the variant nodes, and if we find a lemma node that
-       # hasn't yet been collapsed with a node, equate them.  If we do
-       # not, keep them to push onto the end of all_nodes.
-       my @remaining_nodes;
-       my $last_index = 0;
-       foreach my $w ( @variant_nodes ) {
-           my $word = $w->label();
-           my $matched = 0;
-           foreach my $idx ( $last_index .. $#distinct_nodes ) {
-               my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
-               if( $word eq cmp_str( $l ) ) {
-                   next if exists( $collapsed{ $l->label } )
-                       && $collapsed{ $l->label } eq $l;
-                   $matched = 1;
-                   $last_index = $idx if $detranspose;
-                   # Collapse the nodes.
-                   printf STDERR "Merging nodes %s/%s and %s/%s\n", 
-                       $l->name, $l->label, $w->name, $w->label;
-                   $graph->merge_nodes( $l, $w );
-                   $collapsed{ $l->label } = $l;
-                   # Now collapse any multiple edges to and from the node.
-                   remove_duplicate_edges( $graph, 
-                                   $graph->prior_word( $l, $edgelabel ), $l );
-                   remove_duplicate_edges( $graph, $l, 
-                                   $graph->next_word( $l, $edgelabel ) );
-                   last if $matched;
+       @unique = @new_unique;
+    }
+}
+
+sub collate_nonlinearly {
+    my( $collation, $lemma_set, @variant_sets ) = @_;
+    
+    my @unique;
+    push( @unique, @$lemma_set );
+    while( @variant_sets ) {
+       my $variant_set = shift @variant_sets;
+       # Simply match the first reading that carries the same word, so
+       # long as that reading has not yet been used to match another
+       # word in this variant. That way lies loopy madness.
+       my @distinct;
+       my %merged;
+       foreach my $idx ( 0 .. $#{$variant_set} ) {
+           my $vw = $variant_set->[$idx];
+           my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
+           my $matched;
+           if( @same ) {
+               foreach my $i ( 0 .. $#same ) {
+                   unless( $merged{$same[$i]->name} ) {
+                       #print STDERR sprintf( "Merging %s into %s\n", 
+                       #                     $vw->name,
+                       #                     $same[$i]->name );
+                       $collation->merge_readings( $same[$i], $vw );
+                       $merged{$same[$i]->name} = 1;
+                       $matched = $i;
+                       $variant_set->[$idx] = $same[$i];
+                   }
                }
            }
-           push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
+           unless( @same && defined($matched) ) {
+               push( @distinct, $vw );
+           }
        }
-       push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
+       push( @unique, @distinct );
     }
 }
 
-=item B<remove_duplicate_edges>
 
-remove_duplicate_edges( $graph, $from, $to );
+    
+sub _collation_hash {
+    my $node = shift;
+    return cmp_str( $node );
+}
 
-Given two nodes, reduce the number of edges between those nodes to
-one.  If neither edge represents a base text, combine their labels.
+sub set_relationships {
+    my( $collation, $app, $lemma, $variants ) = @_;
+    foreach my $rkey ( keys %$variants ) {
+       my $var = $variants->{$rkey}->{'reading'};
+       my $type = $app->{sprintf( "_%s_type", $rkey )};
+       my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
+       my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
+       
+       my %rel_options = ();
+       $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
+       $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
+       
+       if( $type =~ /^(inv|tr|rep)$/i ) {
+           # Transposition or repetition: look for nodes with the
+           # same label but different IDs and mark them.
+           $type = 'repetition' if $type =~ /^rep/i;
+           $rel_options{'type'} = $type;
+           my %labels;
+           foreach my $r ( @$lemma ) {
+               $labels{cmp_str( $r )} = $r;
+           }
+           foreach my $r( @$var ) {
+               if( exists $labels{$r->label} &&
+                   $r->name ne $labels{$r->label}->name ) {
+                   if( $type eq 'repetition' ) {
+                       # Repetition
+                       $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+                   } else {
+                       # Transposition
+                       $r->set_identical( $labels{$r->label} );
+                   }
+               }
+           }
+       } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) {
+
+           # Grammar/spelling/lexical: this can be a one-to-one or
+           # one-to-many mapping.  We should think about merging
+           # readings if it is one-to-many.
+
+           $type = 'grammatical' if $type =~ /gr/i;
+           $type = 'spelling' if $type =~ /sp/i;
+           $type = 'repetition' if $type =~ /rep/i;
+           $type = 'lexical' if $type =~ /lex/i;
+           $rel_options{'type'} = $type;
+           if( @$lemma == @$var ) {
+               foreach my $i ( 0 .. $#{$lemma} ) {
+                   $collation->add_relationship( $var->[$i], $lemma->[$i],
+                       \%rel_options );
+               } 
+           } else {
+               # An uneven many-to-many mapping.  Make a segment out of
+               # whatever we have.
+               my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
+               my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
+               $collation->add_relationship( $varseg, $lemseg, \%rel_options );
+           }
+       } elsif( $type !~ /^(add|om)$/i ) {
+           warn "Unrecognized type $type";
+       }
+    }
+}
+       
 
-=cut
 
-sub remove_duplicate_edges {
-    my( $graph, $from, $to ) = @_;
-    my @edges = $from->edges_to( $to );
-    if( scalar @edges > 1 ) {
-       my @base = grep { $_->label eq 'base text' } @edges;
-       if ( scalar @base ) {
-           # Remove the edges that are not base.
-           foreach my $e ( @edges ) {
-               $graph->del_edge( $e )
-                   unless $e eq $base[0];
+sub apply_edits {
+    my( $collation, $edit_sequence, $debug ) = @_;
+    my @lemma_text = $collation->reading_sequence( $collation->start,
+                                          $collation->reading( '#END#' ) );
+    my $drift = 0;
+    foreach my $correction ( @$edit_sequence ) {
+       my( $lemma_start, $length, $items ) = @$correction;
+       my $offset = $base_text_index{$lemma_start};
+       my $realoffset = $offset + $drift;
+       if( $debug ||
+           $lemma_text[$realoffset]->name ne $lemma_start ) {
+           my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
+           my @base_phrase;
+           my $i = $realoffset;
+           my $l = $collation->reading( $lemma_start );
+           while( $i < $realoffset+$length ) {
+               push( @base_phrase, $l );
+               $l = $collation->next_reading( $l );
+               $i++;
            }
-       } else {
-           # Combine the edges into one.
-           my $new_edge_name = join( ', ', map { $_->label() } @edges );
-           my $new_edge = shift @edges;
-           $new_edge->set_attribute( 'label', $new_edge_name );
-           foreach my $e ( @edges ) {
-               $graph->del_edge( $e );
+           
+           print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
+                                 "with %s (%s) with drift %d\n",
+                                 join( ' ', map {$_->label} @base_phrase ),
+                                 join( ' ', map {$_->name} @base_phrase ),
+                                 $realoffset,
+                                 join( ' ', map {$_->label} @$items ),
+                                 join( ' ', map {$_->name} @$items ),
+                                 $drift,
+                                 ) if $debug;
+                                 
+           if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+               warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
+                              "but %s (%s) is there instead", 
+                              join( ' ', map {$_->label} @base_phrase ),
+                              join( ' ', map {$_->name} @base_phrase ),
+                              join( ' ', map {$_->label} @$items ),
+                              join( ' ', map {$_->name} @$items ),
+                              join( ' ', map {$_->label} @this_phrase ),
+                              join( ' ', map {$_->name} @this_phrase ),
+                     ) );
+               # next;
            }
        }
+       splice( @lemma_text, $realoffset, $length, @$items );
+       $drift += @$items - $length;
+    }
+    return @lemma_text;
+}
+       
+
+# Helper function. Given a witness sigil, if it is a post-correctione
+# sigil,return the base witness.  If not, return a false value.
+sub _is_post_corr {
+    my( $sigil ) = @_;
+    if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
+       return $1;
     }
+    return undef;
 }
 
+sub _add_hash_entry {
+    my( $hash, $key, $entry ) = @_;
+    if( exists $hash->{$key} ) {
+       push( @{$hash->{$key}}, $entry );
+    } else {
+       $hash->{$key} = [ $entry ];
+    }
+}
+
+
 =item B<cmp_str>
 
 Pretend you never saw this method.  Really it needs to not be hardcoded.
@@ -421,8 +654,8 @@ Pretend you never saw this method.  Really it needs to not be hardcoded.
 =cut
 
 sub cmp_str {
-    my( $node ) = @_;
-    my $word = $node->label();
+    my( $reading ) = @_;
+    my $word = $reading->label();
     $word = lc( $word );
     $word =~ s/\W//g;
     $word =~ s/v/u/g;