X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FBaseText.pm;h=ed2a447c0c8dd26d92d7c6bd80f89c676e29ced5;hb=7e450e44cb188a835f5ca2b9f007b9782fcc753c;hp=e1867e92e5e3a57649ee4c1dab87a1f3492c27e6;hpb=e58153d6e15d5d21fb2c3fdc743487dceae06aa1;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index e1867e9..ed2a447 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -2,285 +2,565 @@ package Text::Tradition::Parser::BaseText;
 
 use strict;
 use warnings;
-use Exporter 'import';
-use vars qw( @EXPORT_OK );
-@EXPORT_OK = qw( merge_base );
+use Module::Load;
+use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
+
+=head1 NAME
+
+Text::Tradition::Parser::BaseText
+
+=head1 SYNOPSIS
+
+use Text::Tradition::Parser::BaseText qw( merge_base );
+merge_base( $graph, 'reference.txt', @apparatus_entries )
+
+=head1 DESCRIPTION
+
+For an overview of the package, see the documentation for the
+Text::Tradition::Graph module.
+
+This module is meant for use with certain of the other Parser classes
+- whenever a list of variants is given with reference to a base text,
+these must be joined into a single collation.  The parser should
+therefore make a list of variants and their locations, and BaseText
+will join those listed variants onto the reference text.  
+
+=head1 SUBROUTINES
+
+=over
+
+=item B<parse>
+
+parse( $graph, %opts );
+
+Takes an initialized graph and a set of options, which must include:
+- 'base' - the base text referenced by the variants
+- 'format' - the format of the variant list
+- 'data' - the variants, in the given format.
+
+=cut
+
+sub parse {
+    my( $tradition, %opts ) = @_;
+
+    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
+    load( $format_mod );
+    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
+    merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
+}
+
+=item B<merge_base>
+
+merge_base( $graph, 'reference.txt', @apparatus_entries )
+
+Takes three arguments: a newly-initialized Text::Tradition::Graph
+object, a text file containing the reference text, and a list of
+variants (apparatus entries).  Adds the base text to the graph, and
+joins the variants to that.
+
+The list of variants is an array of hash references; each hash takes
+the form
+ { '_id' => line reference,
+   'rdg_0' => lemma reading,
+   'rdg_1' => first variant,
+   ...  # and so on until all distinct readings are listed
+   'WitnessA' => 'rdg_0',
+   'WitnessB' => 'rdg_1',
+   ...  # and so on until all witnesses are listed with their readings
+ }
+
+Any hash key that is not of the form /^rdg_\d+$/ and that does not
+begin with an underscore is assumed to be a witness name.  Any 'meta'
+information to be passed must be passed in a key with a leading
+underscore in its name.
+
+=cut
+
+my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
+
+my %base_text_index;
+my $edits_required = {};
+
+# edits_required -> wit -> [ { start_idx, end_idx, items } ]
 
 sub merge_base {
-    my( $graph, $base_file, @app_entries ) = @_;
-    my @base_line_starts = read_base( $base_file, $graph );
+    my( $collation, $base_file, @app_entries ) = @_;
+    my @base_line_starts = read_base( $base_file, $collation );
 
+    my %all_witnesses;
+    my @unwitnessed_lemma_nodes;
     foreach my $app ( @app_entries ) {
-	my( $line, $num ) = split( /\./, $app->{_id} );
-	# DEBUG with a short graph
-	# last if $line > 2;
-	my $scrutinize = "21.8";
-	my $first_line_node = $base_line_starts[ $line ];
-	my $too_far = $base_line_starts[ $line+1 ];
-	
-	my $lemma = $app->{rdg_0};
-	my $seq = 1; 
-	# Is this the Nth occurrence of this reading in the line?
-	if( $lemma =~ s/(_)?(\d)$// ) {
-	    $seq = $2;
-	}
-	my @lemma_words = split( /\s+/, $lemma );
-	
-	# Now search for the lemma words within this line.
-	my $lemma_start = $first_line_node;
-	my $lemma_end;
-	my %seen;
-	while( $lemma_start ne $too_far ) {
-	    # Loop detection
-	    if( $seen{ $lemma_start->name() } ) {
-		warn "Detected loop at " . $lemma_start->name() . 
-		    ", ref $line,$num";
-		last;
-	    }
-	    $seen{ $lemma_start->name() } = 1;
-	    
-	    # Try to match the lemma.
-	    my $unmatch = 0;
-	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
-		$lemma_words[0] . "...\n"
-		if "$line.$num" eq $scrutinize;
-	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
-		# Skip it if we need a match that is not the first.
-		if( --$seq < 1 ) {
-		    # Now we have to compare the rest of the words here.
-		    if( scalar( @lemma_words ) > 1 ) {
-			my $next_node = $graph->next_word( $lemma_start );
-			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
-			    printf STDERR "Now matching %s against %s\n", 
-				    cmp_str($next_node), $w
-				if "$line.$num" eq $scrutinize;
-			    if( $w ne cmp_str($next_node) ) {
-				$unmatch = 1;
-				last;
-			    } else {
-				$lemma_end = $next_node;
-				$next_node = $graph->next_word( $lemma_end );
-			    }
-			}
-		    } else {
-			$lemma_end = $lemma_start;
-		    }
-		} else {
-		    $unmatch = 1;
-		}
-	    }
-	    last unless ( $unmatch || !defined( $lemma_end ) );
-	    $lemma_end = undef;
-	    $lemma_start = $graph->next_word( $lemma_start );
-	}
-	
-	unless( $lemma_end ) {
-	    warn "No match found for @lemma_words at $line.$num";
-	    next;
-	} else {
-	    # These are no longer common nodes; unmark them as such.
-	    my @lemma_nodes = $graph->node_sequence( $lemma_start, 
-						     $lemma_end );
-	    map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
-	}
-	
-	# Now we have our lemma nodes; we add the variant nodes to the graph.
-	
-	# For each reading that is not rdg_0, we make a chain of nodes
-	# and connect them to the anchor.  Edges are named after the mss
-	# that are relevant.
-	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
-	    next if $k eq 'rdg_0'; # that's the lemma.
-	    my @variant = split( /\s+/, $app->{$k} );
-	    @variant = () if $app->{$k} eq '/'; # This is an omission.
-	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
-	    
-	    unless( @mss ) {
-		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
-		next;
-	    }
-	    
-	    # Determine the label name for the edges here.
-	    my $edge_name = join(', ', @mss );
-	    
-	    # Make the variant into a set of nodes.
-	    my $ctr = 0;
-	    my $last_node = $graph->prior_word( $lemma_start );
-	    my $var_start;
-	    foreach my $vw ( @variant ) {
-		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
-		my $vwnode = $graph->add_node( $vwname );
-		$vwnode->set_attribute( 'label', $vw );
-		$vwnode->set_attribute( 'class', 'variant' );
-		$graph->add_edge( $last_node, $vwnode, $edge_name );
-		$var_start = $vwnode unless $var_start;
-		$last_node = $vwnode;
-	    }
-	    # Now hook it up at the end.
-	    $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
-					$edge_name );
-	    
-	    # Now collate and collapse the identical nodes within the graph.
-	    collate_variant( $graph, $lemma_start, $lemma_end, 
-			     $var_start, $last_node );
-	    
-	}
+        my( $line, $num ) = split( /\./, $app->{_id} );
+        # DEBUG with a short graph
+        last if $SHORTEND && $line > $SHORTEND;
+        # DEBUG for problematic entries
+        my $scrutinize = '';
+        my $first_line_reading = $base_line_starts[ $line ];
+        my $too_far = $base_line_starts[ $line+1 ];
+        
+        my $lemma = $app->{rdg_0};
+        my $seq = 1; 
+        # Is this the Nth occurrence of this reading in the line?
+        if( $lemma =~ s/(_)?(\d)$// ) {
+            $seq = $2;
+        }
+        my @lemma_words = split( /\s+/, $lemma );
+        
+        # Now search for the lemma words within this line.
+        my $lemma_start = $first_line_reading;
+        my $lemma_end;
+        my %seen;
+        while( $lemma_start ne $too_far ) {
+            # Loop detection
+            if( $seen{ $lemma_start->name() } ) {
+                warn "Detected loop at " . $lemma_start->name() . 
+                    ", ref $line,$num";
+                last;
+            }
+            $seen{ $lemma_start->name() } = 1;
+            
+            # Try to match the lemma.
+            my $unmatch = 0;
+            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
+                $lemma_words[0] . "...\n"
+                if "$line.$num" eq $scrutinize;
+            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
+                # Skip it if we need a match that is not the first.
+                if( --$seq < 1 ) {
+                    # Now we have to compare the rest of the words here.
+                    if( scalar( @lemma_words ) > 1 ) {
+                        my $next_reading = 
+                            $collation->next_reading( $lemma_start );
+                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
+                            printf STDERR "Now matching %s against %s\n", 
+                                    cmp_str($next_reading), $w
+                                if "$line.$num" eq $scrutinize;
+                            if( $w ne cmp_str($next_reading) ) {
+                                $unmatch = 1;
+                                last;
+                            } else {
+                                $lemma_end = $next_reading;
+                                $next_reading = 
+                                    $collation->next_reading( $lemma_end );
+                            }
+                        }
+                    } else {
+                        $lemma_end = $lemma_start;
+                    }
+                } else {
+                    $unmatch = 1;
+                }
+            }
+            last unless ( $unmatch || !defined( $lemma_end ) );
+            $lemma_end = undef;
+            $lemma_start = $collation->next_reading( $lemma_start );
+        }
+        
+        unless( $lemma_end ) {
+            warn "No match found for @lemma_words at $line.$num";
+            next;
+        }
+        
+        # Now we have found the lemma; we will record an 'edit', in
+        # terms of a splice operation, for each subsequent reading.
+        # We also note which witnesses take the given edit.
+
+        my @lemma_set = $collation->reading_sequence( $lemma_start, 
+                                                      $lemma_end );
+        my @reading_sets = [ @lemma_set ];
+
+        # For each reading that is not rdg_0, we create the variant
+        # reading nodes, and store the range as an edit operation on
+        # the base text.
+        my $variant_objects;
+        my %pc_seen; # Keep track of mss with explicit post-corr data
+        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
+            my @mss = grep { $app->{$_} eq $k } keys( %$app );
+
+            # Keep track of lemma nodes that don't actually appear in
+            # any MSS; we will want to remove them from the collation.
+            push( @unwitnessed_lemma_nodes, @lemma_set )
+                if !@mss && $k eq 'rdg_0';
+
+            # Keep track of what witnesses we have seen.
+            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
+            # Keep track of which witnesses bear corrected readings here.
+            foreach my $m ( @mss ) {
+                my $base = _is_post_corr( $m );
+                next unless $base;
+                $pc_seen{$base} = 1;
+            }
+            next if $k eq 'rdg_0';
+
+            # Parse the variant into reading tokens.
+            # TODO don't hardcode the reading split operation
+            my @variant = split( /\s+/, $app->{$k} );
+            @variant = () if $app->{$k} eq '/'; # This is an omission.
+            
+            my @variant_readings;
+            my $ctr = 0;
+            foreach my $vw ( @variant ) {
+                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
+                my $vwreading = $collation->add_reading( $vwname );
+                $vwreading->text( $vw );
+                push( @variant_readings, $vwreading );
+            }
+
+            $variant_objects->{$k} = { 'mss' => \@mss,
+                                       'reading' => \@variant_readings,
+            };
+            push( @reading_sets, \@variant_readings );
+        }
+
+        # Now collate and collapse the identical readings within the
+        # collated sets.  Modifies the reading sets that were passed.
+        collate_variants( $collation, @reading_sets );
+
+        # Record any stated relationships between the nodes and the lemma.
+        set_relationships( $collation, $app, \@lemma_set, $variant_objects );
+
+        # Now create the splice-edit objects that will be used
+        # to reconstruct each witness.
+
+        foreach my $rkey ( keys %$variant_objects ) {
+            # Object is argument list for splice, so:
+            # offset, length, replacements
+            my $edit_object = [ $lemma_start->name,
+                                scalar( @lemma_set ),
+                                $variant_objects->{$rkey}->{reading} ];
+            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
+                # Is this a p.c. entry?
+                my $base = _is_post_corr( $ms );
+                if( $base ) { # this is a post-corr witness
+                    my $pc_key = $base . "_post";
+                    add_hash_entry( $edits_required, $pc_key, $edit_object );
+                } else { # this is an ante-corr witness
+                    my $pc_key = $ms . "_post";
+                    add_hash_entry( $edits_required, $ms, $edit_object );
+                    unless( $pc_seen{$ms} ) {
+                        # If this witness carries no correction, add this 
+                        # same object to its post-corrected state.
+                        add_hash_entry( $edits_required, $pc_key, 
+                                         $edit_object );
+                    }
+                }
+            }
+        }
+    } # Finished going through the apparatus entries
+
+    # Now make the witness objects, and create their text sequences
+    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
+        print STDERR "Creating witness $w\n";
+        my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+        my $debug; #  = $w eq 'Vb11';
+        my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
+        my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
+            if exists( $edits_required->{$w."_post"} );
+
+        my @repeated = check_for_repeated( @ante_corr_seq );
+        warn "Repeated elements @repeated in $w a.c."
+            if @repeated;
+        @repeated = check_for_repeated( @post_corr_seq );
+        warn "Repeated elements @repeated in $w p.c."
+            if @repeated;
+
+        # Now save these paths in my witness object
+        if( @post_corr_seq ) {
+            $witness_obj->path( \@post_corr_seq );
+            $witness_obj->uncorrected_path( \@ante_corr_seq );
+        } else {
+            $witness_obj->path( \@ante_corr_seq );
+        }
     }
 
-    ## Now in theory I have a graph.  I want to make it a little easier to
-    ## read.  So I collapse nodes that have only one edge in and one edge
-    ## out, and I do this by looking at the edges.
-    
-    foreach my $edge ( $graph->edges() ) {
-	my @out_edges = $edge->from()->outgoing();
-	my @in_edges = $edge->to()->incoming();
-	
-	next unless scalar( @out_edges ) == 1;
-	next unless scalar( @in_edges ) == 1;
-	next unless $out_edges[0] eq $in_edges[0];
-	# In theory if we've got this far, we're safe, but just to
-	# double-check...
-	next unless $out_edges[0] eq $edge;
-	
-	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
+    # Now remove our 'base text' edges, which is to say, the only
+    # ones we have created so far.  Also remove any unwitnessed
+    # lemma nodes (TODO unless we are treating base as witness)
+    foreach ( $collation->paths() ) {
+        $collation->del_path( $_ );
     }
+    foreach( @unwitnessed_lemma_nodes ) {
+        $collation->del_reading( $_ );
+        # TODO do we need to delete any relationship paths here?
+    }
+
+    ### HACKY HACKY Do some one-off path corrections here.
+    if( $collation->linear ) {
+       my $c = $collation;
+       my $end = $SHORTEND ? $SHORTEND : 155;
+       # Vb11
+       my $path;
+       if( $end > 16 ) {
+           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+           $path = $c->tradition->witness('Vb11')->path;
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
+           $path = $c->tradition->witness('Vb11')->uncorrected_path;
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
+       }
+       # What else?
+       # Vb26:
+       $path = $c->tradition->witness('Vb26')->path;
+       splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46;
+       # Vb13:
+       $path = $c->tradition->witness('Vb13')->path;
+       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+       $path = $c->tradition->witness('Vb13')->uncorrected_path;
+       splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
+       # Vb20 a.c.: 
+       $path = $c->tradition->witness('Vb20')->uncorrected_path;
+       splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
+       # Vb5:
+       $path = $c->tradition->witness('Vb5')->path;
+       splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106;
+       # extraneous:
+       $c->del_reading( 'rdg_2/147.6.13' );
+       $c->del_reading( 'rdg_2/147.6.14' );
+       $c->del_reading( 'rdg_2/147.6.15' );
+       
+    } else {
+       my $c = $collation;
+       my $end = $SHORTEND ? $SHORTEND : 155;
+       # Vb5:
+       my $path = $c->tradition->witness('Vb5')->path;
+       splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
+       # Vb11: 
+       $path = $c->tradition->witness('Vb11')->path;
+       if( $end > 16 ) {
+           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
+       }
+       # Vb13:
+       $path = $c->tradition->witness('Vb13')->path;
+       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+       $path = $c->tradition->witness('Vb13')->uncorrected_path;
+       splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
+       # Vb20 a.c.: 
+       $path = $c->tradition->witness('Vb20')->uncorrected_path;
+       splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
+       # Vb26: 
+       $path = $c->tradition->witness('Vb26')->path;
+       splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
+    }
+
+    # Now walk paths and calculate positional rank.
+    $collation->make_witness_paths();
+    # Have to check relationship validity at this point, because before that
+    # we had no paths.
+#     foreach my $rel ( $collation->relationships ) {
+#         next unless $rel->equal_rank;
+#         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
+#             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
+#                             $rel->type, $rel->from->name, $rel->to->name );
+#         }
+#     }
+    $collation->calculate_ranks();
 }
 
-# read_base: Takes a text file and a (presumed empty) graph object,
-# adds the words as simple linear nodes to the graph, and returns a
-# list of nodes that represent the beginning of lines. This graph is
-# now the starting point for application of apparatus entries in
-# merge_base, e.g. from a CSV file or a CTE file.
+=item B<read_base>
+
+my @line_beginnings = read_base( 'reference.txt', $collation );
+
+Takes a text file and a (presumed empty) collation object, adds the
+words as simple linear readings to the collation, and returns a
+list of readings that represent the beginning of lines. This collation
+is now the starting point for application of apparatus entries in
+merge_base, e.g. from a CSV file or a Classical Text Editor file.
+
+=cut
 
 sub read_base {
-    my( $base_file, $graph ) = @_;
+    my( $base_file, $collation ) = @_;
     
-    # This array gives the first node for each line.  We put the
+    # This array gives the first reading for each line.  We put the
     # common starting point in line zero.
-    my $last_node = $graph->start();
-    my $lineref_array = [ $last_node ]; # There is no line zero.
+    my $last_reading = $collation->start();
+    $base_text_index{$last_reading->name} = 0;
+    my $lineref_array = [ $last_reading ]; # There is no line zero.
 
     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
+    my $i = 1;
     while(<BASE>) {
-	# Make the nodes, and connect them up for the base, but also
-	# save the first node of each line in an array for the purpose.
-	chomp;
-	my @words = split;
-	my $started = 0;
-	my $wordref = 0;
-	my $lineref = scalar @$lineref_array;
-	foreach my $w ( @words ) {
-	    my $noderef = join( ',', $lineref, ++$wordref );
-	    my $node = $graph->add_node( $noderef );
-	    $node->set_attribute( 'label', $w );
-	    $node->set_attribute( 'class', 'common' );
-	    unless( $started ) {
-		push( @$lineref_array, $node );
-		$started = 1;
-	    }
-	    if( $last_node ) {
-		$graph->add_edge( $last_node, $node, "base text" );
-		$last_node = $node;
-	    } # TODO there should be no else here...
-	}
+        # Make the readings, and connect them up for the base, but
+        # also save the first reading of each line in an array for the
+        # purpose.
+        # TODO use configurable reading separator
+        chomp;
+        my @words = split;
+        my $started = 0;
+        my $wordref = 0;
+        my $lineref = scalar @$lineref_array;
+        last if $SHORTEND && $lineref > $SHORTEND;
+        foreach my $w ( @words ) {
+            my $readingref = join( ',', $lineref, ++$wordref );
+            my $reading = $collation->add_reading( $readingref );
+            $reading->text( $w );
+            unless( $started ) {
+                push( @$lineref_array, $reading );
+                $started = 1;
+            }
+            # Add edge paths in the graph, for easier tracking when
+            # we start applying corrections.  These paths will be
+            # removed when we're done.
+            my $path = $collation->add_path( $last_reading, $reading, 
+                                             $collation->baselabel );
+            $last_reading = $reading;
+
+            # Note an array index for the reading, for later correction splices.
+            $base_text_index{$readingref} = $i++;
+        }
     }
     close BASE;
     # Ending point for all texts
-    my $endpoint = $graph->add_node( '#END#' );
-    $graph->add_edge( $last_node, $endpoint, "base text" );
-    push( @$lineref_array, $endpoint );
+    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
+    push( @$lineref_array, $collation->end );
+    $base_text_index{$collation->end->name} = $i;
 
     return( @$lineref_array );
 }
 
+sub set_relationships {
+    my( $collation, $app, $lemma, $variants ) = @_;
+    foreach my $rkey ( keys %$variants ) {
+        my $var = $variants->{$rkey}->{'reading'};
+        my $type = $app->{sprintf( "_%s_type", $rkey )};
+        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
+        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
+        
+        my %rel_options = ();
+        $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
+        $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
+        
+        if( $type =~ /^(inv|tr|rep)$/i ) {
+            # Transposition or repetition: look for nodes with the
+            # same label but different IDs and mark them.
+            $type = 'repetition' if $type =~ /^rep/i;
+            $rel_options{'type'} = $type;
+            $rel_options{'equal_rank'} = undef;
+            my %labels;
+            foreach my $r ( @$lemma ) {
+                $labels{cmp_str( $r )} = $r;
+            }
+            foreach my $r( @$var ) {
+                if( exists $labels{$r->label} &&
+                    $r->name ne $labels{$r->label}->name ) {
+                    if( $type eq 'repetition' ) {
+                        # Repetition
+                        $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+                    } else {
+                        # Transposition
+                        $r->set_identical( $labels{$r->label} );
+                    }
+                }
+            }
+        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
 
-## Helper methods for merge_base
+            # Grammar/spelling/lexical: this can be a one-to-one or
+            # one-to-many mapping.  We should think about merging
+            # readings if it is one-to-many.
 
-sub collate_variant {
-    my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
-    # If var_start is undef, then the variant is an omission and
-    # there's nothing to collate. Return.
-    return unless $var_start;
+            $type = 'grammatical' if $type =~ /gr/i;
+            $type = 'spelling' if $type =~ /sp/i;
+            $type = 'repetition' if $type =~ /rep/i;
+            # $type = 'lexical' if $type =~ /lex/i;
+            $rel_options{'type'} = $type;
+            $rel_options{'equal_rank'} = 1;
+            if( @$lemma == @$var ) {
+                foreach my $i ( 0 .. $#{$lemma} ) {
+                    $collation->add_relationship( $var->[$i], $lemma->[$i],
+                        \%rel_options );
+                } 
+            } else {
+                # An uneven many-to-many mapping.  Skip for now.
+                # We really want to make a segment out of whatever we have.
+                # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
+                # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
+                # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
+                if( @$lemma == 1 && @$var == 1 ) {
+                    $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
+                }
+            }
+        } elsif( $type !~ /^(add|om|lex)$/i ) {
+            warn "Unrecognized type $type";
+        }
+    }
+}
+        
 
-    # I want to look at the nodes in the variant and lemma, and
-    # collapse nodes that are the same word.  This is mini-collation.
-    my %collapsed = ();
-    # There will only be one outgoing edge at first, so this is safe.
-    my @out = $var_start->outgoing();
-    my $var_label = $out[0]->label();
 
-    my @lemma_nodes;
-    while( $lemma_start ne $lemma_end ) {
-	push( @lemma_nodes, $lemma_start );
-	$lemma_start = $graph->next_word( $lemma_start );
-    } 
-    push( @lemma_nodes, $lemma_end );
-    
-    my @variant_nodes;
-    while( $var_start ne $var_end ) {
-	push( @variant_nodes, $var_start );
-	$var_start = $graph->next_word( $var_start, $var_label );
-    }
-    push( @variant_nodes, $var_end );
-
-    # Go through the variant nodes, and if we find a lemma node that
-    # hasn't yet been collapsed with a node, equate them.
-
-    foreach my $w ( @variant_nodes ) {
-	my $word = $w->label();
-	foreach my $l ( @lemma_nodes ) {
-	    if( $word eq cmp_str( $l ) ) {
-		next if exists( $collapsed{ $l->label } )
-		    && $collapsed{ $l->label } eq $l;
-		# Collapse the nodes.
-		printf STDERR "Merging nodes %s/%s and %s/%s\n", 
-		    $l->name, $l->label, $w->name, $w->label;
-		$graph->merge_nodes( $l, $w );
-		$collapsed{ $l->label } = $l;
-		# Now collapse any multiple edges to and from the node.
-		# Rely on the presence of the 'base text' edge.
-		remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
-		remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
-	    }
-	}
+sub apply_edits {
+    my( $collation, $edit_sequence, $debug ) = @_;
+    my @lemma_text = $collation->reading_sequence( $collation->start,
+                                           $collation->reading( '#END#' ) );
+    my $drift = 0;
+    foreach my $correction ( @$edit_sequence ) {
+        my( $lemma_start, $length, $items ) = @$correction;
+        my $offset = $base_text_index{$lemma_start};
+        my $realoffset = $offset + $drift;
+        if( $debug ||
+            $lemma_text[$realoffset]->name ne $lemma_start ) {
+            my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
+            my @base_phrase;
+            my $i = $realoffset;
+            my $l = $collation->reading( $lemma_start );
+            while( $i < $realoffset+$length ) {
+                push( @base_phrase, $l );
+                $l = $collation->next_reading( $l );
+                $i++;
+            }
+            
+            print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
+                                  "with %s (%s) with drift %d\n",
+                                  join( ' ', map {$_->label} @base_phrase ),
+                                  join( ' ', map {$_->name} @base_phrase ),
+                                  $realoffset,
+                                  join( ' ', map {$_->label} @$items ),
+                                  join( ' ', map {$_->name} @$items ),
+                                  $drift,
+                                  ) if $debug;
+                                  
+            if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+                warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
+                               "but %s (%s) is there instead", 
+                               join( ' ', map {$_->label} @base_phrase ),
+                               join( ' ', map {$_->name} @base_phrase ),
+                               join( ' ', map {$_->label} @$items ),
+                               join( ' ', map {$_->name} @$items ),
+                               join( ' ', map {$_->label} @this_phrase ),
+                               join( ' ', map {$_->name} @this_phrase ),
+                      ) );
+                # next;
+            }
+        }
+        splice( @lemma_text, $realoffset, $length, @$items );
+        $drift += @$items - $length;
     }
+    return @lemma_text;
 }
+        
 
-sub remove_duplicate_edges {
-    my( $graph, $from, $to ) = @_;
-    my @edges = $from->edges_to( $to );
-    if( scalar @edges > 1 ) {
-	my @base = grep { $_->label eq 'base text' } @edges;
-	if ( scalar @base ) {
-	    # Remove the edges that are not base.
-	    foreach my $e ( @edges ) {
-		$graph->del_edge( $e )
-		    unless $e eq $base[0];
-	    }
-	} else {
-	    # Combine the edges into one.
-	    my $new_edge_name = join( ', ', map { $_->label() } @edges );
-	    my $new_edge = shift @edges;
-	    $new_edge->set_attribute( 'label', $new_edge_name );
-	    foreach my $e ( @edges ) {
-		$graph->del_edge( $e );
-	    }
-	}
+# Helper function. Given a witness sigil, if it is a post-correctione
+# sigil,return the base witness.  If not, return a false value.
+sub _is_post_corr {
+    my( $sigil ) = @_;
+    if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
+        return $1;
     }
+    return undef;
 }
 
-# TODO need to make this configurable!
-sub cmp_str {
-    my( $node ) = @_;
-    my $word = $node->label();
-    $word = lc( $word );
-    $word =~ s/\W//g;
-    $word =~ s/v/u/g;
-    $word =~ s/j/i/g;
-    $word =~ s/cha/ca/g;
-    $word =~ s/quatuor/quattuor/g;
-    $word =~ s/ioannes/iohannes/g;
-    return $word;
-}
+
+=back
+
+=head1 LICENSE
+
+This package is free software and is provided "as is" without express
+or implied warranty.  You can redistribute it and/or modify it under
+the same terms as Perl itself.
+
+=head1 AUTHOR
+
+Tara L Andrews, aurum@cpan.org
+
+=cut
 
 1;