From: Tara L Andrews <tla\@mit.edu>
Date: Tue, 27 Dec 2011 01:07:16 +0000 (+0100)
Subject: various bugfixes, getting real traditions to parse
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=49d4f2accb29737fdbaa42f75062d2055c4bc2ef;p=scpubgit%2Fstemmatology.git

various bugfixes, getting real traditions to parse
---

diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 8609c96..17eed6a 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -135,7 +135,6 @@ around del_reading => sub {
 	if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) {
 		$arg = $arg->id;
 	}
-	
 	# Remove the reading from the graphs.
 	$self->sequence->delete_vertex( $arg );
 	$self->relations->delete_vertex( $arg );
@@ -151,7 +150,7 @@ sub merge_readings {
 
 	# We only need the IDs for adding paths to the graph, not the reading
 	# objects themselves.
-    my( $kept, $deleted ) = $self->_stringify_args( @_ );
+    my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ );
 
     # The kept reading should inherit the paths and the relationships
     # of the deleted reading.
@@ -159,6 +158,7 @@ sub merge_readings {
 		my @vector = ( $kept );
 		push( @vector, $path->[1] ) if $path->[0] eq $deleted;
 		unshift( @vector, $path->[0] ) if $path->[1] eq $deleted;
+		next if $vector[0] eq $vector[1]; # Don't add a self loop
 		my %wits = %{$self->sequence->get_edge_attributes( @$path )};
 		$self->sequence->add_edge( @vector );
 		my $fwits = $self->sequence->get_edge_attributes( @vector );
@@ -168,6 +168,7 @@ sub merge_readings {
 	foreach my $rel ( $self->relations->edges_at( $deleted ) ) {
 		my @vector = ( $kept );
 		push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] );
+		next if $vector[0] eq $vector[1]; # Don't add a self loop
 		# Is there a relationship here already? If so, keep it.
 		# TODO Warn about conflicting relationships
 		next if $self->relations->has_edge( @vector );
@@ -178,6 +179,12 @@ sub merge_readings {
 	}
 	
 	# Do the deletion deed.
+	if( $combine_char ) {
+		my $kept_obj = $self->reading( $kept );
+		my $new_text = join( $combine_char, $kept_obj->text, 
+			$self->reading( $deleted )->text );
+		$kept_obj->alter_text( $new_text );
+	}
 	$self->del_reading( $deleted );
 }
 
@@ -209,13 +216,20 @@ sub add_path {
 
 sub del_path {
 	my $self = shift;
+	my @args;
+	if( ref( $_[0] ) eq 'ARRAY' ) {
+		my $e = shift @_;
+		@args = ( @$e, @_ );
+	} else {
+		@args = @_;
+	}
 
 	# We only need the IDs for adding paths to the graph, not the reading
 	# objects themselves.
-    my( $source, $target, $wit ) = $self->_stringify_args( @_ );
+    my( $source, $target, $wit ) = $self->_stringify_args( @args );
 
 	if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) {
-		$self->sequence->del_edge_attribute( $source, $target, $wit );
+		$self->sequence->delete_edge_attribute( $source, $target, $wit );
 	}
 	unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) {
 		$self->sequence->delete_edge( $source, $target );
@@ -262,11 +276,11 @@ sub add_relationship {
 
 	# Check the options
 	if( !defined $options->{'type'} ||
-		$options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|repetition|transposition)$/i ) {
+		$options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lookalike|repetition|transposition)$/i ) {
 		my $t = $options->{'type'} ? $options->{'type'} : '';
-		return( undef, "Invalid or missing type" . $options->{'type'} );
+		return( undef, "Invalid or missing type " . $options->{'type'} );
 	}
-	if( $options->{'type'} =~ /^(spelling|orthographic|grammatical|meaning)$/ ) {
+	unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) {
 		$options->{'colocated'} = 1;
 	}
 	
@@ -305,10 +319,10 @@ sub relationship_valid {
 		# to a path loop for any witness.  First make a lookup table of all the
 		# readings related to either the source or the target.
 		my @proposed_related = ( $source, $target );
-		push( @proposed_related, $source->related_readings( 'colocated' ) );
-		push( @proposed_related, $target->related_readings( 'colocated' ) );
+		push( @proposed_related, $self->related_readings( $source, 'colocated' ) );
+		push( @proposed_related, $self->related_readings( $target, 'colocated' ) );
 		my %pr_ids;
-		map { $pr_ids{ $_->id } = 1 } @proposed_related;
+		map { $pr_ids{ $_ } = 1 } @proposed_related;
 	
 		# None of these proposed related readings should have a neighbor that
 		# is also in proposed_related.
@@ -784,7 +798,6 @@ sub reading_sequence {
         $seen{$n->id} = 1;
         
         my $next = $self->next_reading( $n, $witness, $backup );
-        $DB::single = 1 if $next->id eq $end->id;
         unless( $next ) {
             warn "Did not find any path for $witness from reading " . $n->id;
             last;
@@ -952,6 +965,7 @@ sub calculate_ranks {
         foreach my $n ( $self->sequence->successors( $r->id ) ) {
         	my( $tfrom, $tto ) = ( $rel_containers{$r->id},
         		$rel_containers{$n} );
+        	$DB::single = 1 unless $tfrom && $tto;
             $topo_graph->add_edge( $tfrom, $tto );
         }
     }
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index 9c30ea2..d0d0385 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -78,6 +78,7 @@ has 'text' => (
 	is => 'ro',
 	isa => 'Str',
 	required => 1,
+	writer => 'alter_text',
 	);
 
 has 'is_start' => (
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index 545ca6d..a5ab34a 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -89,7 +89,6 @@ sub merge_base {
     my @base_line_starts = read_base( $base_file, $collation );
 
     my %all_witnesses;
-    my @unwitnessed_lemma_nodes;
     foreach my $app ( @app_entries ) {
         my( $line, $num ) = split( /\./, $app->{_id} );
         # DEBUG with a short graph
@@ -113,12 +112,12 @@ sub merge_base {
         my %seen;
         while( $lemma_start ne $too_far ) {
             # Loop detection
-            if( $seen{ $lemma_start->name() } ) {
-                warn "Detected loop at " . $lemma_start->name() . 
+            if( $seen{ $lemma_start->id() } ) {
+                warn "Detected loop at " . $lemma_start->id() . 
                     ", ref $line,$num";
                 last;
             }
-            $seen{ $lemma_start->name() } = 1;
+            $seen{ $lemma_start->id() } = 1;
             
             # Try to match the lemma.
             my $unmatch = 0;
@@ -178,11 +177,6 @@ sub merge_base {
         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 
-            # Keep track of lemma nodes that don't actually appear in
-            # any MSS; we will want to remove them from the collation.
-            push( @unwitnessed_lemma_nodes, @lemma_set )
-                if !@mss && $k eq 'rdg_0';
-
             # Keep track of what witnesses we have seen.
             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
             # Keep track of which witnesses bear corrected readings here.
@@ -202,8 +196,9 @@ sub merge_base {
             my $ctr = 0;
             foreach my $vw ( @variant ) {
                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
-                my $vwreading = $collation->add_reading( $vwname );
-                $vwreading->text( $vw );
+                my $vwreading = $collation->add_reading( {
+                	'id' => $vwname,
+                	'text' => $vw } );
                 push( @variant_readings, $vwreading );
             }
 
@@ -226,7 +221,7 @@ sub merge_base {
         foreach my $rkey ( keys %$variant_objects ) {
             # Object is argument list for splice, so:
             # offset, length, replacements
-            my $edit_object = [ $lemma_start->name,
+            my $edit_object = [ $lemma_start->id,
                                 scalar( @lemma_set ),
                                 $variant_objects->{$rkey}->{reading} ];
             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
@@ -278,11 +273,7 @@ sub merge_base {
     # ones we have created so far.  Also remove any unwitnessed
     # lemma nodes (TODO unless we are treating base as witness)
     foreach ( $collation->paths() ) {
-        $collation->del_path( $_ );
-    }
-    foreach( @unwitnessed_lemma_nodes ) {
-        $collation->del_reading( $_ );
-        # TODO do we need to delete any relationship paths here?
+        $collation->del_path( $_, $collation->baselabel );
     }
 
     ### HACKY HACKY Do some one-off path corrections here.
@@ -291,6 +282,13 @@ sub merge_base {
 
     # Now walk paths and calculate positional rank.
     $collation->make_witness_paths();
+    # Now delete any orphaned readings.
+	foreach my $r ( $collation->sequence->isolated_vertices ) {
+		print STDERR "Deleting unconnected reading $r / " . 
+			$collation->reading( $r )->text . "\n";
+		$collation->del_reading( $r );
+	}
+	
     KUL::HACK::post_path_hack( $collation );
     # Have to check relationship validity at this point, because before that
     # we had no paths.
@@ -298,7 +296,7 @@ sub merge_base {
 #         next unless $rel->equal_rank;
 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
-#                             $rel->type, $rel->from->name, $rel->to->name );
+#                             $rel->type, $rel->from->id, $rel->to->id );
 #         }
 #     }
     $collation->calculate_ranks();
@@ -321,8 +319,8 @@ sub read_base {
     
     # This array gives the first reading for each line.  We put the
     # common starting point in line zero.
-    my $last_reading = $collation->start();
-    $base_text_index{$last_reading->name} = 0;
+    my $last_reading = $collation->start;
+    $base_text_index{$last_reading->id} = 0;
     my $lineref_array = [ $last_reading ]; # There is no line zero.
 
     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
@@ -340,8 +338,7 @@ sub read_base {
         last if $SHORTEND && $lineref > $SHORTEND;
         foreach my $w ( @words ) {
             my $readingref = join( ',', $lineref, ++$wordref );
-            my $reading = $collation->add_reading( $readingref );
-            $reading->text( $w );
+            my $reading = $collation->add_reading( { id => $readingref, text => $w } );
             unless( $started ) {
                 push( @$lineref_array, $reading );
                 $started = 1;
@@ -361,7 +358,7 @@ sub read_base {
     # Ending point for all texts
     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
     push( @$lineref_array, $collation->end );
-    $base_text_index{$collation->end->name} = $i;
+    $base_text_index{$collation->end->id} = $i;
 
     return( @$lineref_array );
 }
@@ -389,14 +386,14 @@ sub set_relationships {
                 $labels{cmp_str( $r )} = $r;
             }
             foreach my $r( @$var ) {
-                if( exists $labels{$r->label} &&
-                    $r->name ne $labels{$r->label}->name ) {
+                if( exists $labels{$r->text} &&
+                    $r->id ne $labels{$r->text}->id ) {
                     if( $type eq 'repetition' ) {
                         # Repetition
-                        $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+                        $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
                     } else {
                         # Transposition
-                        $r->set_identical( $labels{$r->label} );
+                        $r->set_identical( $labels{$r->text} );
                     }
                 }
             }
@@ -437,15 +434,15 @@ sub set_relationships {
 
 sub apply_edits {
     my( $collation, $edit_sequence, $debug ) = @_;
-    my @lemma_text = $collation->reading_sequence( $collation->start,
-                                           $collation->reading( '#END#' ) );
+    my @lemma_text = $collation->reading_sequence( 
+    	$collation->start, $collation->end );
     my $drift = 0;
     foreach my $correction ( @$edit_sequence ) {
         my( $lemma_start, $length, $items ) = @$correction;
         my $offset = $base_text_index{$lemma_start};
         my $realoffset = $offset + $drift;
         if( $debug ||
-            $lemma_text[$realoffset]->name ne $lemma_start ) {
+            $lemma_text[$realoffset]->id ne $lemma_start ) {
             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
             my @base_phrase;
             my $i = $realoffset;
@@ -458,23 +455,23 @@ sub apply_edits {
             
             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
                                   "with %s (%s) with drift %d\n",
-                                  join( ' ', map {$_->label} @base_phrase ),
-                                  join( ' ', map {$_->name} @base_phrase ),
+                                  join( ' ', map {$_->text} @base_phrase ),
+                                  join( ' ', map {$_->id} @base_phrase ),
                                   $realoffset,
-                                  join( ' ', map {$_->label} @$items ),
-                                  join( ' ', map {$_->name} @$items ),
+                                  join( ' ', map {$_->text} @$items ),
+                                  join( ' ', map {$_->id} @$items ),
                                   $drift,
                                   ) if $debug;
                                   
-            if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+            if( $lemma_text[$realoffset]->id ne $lemma_start ) {
                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
                                "but %s (%s) is there instead", 
-                               join( ' ', map {$_->label} @base_phrase ),
-                               join( ' ', map {$_->name} @base_phrase ),
-                               join( ' ', map {$_->label} @$items ),
-                               join( ' ', map {$_->name} @$items ),
-                               join( ' ', map {$_->label} @this_phrase ),
-                               join( ' ', map {$_->name} @this_phrase ),
+                               join( ' ', map {$_->text} @base_phrase ),
+                               join( ' ', map {$_->id} @base_phrase ),
+                               join( ' ', map {$_->text} @$items ),
+                               join( ' ', map {$_->id} @$items ),
+                               join( ' ', map {$_->text} @this_phrase ),
+                               join( ' ', map {$_->id} @this_phrase ),
                       ) );
                 # next;
             }
diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm
index 7123d4d..6618a73 100644
--- a/lib/Text/Tradition/Parser/CollateX.pm
+++ b/lib/Text/Tradition/Parser/CollateX.pm
@@ -92,7 +92,6 @@ sub parse {
         }
         my %node_data = %$n;
         my $gnode_args = { 
-        	'collation' => $collation,
         	'id' => delete $node_data{$IDKEY},
         	'text' => delete $node_data{$CONTENTKEY},
         };
diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm
index 62f9f1f..a1725d6 100644
--- a/lib/Text/Tradition/Parser/Self.pm
+++ b/lib/Text/Tradition/Parser/Self.pm
@@ -138,7 +138,6 @@ sub parse {
     my $use_version;
     print STDERR "Setting graph globals\n";
     $tradition->name( $graph_data->{'name'} );
-	$DB::single = 1;
     foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
 		my $val = $graph_data->{'global'}->{$gkey};
 		if( $gkey eq 'version' ) {
diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm
index fa29702..1e64680 100644
--- a/lib/Text/Tradition/Parser/TEI.pm
+++ b/lib/Text/Tradition/Parser/TEI.pm
@@ -364,7 +364,6 @@ sub _return_rdg {
                 foreach my $sig ( keys %$text ) {
                     next if $active_wits{$sig};
                     my $l = $tradition->collation->add_reading( {
-                    	'collation' => $tradition->collation,
                     	'id' => $current_app . "_$i",
                     	'is_lacuna' => 1 } );
                     $i++;
@@ -380,7 +379,6 @@ sub _return_rdg {
                 foreach my $i ( 0 .. $#cur_wits ) {
                     my $w = $cur_wits[$i];
                     my $l = $tradition->collation->add_reading( {
-                    	'collation' => $tradition->collation,
                     	'id' => $current_app . "_$i",
                     	'is_lacuna' => 1 } );
                     push( @{$text->{$w}}, $l );
@@ -472,8 +470,7 @@ sub _get_sigla {
             }
         }
         my $rdg = $graph->add_reading(
-        	{ 'collation' => $graph,
-        	  'id' => $xml_id,
+        	{ 'id' => $xml_id,
         	  'text' => $word }
         	);
         $used_nodeids{$xml_id} = $rdg;
diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm
index 4c1e511..b70285f 100644
--- a/lib/Text/Tradition/Parser/Tabular.pm
+++ b/lib/Text/Tradition/Parser/Tabular.pm
@@ -157,7 +157,6 @@ sub parse {
                 		$l = $c->reading( $l_id );
                 	} else {
                     	$l = $c->add_reading( {
-							'collation' => $c,
 							'id' => $l_id,
 							'is_lacuna' => 1,
 							} );
@@ -198,7 +197,6 @@ sub make_nodes {
     my $ctr = 1;
     foreach my $w ( keys %unique ) {
     	my $rargs = {
-    		'collation' => $collation,
     		'id' => "$index,$ctr",
     		'rank' => $index,
     		'text' => $w,