From: tla <tla@mit.edu>
Date: Tue, 17 May 2011 15:03:12 +0000 (+0200)
Subject: BROKEN - making lemmatization work
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3a1f25236a16c5505e0395baec4f085ccd6b2ab1;p=scpubgit%2Fstemmatology.git

BROKEN - making lemmatization work
---

diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm
index 92673be..36c74ee 100644
--- a/lib/Text/Tradition.pm
+++ b/lib/Text/Tradition.pm
@@ -26,7 +26,6 @@ sub BUILD {
     my( $self, $init_args ) = @_;
     print STDERR "Calling tradition build\n";
 
-    $DB::single = 1;
     if( exists $init_args->{'witnesses'} ) {
 	# We got passed an uncollated list of witnesses.  Make a
 	# witness object for each witness, and then send them to the
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index fd5bda5..063d413 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -48,6 +48,14 @@ has 'graphml' => (
     predicate => 'has_graphml',
     );
 
+# Keeps track of the lemmas within the collation.  At most one lemma
+# per position in the graph.
+has 'lemmata' => (
+    is => 'ro',
+    isa => 'HashRef[Maybe[Str]]',
+    default => sub { {} },
+    );
+
 has 'wit_list_separator' => (
 			     is => 'rw',
 			     isa => 'Str',
@@ -315,7 +323,7 @@ sub walk_witness_paths {
     # TODO This method is going to fall down if we have a very gappy 
     # text in the collation.
     my $paths = {};
-    my @common_nodes;
+    my @common_readings;
     foreach my $wit ( @{$self->tradition->witnesses} ) {
 	my $curr_reading = $self->start;
 	my @wit_path = ( $curr_reading );
@@ -334,26 +342,28 @@ sub walk_witness_paths {
 	    $curr_reading = $next_reading;
 	}
 	$wit->path( \@wit_path );
-	if( @common_nodes ) {
+	if( @common_readings ) {
 	    my @cn;
 	    foreach my $n ( @wit_path ) {
-		push( @cn, $n ) if grep { $_ eq $n } @common_nodes;
+		push( @cn, $n ) if grep { $_ eq $n } @common_readings;
 	    }
-	    @common_nodes = ();
-	    push( @common_nodes, @cn );
+	    @common_readings = ();
+	    push( @common_readings, @cn );
 	} else {
-	    push( @common_nodes, @wit_path );
+	    push( @common_readings, @wit_path );
 	}
     }
 
     # Mark all the nodes as either common or not.
-    foreach my $cn ( @common_nodes ) {
+    foreach my $cn ( @common_readings ) {
 	print STDERR "Setting " . $cn->name . " as common node\n";
 	$cn->make_common;
     }
     foreach my $n ( $self->readings() ) {
 	$n->make_variant unless $n->is_common;
     }
+    # Return an array of the common nodes in order.
+    return @common_readings;
 }
 
 sub common_readings {
@@ -365,14 +375,14 @@ sub common_readings {
 # Calculate the relative positions of nodes in the graph, if they
 # were not given to us.
 sub calculate_positions {
-    my $self = shift;
+    my( $self, @ordered_common ) = @_;
 
     # We have to calculate the position identifiers for each word,
     # keyed on the common nodes.  This will be 'fun'.  The end result
     # is a hash per witness, whose key is the word node and whose
     # value is its position in the text.  Common nodes are always N,1
     # so have identical positions in each text.
-    my @common = $self->common_readings();
+    $DB::single = 1;
 
     my $node_pos = {};
     foreach my $wit ( @{$self->tradition->witnesses} ) {
@@ -384,7 +394,7 @@ sub calculate_positions {
 	my $cn = 0;  # We should hit the common readings in order.
 	my $row = [];
 	foreach my $wn ( @{$wit->path} ) {
-	    if( $wn eq $common[$cn] ) {
+	    if( $wn eq $ordered_common[$cn] ) {
 		# Set up to look for the next common node, and
 		# start a new row of words.
 		$cn++;
@@ -416,6 +426,8 @@ sub calculate_positions {
 	    }
 	}
     }
+
+    $self->init_lemmata();
 }
 
 sub _cmp_position {
@@ -428,6 +440,105 @@ sub _cmp_position {
     # else 
     return $pos_a[1] <=> $pos_b[1];
 }
+
+sub all_positions {
+    my $self = shift;
+    my %positions = ();
+    map { $positions{$_->position} = 1 } $self->readings;
+    return keys( %positions );
+}
+
+sub readings_at_position {
+    my( $self, $pos ) = @_;
+    my @answer = grep { $_->position eq $pos } $self->readings;
+    return @answer;
+}
+
+## Lemmatizer functions
+
+sub init_lemmata {
+    my $self = shift;
+    
+    foreach my $position ( $self->all_positions ) {
+	$self->lemmata->{$position} = undef;
+    }
+
+    foreach my $cr ( $self->common_readings ) {
+	$self->lemmata->{$cr->position} = $cr->name;
+    }
+}
+    
+=item B<lemma_readings>
+
+my @state = $graph->lemma_readings( @readings_delemmatized );
+
+Takes a list of readings that have just been delemmatized, and returns
+a set of tuples of the form ['reading', 'state'] that indicates what
+changes need to be made to the graph.
+
+=over
+
+=item * 
+
+A state of 1 means 'lemmatize this reading'
+
+=item * 
+
+A state of 0 means 'delemmatize this reading'
+
+=item * 
+
+A state of undef means 'an ellipsis belongs in the text here because
+no decision has been made / an earlier decision was backed out'
+
+=back
+
+=cut
+
+sub lemma_readings {
+    my( $self, @toggled_off_nodes ) = @_;
+
+    # First get the positions of those nodes which have been
+    # toggled off.
+    my $positions_off = {};
+    map { $positions_off->{ $_->position } = $_->name } @toggled_off_nodes;
+ 
+    
+    # Now for each position, we have to see if a node is on, and we
+    # have to see if a node has been turned off.
+    my @answer;
+    foreach my $pos ( $self->all_positions() ) {
+	# Find the state of this position.  If there is an active node,
+	# its name will be the state; otherwise the state will be 0 
+	# (nothing at this position) or undef (ellipsis at this position)
+	my $active = $self->lemmata->{$pos};
+	
+	# Is there a formerly active node that was toggled off?
+	if( exists( $positions_off->{$pos} ) ) {
+	    my $off_node = $positions_off->{$pos};
+	    if( $active && $active ne $off_node) {
+		push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
+	    } else {
+		push( @answer, [ $off_node, $active ] );
+	    }
+
+	# No formerly active node, so we just see if there is a currently
+	# active one.
+	} elsif( $active ) {
+	    # Push the active node, whatever it is.
+	    push( @answer, [ $active, 1 ] );
+	} else {
+	    # Push the state that is there. Arbitrarily use the first node
+	    # at that position.
+	    my @pos_nodes = $self->readings_at_position( $pos );
+	    push( @answer, [ $pos_nodes[0], $self->lemmata->{$pos} ] );
+	}
+    }
+    
+    return @answer;
+}
+
+
  
 # Return the string that joins together a list of witnesses for
 # display on a single path.
diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm
index 63e8deb..5c6244c 100644
--- a/lib/Text/Tradition/Parser/GraphML.pm
+++ b/lib/Text/Tradition/Parser/GraphML.pm
@@ -138,12 +138,11 @@ sub parse {
 	}
     }
 
-    $collation->walk_witness_paths( $end_node );
+    my @common_nodes = $collation->walk_witness_paths( $end_node );
     # Now we have added the witnesses and their paths, so have also
     # implicitly marked the common nodes. Now we can calculate their
     # explicit permissions.  This is separate because it won't always
     # be necessary with the GraphML parsing.
-    $collation->calculate_positions() unless $has_explicit_positions;
     if( $has_explicit_positions ) {
 	# Record the positions that came with each graph node.
 	# TODO we really need to translate these into our own style of
@@ -154,7 +153,7 @@ sub parse {
 	}
     } else {
 	# Calculate a position for each graph node.
-	$collation->calculate_positions();
+	$collation->calculate_positions( @common_nodes );
     }
 }
 
diff --git a/t/graph.t b/t/graph.t
index 0c1ce26..08566c1 100644
--- a/t/graph.t
+++ b/t/graph.t
@@ -31,16 +31,13 @@ is( scalar @svg_nodes, 24, "Correct number of nodes in the graph" );
 my @svg_edges = $svg_xpc->findnodes( '//svg:g[@class="edge"]' );
 is( scalar @svg_edges, 30, "Correct number of edges in the graph" );
 
-__END__
-
 # Test for the correct common nodes
 my @expected_nodes = map { [ $_, 1 ] } qw/ #START# n1 n5 n6 n7 n12 n13
                                             n16 n19 n20 n23 n27 /;
 foreach my $idx ( qw/2 3 4 8 11 13 16 18/ ) {
     splice( @expected_nodes, $idx, 0, [ "node_null", undef ] );
 }
-my @active_nodes = $collation->active_nodes();
-# is_deeply( \@active_nodes, \@expected_nodes, "Initial common points" );
+my @active_nodes = $collation->lemma_readings();
 subtest 'Initial common points' => \&compare_active;
 my $string = '# when ... ... ... showers sweet with ... fruit the ... of ... has pierced ... the ... #';
 is( make_text( @active_nodes ), $string, "Got the right starting text" );
@@ -63,7 +60,7 @@ sub make_text {
     my @words;
     foreach my $n ( @_ ) {
 	if( $n->[1] ) {
-	    push( @words, $collation->text_of_node( $n->[0] ) );
+	    push( @words, $collation->reading( $n->[0] )->label );
 	} elsif ( !defined $n->[1] ) {
 	    push( @words, '...' );
 	}
@@ -71,6 +68,8 @@ sub make_text {
     return join( ' ', @words );
 }
 
+__END__
+
 # Test the manuscript paths
 my $wit_a = '# when april with his showers sweet with fruit the drought of march has pierced unto the root #';
 my $wit_b = '# when showers sweet with april fruit the march of drought has pierced to the root #';