From: tla Date: Tue, 17 May 2011 15:03:12 +0000 (+0200) Subject: BROKEN - making lemmatization work X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3a1f25236a16c5505e0395baec4f085ccd6b2ab1;p=scpubgit%2Fstemmatology.git BROKEN - making lemmatization work --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 92673be..36c74ee 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -26,7 +26,6 @@ sub BUILD { my( $self, $init_args ) = @_; print STDERR "Calling tradition build\n"; - $DB::single = 1; if( exists $init_args->{'witnesses'} ) { # We got passed an uncollated list of witnesses. Make a # witness object for each witness, and then send them to the diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index fd5bda5..063d413 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -48,6 +48,14 @@ has 'graphml' => ( predicate => 'has_graphml', ); +# Keeps track of the lemmas within the collation. At most one lemma +# per position in the graph. +has 'lemmata' => ( + is => 'ro', + isa => 'HashRef[Maybe[Str]]', + default => sub { {} }, + ); + has 'wit_list_separator' => ( is => 'rw', isa => 'Str', @@ -315,7 +323,7 @@ sub walk_witness_paths { # TODO This method is going to fall down if we have a very gappy # text in the collation. my $paths = {}; - my @common_nodes; + my @common_readings; foreach my $wit ( @{$self->tradition->witnesses} ) { my $curr_reading = $self->start; my @wit_path = ( $curr_reading ); @@ -334,26 +342,28 @@ sub walk_witness_paths { $curr_reading = $next_reading; } $wit->path( \@wit_path ); - if( @common_nodes ) { + if( @common_readings ) { my @cn; foreach my $n ( @wit_path ) { - push( @cn, $n ) if grep { $_ eq $n } @common_nodes; + push( @cn, $n ) if grep { $_ eq $n } @common_readings; } - @common_nodes = (); - push( @common_nodes, @cn ); + @common_readings = (); + push( @common_readings, @cn ); } else { - push( @common_nodes, @wit_path ); + push( @common_readings, @wit_path ); } } # Mark all the nodes as either common or not. - foreach my $cn ( @common_nodes ) { + foreach my $cn ( @common_readings ) { print STDERR "Setting " . $cn->name . " as common node\n"; $cn->make_common; } foreach my $n ( $self->readings() ) { $n->make_variant unless $n->is_common; } + # Return an array of the common nodes in order. + return @common_readings; } sub common_readings { @@ -365,14 +375,14 @@ sub common_readings { # Calculate the relative positions of nodes in the graph, if they # were not given to us. sub calculate_positions { - my $self = shift; + my( $self, @ordered_common ) = @_; # We have to calculate the position identifiers for each word, # keyed on the common nodes. This will be 'fun'. The end result # is a hash per witness, whose key is the word node and whose # value is its position in the text. Common nodes are always N,1 # so have identical positions in each text. - my @common = $self->common_readings(); + $DB::single = 1; my $node_pos = {}; foreach my $wit ( @{$self->tradition->witnesses} ) { @@ -384,7 +394,7 @@ sub calculate_positions { my $cn = 0; # We should hit the common readings in order. my $row = []; foreach my $wn ( @{$wit->path} ) { - if( $wn eq $common[$cn] ) { + if( $wn eq $ordered_common[$cn] ) { # Set up to look for the next common node, and # start a new row of words. $cn++; @@ -416,6 +426,8 @@ sub calculate_positions { } } } + + $self->init_lemmata(); } sub _cmp_position { @@ -428,6 +440,105 @@ sub _cmp_position { # else return $pos_a[1] <=> $pos_b[1]; } + +sub all_positions { + my $self = shift; + my %positions = (); + map { $positions{$_->position} = 1 } $self->readings; + return keys( %positions ); +} + +sub readings_at_position { + my( $self, $pos ) = @_; + my @answer = grep { $_->position eq $pos } $self->readings; + return @answer; +} + +## Lemmatizer functions + +sub init_lemmata { + my $self = shift; + + foreach my $position ( $self->all_positions ) { + $self->lemmata->{$position} = undef; + } + + foreach my $cr ( $self->common_readings ) { + $self->lemmata->{$cr->position} = $cr->name; + } +} + +=item B + +my @state = $graph->lemma_readings( @readings_delemmatized ); + +Takes a list of readings that have just been delemmatized, and returns +a set of tuples of the form ['reading', 'state'] that indicates what +changes need to be made to the graph. + +=over + +=item * + +A state of 1 means 'lemmatize this reading' + +=item * + +A state of 0 means 'delemmatize this reading' + +=item * + +A state of undef means 'an ellipsis belongs in the text here because +no decision has been made / an earlier decision was backed out' + +=back + +=cut + +sub lemma_readings { + my( $self, @toggled_off_nodes ) = @_; + + # First get the positions of those nodes which have been + # toggled off. + my $positions_off = {}; + map { $positions_off->{ $_->position } = $_->name } @toggled_off_nodes; + + + # Now for each position, we have to see if a node is on, and we + # have to see if a node has been turned off. + my @answer; + foreach my $pos ( $self->all_positions() ) { + # Find the state of this position. If there is an active node, + # its name will be the state; otherwise the state will be 0 + # (nothing at this position) or undef (ellipsis at this position) + my $active = $self->lemmata->{$pos}; + + # Is there a formerly active node that was toggled off? + if( exists( $positions_off->{$pos} ) ) { + my $off_node = $positions_off->{$pos}; + if( $active && $active ne $off_node) { + push( @answer, [ $off_node, 0 ], [ $active, 1 ] ); + } else { + push( @answer, [ $off_node, $active ] ); + } + + # No formerly active node, so we just see if there is a currently + # active one. + } elsif( $active ) { + # Push the active node, whatever it is. + push( @answer, [ $active, 1 ] ); + } else { + # Push the state that is there. Arbitrarily use the first node + # at that position. + my @pos_nodes = $self->readings_at_position( $pos ); + push( @answer, [ $pos_nodes[0], $self->lemmata->{$pos} ] ); + } + } + + return @answer; +} + + # Return the string that joins together a list of witnesses for # display on a single path. diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index 63e8deb..5c6244c 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -138,12 +138,11 @@ sub parse { } } - $collation->walk_witness_paths( $end_node ); + my @common_nodes = $collation->walk_witness_paths( $end_node ); # Now we have added the witnesses and their paths, so have also # implicitly marked the common nodes. Now we can calculate their # explicit permissions. This is separate because it won't always # be necessary with the GraphML parsing. - $collation->calculate_positions() unless $has_explicit_positions; if( $has_explicit_positions ) { # Record the positions that came with each graph node. # TODO we really need to translate these into our own style of @@ -154,7 +153,7 @@ sub parse { } } else { # Calculate a position for each graph node. - $collation->calculate_positions(); + $collation->calculate_positions( @common_nodes ); } } diff --git a/t/graph.t b/t/graph.t index 0c1ce26..08566c1 100644 --- a/t/graph.t +++ b/t/graph.t @@ -31,16 +31,13 @@ is( scalar @svg_nodes, 24, "Correct number of nodes in the graph" ); my @svg_edges = $svg_xpc->findnodes( '//svg:g[@class="edge"]' ); is( scalar @svg_edges, 30, "Correct number of edges in the graph" ); -__END__ - # Test for the correct common nodes my @expected_nodes = map { [ $_, 1 ] } qw/ #START# n1 n5 n6 n7 n12 n13 n16 n19 n20 n23 n27 /; foreach my $idx ( qw/2 3 4 8 11 13 16 18/ ) { splice( @expected_nodes, $idx, 0, [ "node_null", undef ] ); } -my @active_nodes = $collation->active_nodes(); -# is_deeply( \@active_nodes, \@expected_nodes, "Initial common points" ); +my @active_nodes = $collation->lemma_readings(); subtest 'Initial common points' => \&compare_active; my $string = '# when ... ... ... showers sweet with ... fruit the ... of ... has pierced ... the ... #'; is( make_text( @active_nodes ), $string, "Got the right starting text" ); @@ -63,7 +60,7 @@ sub make_text { my @words; foreach my $n ( @_ ) { if( $n->[1] ) { - push( @words, $collation->text_of_node( $n->[0] ) ); + push( @words, $collation->reading( $n->[0] )->label ); } elsif ( !defined $n->[1] ) { push( @words, '...' ); } @@ -71,6 +68,8 @@ sub make_text { return join( ' ', @words ); } +__END__ + # Test the manuscript paths my $wit_a = '# when april with his showers sweet with fruit the drought of march has pierced unto the root #'; my $wit_b = '# when showers sweet with april fruit the march of drought has pierced to the root #';