From: Tara L Andrews Date: Wed, 18 May 2011 21:53:34 +0000 (+0200) Subject: CHECKPOINT working on base text collation, need to fix path loops X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=7854e12eaa27e999633ccfffd5e1fc39c006562a;p=scpubgit%2Fstemmatology.git CHECKPOINT working on base text collation, need to fix path loops --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index aa905d2..ca3f9c1 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -57,10 +57,16 @@ has 'lemmata' => ( ); has 'wit_list_separator' => ( - is => 'rw', - isa => 'Str', - default => ', ', - ); + is => 'rw', + isa => 'Str', + default => ', ', + ); + +has 'baselabel' => ( + is => 'rw', + isa => 'Str', + default => 'base text', + ); # The collation can be created two ways: # 1. Collate a set of witnesses (with CollateX I guess) and process @@ -86,7 +92,33 @@ sub BUILD { $self->graph->set_attribute( 'node', 'shape', $shape ); } -# Wrappes around merge_nodes +# Wrapper around add_path + +around add_path => sub { + my $orig = shift; + my $self = shift; + + # Make sure there are three arguments + unless( @_ == 3 ) { + warn "Call add_path with args source, target, witness"; + return; + } + # Make sure the proposed path does not yet exist + my( $source, $target, $wit ) = @_; + $source = $self->reading( $source ) + unless ref( $source ) eq 'Text::Tradition::Collation::Reading'; + $target = $self->reading( $target ) + unless ref( $target ) eq 'Text::Tradition::Collation::Reading'; + foreach my $path ( $source->edges_to( $target ) ) { + if( $path->label eq $wit ) { + return; + } + } + # Do the deed + $self->$orig( @_ ); +}; + +# Wrapper around merge_nodes sub merge_readings { my $self = shift; @@ -367,7 +399,7 @@ sub _find_linked_reading { # Some set logic. sub _is_within { my( $set1, $set2 ) = @_; - my $ret = 1; + my $ret = @$set1; # will be 0, i.e. false, if set1 is empty foreach my $el ( @$set1 ) { $ret = 0 unless grep { /^\Q$el\E$/ } @$set2; } @@ -429,35 +461,73 @@ sub walk_and_expand_base { my( $self, $end ) = @_; foreach my $wit ( @{$self->tradition->witnesses} ) { - my $sig = $wit_sigil; + my $sig = $wit->sigil; my $post_sig; $post_sig = $wit->post_correctione if $wit->has_post_correctione; - my @wit_path = ( $self->start ); - my @wit_pc_path; - my $curr_rdg = $self->start; - my %seen; - while( $curr_rdg ne $end ) { - if( $seen{$curr_reading->name} ) { - warn "Detected loop in walk_and_expand_base with witness " - . "$sig on reading " . $curr_reading->name . "\n"; - last; + my @wit_path = $self->reading_sequence( $self->start, $end, $sig ); + $wit->path( \@wit_path ); + $self->connect_readings_for_witness( $wit ); + + # If there is a post-correctio, get its path and compare. + # Add a correction range for each divergence. + if( $post_sig ) { + # TODO this is looping + my @corr_wit_path = $self->reading_sequence( $self->start, $end, + "$sig$post_sig", $sig ); + + # Map ante-corr readings to their indices + my %in_orig; + my $i = 0; + map { $in_orig{$_->name} = $i++ } @wit_path; + + # Look for divergences + my $diverged = 0; + my $last_common; + my @correction; + foreach my $rdg ( @corr_wit_path ) { + if( exists( $in_orig{$rdg->name} ) && !$diverged ) { + # We are reading the same here + $last_common = $in_orig{$rdg->name}; + next; + } elsif ( exists( $in_orig{$rdg->name} ) ) { + # We have been diverging but are reading the same again. + # Add the correction to the witness. + my $offset = $last_common; + my $length = $in_orig{$rdg->name} - $last_common; + $wit->add_correction( $offset, $length, \@correction ); + $diverged = 0; + @correction = (); + $last_common = $in_orig{$rdg->name}; + } elsif( $diverged ) { + # We are in the middle of a divergence. + push( @correction, $rdg ); + } else { + # We have started to diverge. Note it. + $diverged = 1; + push( @correction, $rdg ); + } } - my $next_rdg = $self->next_reading( $curr_reading, $sig ); - unless( $self->has_explicit_path( $curr_reading, - $next_reading, $sig ) ) { - $self->add_path( $curr_reading, $next_reading, $sig ); + # Add any divergence that is at the end of the line + if( $diverged ) { + $wit->add_correction( $last_common, $#wit_path, \@correction ); } - push( @wit_path, $next_reading ); - $seen{$curr_reading->name} = 1; } - $wit->path( \@wit_path ); + } - # Now go through this path and look for p.c. divergences. - # TODO decide how to handle p.c. paths - # BIG TODO handle case where p.c. follows the base and a.c. doesn't! - - + # Remove any 'base text' paths. + foreach my $path ( $self->paths ) { + $self->del_path( $path ) + if $path->label eq $self->baselabel; + } +} + +sub connect_readings_for_witness { + my( $self, $wit ) = @_; + my @chain = @{$wit->path}; + foreach my $idx ( 0 .. $#chain-1 ) { + $self->add_path( $chain[$idx], $chain[$idx+1], $wit->sigil ); + } } sub common_readings { diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index d7d090b..93ed1a3 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -85,7 +85,7 @@ sub merge_base { # DEBUG with a short graph # last if $line > 2; # DEBUG for problematic entries - my $scrutinize = ""; + my $scrutinize = "7.3"; my $first_line_reading = $base_line_starts[ $line ]; my $too_far = $base_line_starts[ $line+1 ]; @@ -168,9 +168,20 @@ sub merge_base { # and connect them to the anchor. Edges are named after the mss # that are relevant. foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { - next if $k eq 'rdg_0'; # that's the lemma. - # TODO look at the lemma for any p.c. readings, and add - # them explicitly! + if( $k eq 'rdg_0' ) { # that's the lemma + # The lemma is already in the graph, but we need to look for + # any explicit post-correctione readings and add the + # relevant path. + my @mss = grep { $app->{$_} eq $k } keys( %$app ); + foreach my $m ( @mss ) { + my $base = _is_post_corr( $m ); + next unless $base; + my @lem = $collation->reading_sequence( $lemma_start, $lemma_end ); + foreach my $i ( 0 .. $#lem-1 ) { + $collation->add_path( $lem[$i], $lem[$i++], $m ); + } + } + } my @variant = split( /\s+/, $app->{$k} ); @variant = () if $app->{$k} eq '/'; # This is an omission. my @mss = grep { $app->{$_} eq $k } keys( %$app ); @@ -201,7 +212,7 @@ sub merge_base { # Now hook it up at the end. foreach ( @mss ) { $collation->add_path( $last_reading, - $collation->next_word( $lemma_end ), + $collation->next_reading( $lemma_end ), $_ ); } @@ -221,11 +232,11 @@ sub merge_base { my $pctag = substr( $w, length( $base ) ); my $existing_wit = $collation->tradition->witness( $base ); unless( $existing_wit ) { - $existing_wit = $collation->tradition->add_witness( $base ); + $existing_wit = $collation->tradition->add_witness( sigil => $base ); } $existing_wit->post_correctione( $pctag ); } else { - $collation->tradition->add_witness( $w ) + $collation->tradition->add_witness( sigil => $w ) unless $collation->tradition->witness( $w ); } } @@ -312,19 +323,15 @@ sub collate_variants { my( $collation, @readings ) = @_; my $lemma_start = shift @readings; my $lemma_end = shift @readings; - my $detranspose = 0; - - # We need to calculate positions at this point, which is where - # we are getting the implicit information from the apparatus. + my $detranspose = 1; # Start the list of distinct readings with those readings in the lemma. my @distinct_readings; - my $position = 0; while( $lemma_start ne $lemma_end ) { - push( @distinct_readings, [ $lemma_start, 'base text', $position++ ] ); - $lemma_start = $collation->next_word( $lemma_start ); + push( @distinct_readings, [ $lemma_start, 'base text' ] ); + $lemma_start = $collation->next_reading( $lemma_start ); } - push( @distinct_readings, [ $lemma_end, 'base text', $position++ ] ); + push( @distinct_readings, [ $lemma_end, 'base text' ] ); while( scalar @readings ) { @@ -336,21 +343,23 @@ sub collate_variants { # word from the current reading. my %collapsed = (); - # Get the label. There will only be one outgoing path to start - # with, so this is safe. - my @out = $var_start->outgoing(); - my $var_label = $out[0]->label(); + # Get the variant witnesses. They will all be going along the + # same path, so just use the first one as representative for + # the purpose of following the path. + my @var_wits = map { $_->label } $var_start->outgoing(); + my $rep_wit = $var_wits[0]; my @variant_readings; while( $var_start ne $var_end ) { push( @variant_readings, $var_start ); - $var_start = $collation->next_word( $var_start, $var_label ); + $var_start = $collation->next_reading( $var_start, $rep_wit ); } push( @variant_readings, $var_end ); # Go through the variant readings, and if we find a lemma reading that # hasn't yet been collapsed with a reading, equate them. If we do # not, keep them to push onto the end of all_readings. + # TODO replace this with proper mini-collation my @remaining_readings; my $last_index = 0; my $curr_pos = 0; @@ -358,7 +367,7 @@ sub collate_variants { my $word = $w->label(); my $matched = 0; foreach my $idx ( $last_index .. $#distinct_readings ) { - my( $l, $pathlabel, $pos ) = @{$distinct_readings[$idx]}; + my( $l, $pathlabel ) = @{$distinct_readings[$idx]}; if( $word eq cmp_str( $l ) ) { next if exists( $collapsed{ $l->label } ) && $collapsed{ $l->label } eq $l; @@ -371,24 +380,16 @@ sub collate_variants { $collapsed{ $l->label } = $l; # Now collapse any multiple paths to and from the reading. remove_duplicate_paths( $collation, - $collation->prior_word( $l, $pathlabel ), $l ); + $collation->prior_reading( $l, $rep_wit ), $l ); remove_duplicate_paths( $collation, $l, - $collation->next_word( $l, $pathlabel ) ); - $curr_pos = $pos; + $collation->next_reading( $l, $rep_wit ) ); last; } } - push( @remaining_readings, [ $w, $var_label, $curr_pos++ ] ) unless $matched; + push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched; } push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings ); } - - # Now set the positions of all the readings in this variation. - #$DB::single = 1; - print STDERR "Readings and their positions are:\n"; - foreach my $n ( @distinct_readings ) { - printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2]; - } } =item B @@ -396,29 +397,19 @@ sub collate_variants { remove_duplicate_paths( $collation, $from, $to ); Given two readings, reduce the number of paths between those readings to -one. If neither path represents a base text, combine their labels. +a set of unique paths. =cut +# TODO wonder if this is necessary sub remove_duplicate_paths { my( $collation, $from, $to ) = @_; - my @paths = $from->paths_to( $to ); - if( scalar @paths > 1 ) { - my @base = grep { $_->label eq 'base text' } @paths; - if ( scalar @base ) { - # Remove the paths that are not base. - foreach my $e ( @paths ) { - $collation->del_path( $e ) - unless $e eq $base[0]; - } + my %seen_paths; + foreach my $p ( $from->edges_to( $to ) ) { + if( exists $seen_paths{$p->name} ) { + $collation->del_path( $p ); } else { - # Combine the paths into one. - my $new_path_name = join( ', ', map { $_->label() } @paths ); - my $new_path = shift @paths; - $new_path->set_attribute( 'label', $new_path_name ); - foreach my $e ( @paths ) { - $collation->del_path( $e ); - } + $seen_paths{$p->name} = 1; } } } diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index d48a350..9462f34 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -1,5 +1,6 @@ package Text::Tradition::Witness; use Moose; +use Moose::Util::TypeConstraints; # Sigil. Required identifier for a witness. has 'sigil' => ( @@ -36,6 +37,21 @@ has 'post_correctione' => ( isa => 'Str', predicate => 'has_post_correctione', ); + +subtype 'Correction', + as 'ArrayRef', + where { @{$_} == 3 && + $_->[0]->isa( 'Int' ) && + $_->[1]->isa( 'Int' ) && + $_->[2]->isa( 'ArrayRef[Text::Tradition::Collation::Reading]' ); + }, + message { 'Correction must be a tuple of [offset, length, list]' }; + +has 'corrections' => ( + is => 'ro', + isa => 'ArrayRef[Correction]', + default => sub { [] }, + ); sub BUILD { @@ -69,5 +85,11 @@ around text => sub { $self->$orig( @_ ); }; +sub add_correction { + my $self = shift; + # Rely on Moose for type checking of the remaining arguments + push( @{$self->corrections}, \@_ ); +} + no Moose; __PACKAGE__->meta->make_immutable;