From: tla Date: Tue, 31 May 2011 19:54:42 +0000 (+0200) Subject: CHECKPOINT untested and unfinished changes to BaseText X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=1ed3973e1e7d7a188070210ec2f8b2cb447ef60a;p=scpubgit%2Fstemmatology.git CHECKPOINT untested and unfinished changes to BaseText --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index f66e902..82aae47 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -757,7 +757,7 @@ sub calculate_positions { print STDERR "Calculating positions in " . $wit->sigil . "\n"; _update_positions_from_path( $wit->path, @ordered_common ); _update_positions_from_path( $wit->uncorrected_path, @ordered_common ) - if $wit->has_ante_corr; + if $wit->has_uncorrected; } # DEBUG diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index 49edd0b..4c2af19 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -176,6 +176,9 @@ sub merge_base { my %pc_seen; # Keep track of mss with explicit post-corr data foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { my @mss = grep { $app->{$_} eq $k } keys( %$app ); + + # Keep track of lemma nodes that don't actually appear in + # any MSS; we will want to remove them from the collation. push( @unwitnessed_lemma_nodes, @lemma_set ) if !@mss && $k eq 'rdg_0'; @@ -189,11 +192,11 @@ sub merge_base { } next if $k eq 'rdg_0'; + # Parse the variant into reading tokens. # TODO don't hardcode the reading split operation my @variant = split( /\s+/, $app->{$k} ); @variant = () if $app->{$k} eq '/'; # This is an omission. - # Make the variant into a set of readings. my @variant_readings; my $ctr = 0; foreach my $vw ( @variant ) { @@ -213,8 +216,7 @@ sub merge_base { # collated sets. Modifies the reading sets that were passed. collate_variants( $collation, @reading_sets ); - # TODO Here would be a very good place to set up relationships - # between the nodes and the lemma. + # Record any stated relationships between the nodes and the lemma. set_relationships( $collation, $app, \@lemma_set, $variant_objects ); # Now create the splice-edit objects that will be used @@ -251,29 +253,24 @@ sub merge_base { print STDERR "Creating witness $w\n"; my $witness_obj = $collation->tradition->add_witness( sigil => $w ); my $debug = undef; # $w eq 'Vb10'; - my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug ); - my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug ) - if exists( $edits_required->{$w."_post"} ); + my ( $text_seq, $ac ) = apply_edits( $collation, + $edits_required->{$w}, + $edits_required->{$w."_post"}, + $debug ); - my @repeated = _check_for_repeated( @ante_corr_seq ); - warn "Repeated elements @repeated in $w a.c." - if @repeated; - @repeated = _check_for_repeated( @post_corr_seq ); - warn "Repeated elements @repeated in $w p.c." + my @repeated = _check_for_repeated( @$text_seq ); + warn "Repeated elements @repeated in $w" if @repeated; - # Now save these paths in my witness object - if( @post_corr_seq ) { - $witness_obj->path( \@post_corr_seq ); - $witness_obj->uncorrected_path( \@ante_corr_seq ); - } else { - $witness_obj->path( \@ante_corr_seq ); + $witness_obj->path( $text_seq ); + if( $ac ) { + $witness_obj->uncorrected( $ac ); } } # Now remove our 'base text' edges, which is to say, the only - # ones we have created so far. Also remove any nodes that didn't - # appear in any witnesses. + # ones we have created so far. Also remove any unwitnessed + # lemma nodes (TODO unless we are treating base as witness) foreach ( $collation->paths() ) { $collation->del_path( $_ ); } @@ -485,7 +482,6 @@ sub set_relationships { # Transposition or repetition: look for nodes with the # same label but different IDs and mark them. $type = 'repetition' if $type =~ /^rep/i; - $DB::single = 1 if $type eq 'repetition'; my %labels; foreach my $r ( @$lemma ) { $labels{cmp_str( $r )} = $r; @@ -543,53 +539,62 @@ sub set_relationships { sub apply_edits { - my( $collation, $edit_sequence, $debug ) = @_; + my( $collation, $edit_sequence, $corrected_edit_sequence, $debug ) = @_; + + # Index the ante- and post-correctione edits that we have, so that + # for each spot in the text we can apply the original witness + # state and then apply its corrected state, if applicable. + my $all_edits = {}; + foreach my $c ( @$edit_sequence ) { + my $lemma_index = $base_text_index{$c->[0]}; + $all_edits->{$lemma_index}->{'ac'} = $c; + # If the text carries no corrections, pc == ac. + $all_edits->{$lemma_index}->{'pc'} = $c + unless $corrected_edit_sequence; + } + foreach my $c ( @$corrected_edit_sequence ) { + my $lemma_index = $base_text_index{$c->[0]}; + $all_edits->{$lemma_index}->{'pc'} = $c; + } + my @lemma_text = $collation->reading_sequence( $collation->start, $collation->reading( '#END#' ) ); my $drift = 0; - foreach my $correction ( @$edit_sequence ) { - my( $lemma_start, $length, $items ) = @$correction; - my $offset = $base_text_index{$lemma_start}; - my $realoffset = $offset + $drift; - if( $debug || - $lemma_text[$realoffset]->name ne $lemma_start ) { - my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; - my @base_phrase; - my $i = $realoffset; - my $l = $collation->reading( $lemma_start ); - while( $i < $realoffset+$length ) { - push( @base_phrase, $l ); - $l = $collation->next_reading( $l ); - $i++; - } - - print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . - "with %s (%s) with drift %d\n", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), - $realoffset, - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), - $drift, - ) if $debug; - - warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . - "but %s (%s) is there instead", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), - join( ' ', map {$_->label} @this_phrase ), - join( ' ', map {$_->name} @this_phrase ), - ) ) - if $lemma_text[$realoffset]->name ne $lemma_start; + my @ac_sequence; + foreach my $lemma_index ( sort keys %$all_edits ) { + my $ac = $all_edits->{$lemma_index}->{'ac'}; + my $pc = $all_edits->{$lemma_index}->{'pc'}; + my $realoffset = $lemma_index + $drift; + if( $ac && $pc && $ac eq $pc ) { + # No correction, just apply the edit + my( $lemma_start, $length, $items ) = @$pc; + splice( @lemma_text, $realoffset, $length, @$items ); + $drift += @$items + $length; + } elsif ( !$pc ) { + # Lemma text is unaltered, save a.c. as an 'uncorrection' + my( $lemma_start, $length, $items ) = @$ac; + push( @ac_sequence, [ $realoffset, $length, $items ] ); + } elsif ( !$ac ) { + # Apply the edit, save lemma text as an 'uncorrection' + my( $lemma_start, $length, $items ) = @$pc; + my @old = splice( @lemma_text, $realoffset, $length, @$items ); + $drift += @$items + $length; + push( @ac_sequence, [ $realoffset, scalar( @$items ), \@old ] ); + } else { + # Apply the p.c. edit, then save the a.c. edit as an + # 'uncorrection' on the p.c. text + my( $lemma_start, $length, $items ) = @$pc; + my @old = splice( @lemma_text, $realoffset, $length, @$items ); + $drift += @$items + $length; + push( @ac_sequence, [ $realoffset, scalar( @$items ), \@old ] ); } - splice( @lemma_text, $realoffset, $length, @$items ); - $drift += @$items - $length; } - return @lemma_text; + return( \@lemma_text, \@ac_sequence ); } - + +# sub _apply_sequence_splice { +# my( $collation, $sequence, $correction + # Helper function. Given a witness sigil, if it is a post-correctione # sigil,return the base witness. If not, return a false value. diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index 934bb51..2d1996a 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -2,6 +2,19 @@ package Text::Tradition::Witness; use Moose; use Moose::Util::TypeConstraints; +subtype 'Correction', + => as 'ArrayRef', + => where { return 0 unless @$_ == 3; + return 0 unless $_->[0] =~ /^\d+$/; + return 0 unless $_->[1] =~ /^\d+$/; + foreach my $x ( @{$_->[2]} ) { + return $0 unless $x->isa( 'Text::Tradition::Collation::Reading' ); + } + return 1; + }, + => message { "Correction must be ref of [ offset, length, replacement_list ]" }; + + # Sigil. Required identifier for a witness. has 'sigil' => ( is => 'ro', @@ -26,16 +39,20 @@ has 'source' => ( predicate => 'has_source', ); +# Path. This is an array of Reading nodes that should mirror the +# text above. has 'path' => ( is => 'rw', isa => 'ArrayRef[Text::Tradition::Collation::Reading]', predicate => 'has_path', ); -has 'uncorrected_path' => ( +# Uncorrection. This is an array of sets of reading nodes that show +# where the witness was corrected. +has 'uncorrected' => ( is => 'rw', - isa => 'ArrayRef[Text::Tradition::Collation::Reading]', - predicate => 'has_ante_corr', + isa => 'ArrayRef[Correction]', + predicate => 'has_uncorrected', ); @@ -70,6 +87,15 @@ around text => sub { $self->$orig( @_ ); }; +sub uncorrected_path { + my $self = shift; + my @path; + push( @path, @{$self->path} ); + foreach my $corr ( @{$self->uncorrected} ) { + splice( @path, $corr->[0], $corr->[1], @{$corr->[2]} ); + } + return \@path; +} no Moose; __PACKAGE__->meta->make_immutable;