From: Tara L Andrews Date: Sun, 20 Nov 2011 14:41:29 +0000 (+0100) Subject: parsing Collate apparatus more or less works now X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=44771cf2d5a77263c8e3f17ae71b112247181380;p=scpubgit%2Fstemmatology.git parsing Collate apparatus more or less works now --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index d903191..7ca8db4 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -770,8 +770,10 @@ sub reading_sequence { $seen{$n->name()} = 1; my $next = $self->next_reading( $n, $witness, $backup ); - warn "Did not find any path for $witness from reading " . $n->name - unless $next; + unless( $next ) { + warn "Did not find any path for $witness from reading " . $n->name; + last; + } push( @readings, $next ); $n = $next; } diff --git a/lib/Text/Tradition/Parser/CollateText.pm b/lib/Text/Tradition/Parser/CollateText.pm index 6ee3712..944a8dd 100644 --- a/lib/Text/Tradition/Parser/CollateText.pm +++ b/lib/Text/Tradition/Parser/CollateText.pm @@ -171,13 +171,19 @@ sub merge_stone_apparatus { print STDERR "Lemma $lemma not found; skipping readings $rest\n"; next; } - my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end ); - - # Splice in "start" and "end" placeholders on either - # side of the lemma. - my ( $rdg_start, $rdg_end ) = - _add_reading_placeholders( $c, $lemma_start, $lemma_end ); - + my( $rdg_start, $rdg_end, @lemma_chain ); + if( $lemma_start eq '__PRIOR__' ) { + # Deal with 'inc' readings: lemma chain should be empty, rdg_start + # is a placeholder, rdg_end is $lemma_end. + $rdg_start = _add_reading_placeholders( $c, $lemma_end ); + $rdg_end = $lemma_end; + } else { + @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end ); + # Splice in "start" and "end" placeholders on either + # side of the lemma. + ( $rdg_start, $rdg_end ) = + _add_reading_placeholders( $c, $lemma_start, $lemma_end ); + } # For each reading, attach it to the lemma. my @indiv = split( / /, $rest ); my $has_rel = 0; @@ -199,8 +205,9 @@ sub merge_stone_apparatus { my @readings; foreach my $rdg_word ( @$words ) { next if $rdg_word =~ /^__/; - my $reading_id = $lemma_start->name . '_' . $lemma_end->name - . '/' . $rdg_ctr++; + my $reading_id = ref( $lemma_start ) + ? $lemma_start->name : $lemma_start; + $reading_id .= '_' . $lemma_end->name . '/' . $rdg_ctr++; my $reading = $c->add_reading( $reading_id ); $reading->text( $rdg_word ); push( @readings, $reading ); @@ -208,7 +215,10 @@ sub merge_stone_apparatus { # Deal with any specials. my $lemma_sequence; - if( @$words && $words->[0] eq '__LEMMA__' ) { + if( @$words && $words->[0] eq '__LEMMA__' + && $lemma_end ne $rdg_end ) { + # It's an addition (unless lemma_end eq rdg_end, in which case + # it's an 'inc'.) Start from lemma rather than from placeholder. $lemma_sequence = [ $lemma_end, $rdg_end ]; } elsif ( @$words && $words->[0] eq '__TRANSPOSE__' ) { # Hope it is only two or three words in the lemma. @@ -241,12 +251,11 @@ sub merge_stone_apparatus { # Create the reading nodes in $rwords # TODO Hope we don't meet ~ in a recursion my $local_rdg = []; - $DB::single = 1; foreach my $i ( 0 .. $#$rwords ) { next if $i == 0 && $rwords->[$i] =~ /^__/; - my $reading_id = $llseq->[0]->text . '_' . - $llseq->[-1]->text . '/' . $i; - $DB::single = 1 if $reading_id =~ /ATTACH/; + my $reading_id = $llseq->[0]->name . '_' . + $llseq->[-1]->name . '/' . $i; + $reading_id =~ s/ATTACH//g; my $reading = $c->add_reading( $reading_id ); $reading->text( $rwords->[$i] ); push( @$local_rdg, $reading ); @@ -261,21 +270,25 @@ sub merge_stone_apparatus { } # end processing of $app } # end foreach my $app in line } # end while + $DB::single = 1; # Now reconcile all the paths in the collation, and delete our # temporary anchor nodes. expand_all_paths( $c ); # Finally, calculate the ranks we've got. - $c->calculate_ranks; + # $c->calculate_ranks; } sub _find_reading_on_line { my( $c, $lemma, $baseline, $prior ) = @_; - # We might want the whole line. if( $lemma eq 'totum' ) { + # We want the whole line. return( $baseline->{'start'}, $baseline->{'end'} ); + } elsif( $lemma eq 'inc' ) { + # We want to shove things in before the line begins. + return( '__PRIOR__', $baseline->{'start'} ); } my $lemma_start = $baseline->{'start'}; @@ -373,7 +386,8 @@ sub _add_reading_placeholders { $collation->add_path( $prior, $start_node, $collation->baselabel ); $collation->add_path( $start_node, $lemma_start, $collation->baselabel ); } - + return $start_node unless $lemma_end; + # Now the converse for the end. my $end_node = $collation->next_reading( $lemma_end ); unless( $end_node->name =~ /ATTACH/ ) { @@ -524,83 +538,34 @@ sub _add_sigil_path { } } -# Remove all ATTACH* nodes, linking the readings on either side of them. -# Then walk the collation for all witness paths, and make sure those paths -# explicitly exist. Then delete all the 'base' paths. +# Walk the collation for all witness paths, delete the ATTACH anchor nodes, +# and then nuke and re-draw all edges (thus getting rid of the base). sub expand_all_paths { my( $c ) = @_; - # Delete the anchors - foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) { - # Map each path to its incoming/outgoing node. - my %incoming; - map { $incoming{$_->label} = $_->from } $anchor->incoming(); - my %outgoing; - map { $outgoing{$_->label} = $_->to } $anchor->outgoing(); - $c->del_reading( $anchor ); - - # Connect in and out. - my $aclabel = $c->ac_label; - foreach my $edge ( keys %incoming ) { - my $from = $incoming{$edge}; - my $to = $outgoing{$edge}; - if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) { - $to = $outgoing{$1}; - } - $to = $outgoing{$c->baselabel} unless $to; - $DB::single = 1 unless $to; - warn "Have no outbound base link on " . $anchor->name . "!" - unless $to; - $c->add_path( $from, $to, $edge ); - delete $outgoing{$edge} unless $edge eq $c->baselabel; - } - foreach my $edge ( keys %outgoing ) { - my $to = $outgoing{$edge}; - my $from = $incoming{$edge}; - if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) { - $from = $incoming{$1}; - } - $from = $incoming{$c->baselabel} unless $from; - warn "Have no inbound base link on " . $anchor->name . "!" - unless $from; - $c->add_path( $from, $to, $edge ); - } - } - - $DB::single = 1; - # Walk the collation and add paths if necessary + # Walk the collation and fish out the paths for each witness foreach my $sig ( keys %ALL_SIGLA ) { my $wit = $c->tradition->witness( $sig ); - my @path = $c->reading_sequence( $c->start, $c->end, $sig ); + my @path = grep { $_->name !~ /ATTACH/ } + $c->reading_sequence( $c->start, $c->end, $sig ); $wit->path( \@path ); if( $ALL_SIGLA{$sig} > 1 ) { - my @ac_path = $c->reading_sequence( $c->start, $c->end, - $sig.$c->ac_label, $sig ); - $wit->uncorrected_path( \@path ); - # a.c. paths are already there by default. - } - foreach my $i ( 1 .. $#path ) { - # If there is no explicit path for this sigil between n-1 and n, - # add it. - my @sigedges = grep { $_->label eq $sig } $path[$i]->incoming; - if( @sigedges ) { - warn "Found more than one path already for $sig" if @sigedges > 1; - warn "Would add second path for $sig" - unless $sigedges[0]->from eq $path[$i-1]; - next; - } - $c->add_path( $path[$i-1], $path[$i], $sig ); + my @ac_path = grep { $_->name !~ /ATTACH/ } + $c->reading_sequence( $c->start, $c->end, $sig.$c->ac_label, $sig ); + $wit->uncorrected_path( \@ac_path ); } - } + } - # Delete all baselabel edges - foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) { - $c->del_edge( $edge ); + # Delete the anchors + foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) { + $c->del_reading( $anchor ); } + # Delete all edges + map { $c->del_path( $_ ) } $c->paths; - # Calculate ranks on graph nodes - $c->calculate_ranks(); + # Make the path edges + $c->make_witness_paths(); } sub _get_seq { @@ -640,7 +605,7 @@ sub next_real_reading { sub rstr { my @l = @_; if( ref( $_[0] ) eq 'ARRAY' ) { - @l = @$_[0]; + @l = @{$_[0]}; } my $str = join( ' ', map { $_->text } @l ); return $str;