From: Tara L Andrews Date: Thu, 21 Feb 2013 12:45:20 +0000 (+0100) Subject: refrain from calculating ranks in all parsers if asked; small optimizations to CTE... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=82a45078ff2213f4a210957763dda2d025ed3b0e;p=scpubgit%2Fstemmatology.git refrain from calculating ranks in all parsers if asked; small optimizations to CTE parser --- diff --git a/base/lib/Text/Tradition/Parser/BaseText.pm b/base/lib/Text/Tradition/Parser/BaseText.pm index 76e4f36..54c66ed 100644 --- a/base/lib/Text/Tradition/Parser/BaseText.pm +++ b/base/lib/Text/Tradition/Parser/BaseText.pm @@ -304,7 +304,9 @@ sub merge_base { # $rel->type, $rel->from->id, $rel->to->id ); # } # } - $collation->calculate_common_readings(); # will implicitly rank + unless( $opts->{'nocalc'} ) { + $collation->calculate_common_readings(); # will implicitly rank + } } =item B diff --git a/base/lib/Text/Tradition/Parser/CTE.pm b/base/lib/Text/Tradition/Parser/CTE.pm index ceeb437..85d37da 100644 --- a/base/lib/Text/Tradition/Parser/CTE.pm +++ b/base/lib/Text/Tradition/Parser/CTE.pm @@ -61,11 +61,7 @@ sub parse { my @base_text; foreach my $pg_el ( $xpc->findnodes( '/TEI/text/body/p' ) ) { foreach my $xn ( $pg_el->childNodes ) { - my @items = _get_base( $xn ); - foreach my $i ( @items ) { - $DB::single = 1 if $i->{'type'} eq 'anchor' && !$i->{'content'}; - } - push( @base_text, @items ); + push( @base_text, _get_base( $xn ) ); } } # We now have to work through this array applying the alternate @@ -185,10 +181,10 @@ sub _get_base { push( @readings, { 'type' => 'app', 'content' => $xn } ); } elsif( $xn->nodeName eq 'anchor' ) { # Anchor to mark the end of some apparatus; save its ID. - unless( $xn->getAttribute('type') ) { + if( $xn->hasAttribute('xml:id') ) { push( @readings, { 'type' => 'anchor', 'content' => $xn->getAttribute( 'xml:id' ) } ); - } + } # if the anchor has no XML ID, it is not relevant to us. } elsif ( $xn->nodeName !~ /^(note|seg|milestone|emph)$/ ) { # Any tag we don't know to disregard say STDERR "Unrecognized tag " . $xn->nodeName; } @@ -225,7 +221,7 @@ sub _add_readings { # Get the lemma, which is all the readings between app and anchor, # excluding other apps or anchors. my @lemma = _return_lemma( $c, $app_id, $anchor ); - my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma ); + my $lemma_str = join( ' ', map { $_->text } grep { !$_->is_ph } @lemma ); # For each reading, send its text to 'interpret' along with the lemma, # and then save the list of witnesses that these tokens belong to. @@ -318,6 +314,7 @@ sub interpret { # $lemma =~ s/\s+[[:punct:]]+$//; my $flag; # In case of p.c. indications my @words = split( /\s+/, $lemma ); + $reading =~ s/[[:punct:]]?\bsic\b[[:punct:]]?//g; if( $reading =~ /^(.*) praem.$/ ) { $reading = "$1 $lemma"; } elsif( $reading =~ /^(.*) add.$/ ) { @@ -429,10 +426,13 @@ sub _expand_all_paths { $c->make_witness_paths(); # Now remove any orphan nodes, and warn that we are doing so. - foreach my $v ( $c->sequence->isolated_vertices ) { - my $r = $c->reading( $v ); - say STDERR "Deleting orphan reading $r / " . $r->text; - $c->del_reading( $r ); + while( $c->sequence->predecessorless_vertices > 1 ) { + foreach my $v ( $c->sequence->predecessorless_vertices ) { + my $r = $c->reading( $v ); + next if $r->is_start; + say STDERR "Deleting orphan reading $r / " . $r->text; + $c->del_reading( $r ); + } } } diff --git a/base/lib/Text/Tradition/Parser/CollateX.pm b/base/lib/Text/Tradition/Parser/CollateX.pm index 8a95f26..2eec71b 100644 --- a/base/lib/Text/Tradition/Parser/CollateX.pm +++ b/base/lib/Text/Tradition/Parser/CollateX.pm @@ -177,9 +177,11 @@ sub parse { } # Rank the readings and find the commonalities - $collation->calculate_ranks(); - $collation->flatten_ranks(); - $collation->calculate_common_readings(); + unless( $opts->{'nocalc'} ) { + $collation->calculate_ranks(); + $collation->flatten_ranks(); + $collation->calculate_common_readings(); + } } else { my %merged; foreach my $k ( keys %transpositions ) { diff --git a/base/lib/Text/Tradition/Parser/TEI.pm b/base/lib/Text/Tradition/Parser/TEI.pm index 8c2c3f0..3e3c47e 100644 --- a/base/lib/Text/Tradition/Parser/TEI.pm +++ b/base/lib/Text/Tradition/Parser/TEI.pm @@ -180,19 +180,21 @@ sub parse { # Now make our witness paths. $tradition->collation->make_witness_paths(); - # Calculate the ranks for the nodes. - $tradition->collation->calculate_ranks(); - - # Now that we have ranks, see if we have distinct nodes with identical - # text and identical rank that can be merged. - $tradition->collation->flatten_ranks(); - - # And now that we've done that, calculate the common nodes. - $tradition->collation->calculate_common_readings(); - - # Save the text for each witness so that we can ensure consistency - # later on - $tradition->collation->text_from_paths(); + unless( $opts->{'nocalc'} ) { + # Calculate the ranks for the nodes. + $tradition->collation->calculate_ranks(); + + # Now that we have ranks, see if we have distinct nodes with identical + # text and identical rank that can be merged. + $tradition->collation->flatten_ranks(); + + # And now that we've done that, calculate the common nodes. + $tradition->collation->calculate_common_readings(); + + # Save the text for each witness so that we can ensure consistency + # later on + $tradition->collation->text_from_paths(); + } } sub _clean_sequence {