From: Tara L Andrews <tla@mit.edu>
Date: Thu, 21 Feb 2013 12:45:20 +0000 (+0100)
Subject: refrain from calculating ranks in all parsers if asked; small optimizations to CTE parser
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=82a45078ff2213f4a210957763dda2d025ed3b0e;p=scpubgit%2Fstemmatology.git

refrain from calculating ranks in all parsers if asked; small optimizations to CTE parser
---

diff --git a/base/lib/Text/Tradition/Parser/BaseText.pm b/base/lib/Text/Tradition/Parser/BaseText.pm
index 76e4f36..54c66ed 100644
--- a/base/lib/Text/Tradition/Parser/BaseText.pm
+++ b/base/lib/Text/Tradition/Parser/BaseText.pm
@@ -304,7 +304,9 @@ sub merge_base {
 #                             $rel->type, $rel->from->id, $rel->to->id );
 #         }
 #     }
-    $collation->calculate_common_readings(); # will implicitly rank
+	unless( $opts->{'nocalc'} ) {
+	    $collation->calculate_common_readings(); # will implicitly rank
+	}
 }
 
 =item B<read_base>
diff --git a/base/lib/Text/Tradition/Parser/CTE.pm b/base/lib/Text/Tradition/Parser/CTE.pm
index ceeb437..85d37da 100644
--- a/base/lib/Text/Tradition/Parser/CTE.pm
+++ b/base/lib/Text/Tradition/Parser/CTE.pm
@@ -61,11 +61,7 @@ sub parse {
 	my @base_text;
 	foreach my $pg_el ( $xpc->findnodes( '/TEI/text/body/p' ) ) {
 		foreach my $xn ( $pg_el->childNodes ) {
-			my @items = _get_base( $xn );
-			foreach my $i ( @items ) {
-				$DB::single = 1 if $i->{'type'} eq 'anchor' && !$i->{'content'};
-			}
-			push( @base_text, @items );
+			push( @base_text, _get_base( $xn ) );
 		}
 	}
 	# We now have to work through this array applying the alternate 
@@ -185,10 +181,10 @@ sub _get_base {
 		push( @readings, { 'type' => 'app', 'content' => $xn } );
 	} elsif( $xn->nodeName eq 'anchor' ) {
 		# Anchor to mark the end of some apparatus; save its ID.
-		unless( $xn->getAttribute('type') ) {
+		if( $xn->hasAttribute('xml:id') ) {
 			push( @readings, { 'type' => 'anchor', 
 			    'content' => $xn->getAttribute( 'xml:id' ) } );
-		}
+		} # if the anchor has no XML ID, it is not relevant to us.
 	} elsif ( $xn->nodeName !~ /^(note|seg|milestone|emph)$/ ) {  # Any tag we don't know to disregard
 	    say STDERR "Unrecognized tag " . $xn->nodeName;
 	}
@@ -225,7 +221,7 @@ sub _add_readings {
     # Get the lemma, which is all the readings between app and anchor,
     # excluding other apps or anchors.
     my @lemma = _return_lemma( $c, $app_id, $anchor );
-    my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma );
+    my $lemma_str = join( ' ',  map { $_->text } grep { !$_->is_ph } @lemma );
     
     # For each reading, send its text to 'interpret' along with the lemma,
     # and then save the list of witnesses that these tokens belong to.
@@ -318,6 +314,7 @@ sub interpret {
 	# $lemma =~ s/\s+[[:punct:]]+$//;
 	my $flag;  # In case of p.c. indications
 	my @words = split( /\s+/, $lemma );
+	$reading =~ s/[[:punct:]]?\bsic\b[[:punct:]]?//g;
 	if( $reading =~ /^(.*) praem.$/ ) {
 		$reading = "$1 $lemma";
 	} elsif( $reading =~ /^(.*) add.$/ ) {
@@ -429,10 +426,13 @@ sub _expand_all_paths {
     $c->make_witness_paths();
     
     # Now remove any orphan nodes, and warn that we are doing so.
-    foreach my $v ( $c->sequence->isolated_vertices ) {
-    	my $r = $c->reading( $v );
-    	say STDERR "Deleting orphan reading $r / " . $r->text;
-    	$c->del_reading( $r );
+    while( $c->sequence->predecessorless_vertices > 1 ) {
+    	foreach my $v ( $c->sequence->predecessorless_vertices ) {
+	    	my $r = $c->reading( $v );
+	    	next if $r->is_start;
+    		say STDERR "Deleting orphan reading $r / " . $r->text;
+    		$c->del_reading( $r );
+    	}
     }
 }
 
diff --git a/base/lib/Text/Tradition/Parser/CollateX.pm b/base/lib/Text/Tradition/Parser/CollateX.pm
index 8a95f26..2eec71b 100644
--- a/base/lib/Text/Tradition/Parser/CollateX.pm
+++ b/base/lib/Text/Tradition/Parser/CollateX.pm
@@ -177,9 +177,11 @@ sub parse {
     	}
     
     	# Rank the readings and find the commonalities
-    	$collation->calculate_ranks();
-    	$collation->flatten_ranks();
-    	$collation->calculate_common_readings();
+    	unless( $opts->{'nocalc'} ) {
+			$collation->calculate_ranks();
+			$collation->flatten_ranks();
+			$collation->calculate_common_readings();
+		}
     } else {
     	my %merged;
     	foreach my $k ( keys %transpositions ) {
diff --git a/base/lib/Text/Tradition/Parser/TEI.pm b/base/lib/Text/Tradition/Parser/TEI.pm
index 8c2c3f0..3e3c47e 100644
--- a/base/lib/Text/Tradition/Parser/TEI.pm
+++ b/base/lib/Text/Tradition/Parser/TEI.pm
@@ -180,19 +180,21 @@ sub parse {
     # Now make our witness paths.
     $tradition->collation->make_witness_paths();
     
-    # Calculate the ranks for the nodes.
-	$tradition->collation->calculate_ranks();
-    
-    # Now that we have ranks, see if we have distinct nodes with identical
-    # text and identical rank that can be merged.
-    $tradition->collation->flatten_ranks();
-    
-    # And now that we've done that, calculate the common nodes.
-    $tradition->collation->calculate_common_readings();
-    
-    # Save the text for each witness so that we can ensure consistency
-    # later on
-	$tradition->collation->text_from_paths();	
+    unless( $opts->{'nocalc'} ) {
+		# Calculate the ranks for the nodes.
+		$tradition->collation->calculate_ranks();
+	
+		# Now that we have ranks, see if we have distinct nodes with identical
+		# text and identical rank that can be merged.
+		$tradition->collation->flatten_ranks();
+	
+		# And now that we've done that, calculate the common nodes.
+		$tradition->collation->calculate_common_readings();
+	
+		# Save the text for each witness so that we can ensure consistency
+		# later on
+		$tradition->collation->text_from_paths();	
+	}
 }
 
 sub _clean_sequence {