From: tla <tla@mit.edu>
Date: Thu, 15 Aug 2013 22:01:59 +0000 (+0200)
Subject: Handle a2 app entries separately and more correctly. Fixes #5
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9e0a97867df3c97a7af6d4a0aa52de2831e8e457;p=scpubgit%2Fstemmatology.git

Handle a2 app entries separately and more correctly. Fixes #5
---

diff --git a/base/lib/Text/Tradition/Parser/CTE.pm b/base/lib/Text/Tradition/Parser/CTE.pm
index de87755..70248d5 100644
--- a/base/lib/Text/Tradition/Parser/CTE.pm
+++ b/base/lib/Text/Tradition/Parser/CTE.pm
@@ -84,7 +84,9 @@ sub parse {
 	# everything on the graph, from which we will delete the apps and
 	# anchors when we are done.
 	
-	# First, put the base tokens, apps, and anchors in the graph.
+	# First, put the base tokens, apps, and anchors in the graph. Save the
+	# app siglorum separately as it has to be processed in order.
+	my @app_sig;
 	my $counter = 0;
 	my $last = $c->start;
 	foreach my $item ( @base_text ) {
@@ -98,7 +100,12 @@ sub parse {
         } elsif ( $item->{'type'} eq 'app' ) {
             my $tag = '__APP_' . $counter++ . '__';
             $r = $c->add_reading( { id => $tag, is_ph => 1 } );
-            $apps{$tag} = $item->{'content'};
+            # Apparatus criticus is type a1; app siglorum is type a2
+            if( $item->{'content'}->getAttribute('type') eq 'a1' ) {
+	            $apps{$tag} = $item->{'content'};
+	        } else {
+	        	push( @app_sig, $item->{'content'} );
+	        }
         }
         $c->add_path( $last, $r, $c->baselabel );
         $last = $r;
@@ -110,6 +117,7 @@ sub parse {
     foreach my $app_id ( keys %apps ) {
         _add_readings( $c, $app_id, $opts );
     }
+    _add_lacunae( $c, @app_sig );
     
     # Finally, add explicit witness paths, remove the base paths, and remove
     # the app/anchor tags.
@@ -205,8 +213,6 @@ sub _get_base {
 			push( @readings, { type => 'anchor', 
 			    content => $xn->getAttribute( 'xml:id' ) } );
 		} # if the anchor has no XML ID, it is not relevant to us.
-	} elsif( $xn->nodeName =~ /^wit(Start|End)$/ ){
-		push( @readings, { type => 'token', content => '#' . uc( $1 ) . '#' } );
 	} elsif( $xn->nodeName !~ /^(note|seg|milestone|emph)$/ ) {  # Any tag we don't know to disregard
 	    say STDERR "Unrecognized tag " . $xn->nodeName;
 	}
@@ -239,22 +245,12 @@ sub _append_tokens {
 sub _add_readings {
     my( $c, $app_id, $opts ) = @_;
     my $xn = $apps{$app_id};
-    # If the app is of type a1, it is an apparatus criticus.
-    # If it is of type a2, it is an apparatus codicum and might not
-    # have an anchor.
-    my $anchor;
-	if( $xn->hasAttribute('to') ) {
-		$anchor = _anchor_name( $xn->getAttribute( 'to' ) );
-	}
+    my $anchor = _anchor_name( $xn->getAttribute( 'to' ) );
     
     # Get the lemma, which is all the readings between app and anchor,
     # excluding other apps or anchors.
-	my @lemma;
-	my $lemma_str = '';
-    if( $anchor ) {
-	    @lemma = _return_lemma( $c, $app_id, $anchor );
-    	$lemma_str = join( ' ',  map { $_->text } grep { !$_->is_ph } @lemma );
-    }
+	my @lemma = _return_lemma( $c, $app_id, $anchor );
+	my $lemma_str = join( ' ',  map { $_->text } grep { !$_->is_ph } @lemma );
         
     # For each reading, send its text to 'interpret' along with the lemma,
     # and then save the list of witnesses that these tokens belong to.
@@ -301,23 +297,11 @@ sub _add_readings {
 				}
 			}
         } else {
-        	if ( $flag->{'START'}
-	        	 && $c->prior_reading( $app_id, $c->baselabel ) ne $c->start ) {
-				# Add a lacuna for the witness start.
-				push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
-        										 	 is_lacuna => 1 } ) );	
-			}
 			foreach my $w ( split( /\s+/, $interpreted ) ) {
 				my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++,
 										   text => $w } );
 				push( @rdg_nodes, $r );
 			}
-			if( $flag->{'END'}
-        		&& $c->next_reading( $app_id, $c->baselabel ) ne $c->end ) {
-				# Add a lacuna for the witness end.
-				push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
-        										 	 is_lacuna => 1 } ) );
-        	}
         }
         
         # For each listed wit, save the reading.
@@ -352,13 +336,10 @@ sub _add_readings {
         
     # Now add the witness paths for each reading. If we don't have an anchor
     # (e.g. with an initial witStart) there was no witness path to speak of.
-    if( $anchor ) {
-		my $aclabel = $c->ac_label;
-		foreach my $wit_id ( keys %wit_rdgs ) {
-			my $witstr = _get_sigil( $wit_id, $aclabel );
-			my $rdg_list = $wit_rdgs{$wit_id};
-			_add_wit_path( $c, $rdg_list, $app_id, $anchor, $witstr );
-		}
+	foreach my $wit_id ( keys %wit_rdgs ) {
+		my $witstr = _get_sigil( $wit_id, $c->ac_label );
+		my $rdg_list = $wit_rdgs{$wit_id};
+		_add_wit_path( $c, $rdg_list, $app_id, $anchor, $witstr );
 	}
 }
 
@@ -501,13 +482,6 @@ sub interpret {
 		say STDERR "Will attempt transposition: $reading at $anchor";
 		$reading = $lemma;
 		$flag->{'TR'} = $+{lem};
-	# Look for processed witStart and witEnd tags
-	} elsif( $reading =~ /^\#START\#\s*(.*)$/ ) {
-		$reading = $1;
-		$flag->{'START'} = 1;
-	} elsif( $reading =~ /^(.*?)\s*\#END\#$/ ) {
-		$reading = $1;
-		$flag->{'END'} = 1;
 	}
 	return( $reading, $flag );
 }
@@ -532,6 +506,69 @@ sub _parse_wit_detail {
     }
 }
 
+sub _add_lacunae {
+	my( $c, @apps ) = @_;
+	# Go through the apparatus entries in order, noting where to start and stop our
+	# various witnesses.
+	my %lacunose;
+	my $ctr = 0;
+	foreach my $app ( @apps ) {
+		# Find the anchor, if any. This marks the point where the text starts
+		# or ends.
+		my $anchor = $app->getAttribute( 'to' );
+		my $aname;
+		if( $anchor ) {
+			$aname = _anchor_name( $anchor );
+		}
+
+		foreach my $rdg ( $app->getChildrenByTagName( 'rdg' ) ) {
+    		my @witlist = map { _get_sigil( $_, $c->ac_label ) }
+    			split( /\s+/, $rdg->getAttribute( 'wit' ) );
+			my @start = $rdg->getChildrenByTagName( 'witStart' );
+			my @end = $rdg->getChildrenByTagName( 'witEnd' );
+			if( @start && @end ) {
+				throw( "App sig entry at $anchor has both witStart and witEnd!" );
+			}
+			if( @start && $anchor &&
+				$c->prior_reading( $aname, $c->baselabel ) ne $c->start ) {
+				# We are picking back up after a hiatus. Find the last end and
+				# add a lacuna link between there and here.
+				foreach my $wit ( @witlist ) {
+					my $stoppoint = delete $lacunose{$wit};
+					$stoppoint = $c->start unless $stoppoint;
+					my $stopname = _anchor_name( $stoppoint );
+					say STDERR "Adding lacuna for $wit between $stoppoint and $anchor";
+					my $lacuna = $c->add_reading( { id => "as_$anchor.".$ctr++,
+        				is_lacuna => 1 } );
+        			_add_wit_path( $c, [ $lacuna ], $stopname, $aname, $wit );
+				}
+			} elsif( @end && $anchor && 
+				$c->next_reading( $aname, $c->baselabel ) ne $c->end ) {
+				# We are stopping. If we've already stopped for the given witness,
+				# flag an error; otherwise record the stopping point.
+				foreach my $wit ( @witlist ) {
+					if( $lacunose{$wit} ) {
+						throw( "Trying to end $wit at $anchor when already ended at "
+							. $lacunose{$wit} );
+					}
+					$lacunose{$wit} = $anchor;
+				}
+			}
+		}
+	}
+	
+	# For whatever remains in the %lacunose hash, add a lacuna between that spot and
+	# $c->end for each of the witnesses.
+	foreach my $wit ( keys %lacunose ) {
+		next unless $lacunose{$wit};
+		my $aname = _anchor_name( $lacunose{$wit} );
+		say STDERR "Adding lacuna for $wit from $aname to end";
+		my $lacuna = $c->add_reading( { id => 'as_'.$lacunose{$wit}.'.'.$ctr++,
+			is_lacuna => 1 } );
+		_add_wit_path( $c, [ $lacuna ], $aname, $c->end, $wit );
+	}
+}
+
 sub _get_sigil {
     my( $xml_id, $layerlabel ) = @_;
     if( $xml_id =~ /^(.*)_ac$/ ) {