From: tla <tla@mit.edu>
Date: Tue, 13 Aug 2013 11:40:10 +0000 (+0200)
Subject: Current state of transposition parsing
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e9442e1c0df576ef6f95bfb2b110812a14ed3757;p=scpubgit%2Fstemmatology.git

Current state of transposition parsing
---

diff --git a/base/lib/Text/Tradition/Parser/CTE.pm b/base/lib/Text/Tradition/Parser/CTE.pm
index 2926a4a..bd49559 100644
--- a/base/lib/Text/Tradition/Parser/CTE.pm
+++ b/base/lib/Text/Tradition/Parser/CTE.pm
@@ -38,6 +38,9 @@ sub parse {
 	my( $tradition, $opts ) = @_;
 	my $c = $tradition->collation;	# Some shorthand
 	
+	## DEBUG/TEST
+	$opts->{interpret_transposition} = 1;
+	
 	# First, parse the XML.
     my( $tei, $xpc ) = _remove_formatting( $opts );
     return unless $tei; # we have already warned.
@@ -104,9 +107,8 @@ sub parse {
     
     # Now we can parse the apparatus entries, and add the variant readings 
     # to the graph.
-    
     foreach my $app_id ( keys %apps ) {
-        _add_readings( $c, $app_id );
+        _add_readings( $c, $app_id, $opts );
     }
     
     # Finally, add explicit witness paths, remove the base paths, and remove
@@ -235,7 +237,7 @@ sub _append_tokens {
 }
 
 sub _add_readings {
-    my( $c, $app_id ) = @_;
+    my( $c, $app_id, $opts ) = @_;
     my $xn = $apps{$app_id};
     # If the app is of type a1, it is an apparatus criticus.
     # If it is of type a2, it is an apparatus codicum and might not
@@ -262,6 +264,7 @@ sub _add_readings {
     $tag =~ s/^\__APP_(.*)\__$/$1/;
 
     foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) {
+    	my @witlist = split( /\s+/, $rdg->getAttribute( 'wit' ) );
         my @text;
         foreach ( $rdg->childNodes ) {
             push( @text, _get_base( $_ ) );
@@ -269,39 +272,52 @@ sub _add_readings {
         my( $interpreted, $flag ) = ( '', undef );
         if( @text ) {
         	( $interpreted, $flag ) = interpret( 
-        		join( ' ', map { $_->{'content'} } @text ), $lemma_str );
+        		join( ' ', map { $_->{'content'} } @text ), $lemma_str, $anchor, $opts );
         }
-        next if( $interpreted eq $lemma_str ) && !$flag;  # Reading is lemma.
+        next if( $interpreted eq $lemma_str ) && !keys %$flag;  # Reading is lemma.
         
         my @rdg_nodes;
+        my @transp_nodes;
         if( $interpreted eq '#LACUNA#' ) {
         	push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
         										 is_lacuna => 1 } ) );
+        } elsif( $flag->{'TR'} ) {
+        	# Our reading is transposed to after the given string. Look
+        	# down the collation base text and try to find it.
+        	# The @rdg_nodes should remain blank here, so that the correct
+        	# omission goes into the graph.
+        	foreach my $w ( split(  /\s+/, $interpreted ) ) {
+        		my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++,
+										   text => $w } );
+				push( @transp_nodes, $r );
+			}
+			if( $anchor && @lemma ) {
+				_attach_transposition( $c, \@lemma, $anchor, \@transp_nodes, 
+					\@witlist, $flag->{'TR'} );
+			}
         } else {
-        	if ( $flag && $flag eq 'START'
+        	if ( $flag->{'START'}
 	        	 && $c->prior_reading( $app_id, $c->baselabel ) ne $c->start ) {
 				# Add a lacuna for the witness start.
 				push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
         										 	 is_lacuna => 1 } ) );	
-        		$flag = '';
 			}
 			foreach my $w ( split( /\s+/, $interpreted ) ) {
 				my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++,
 										   text => $w } );
 				push( @rdg_nodes, $r );
 			}
-			if( $flag && $flag eq 'END'
+			if( $flag->{'END'}
         		&& $c->next_reading( $app_id, $c->baselabel ) ne $c->end ) {
 				# Add a lacuna for the witness end.
 				push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++,
         										 	 is_lacuna => 1 } ) );
-        		$flag = '';
         	}
         }
         
         # For each listed wit, save the reading.
-        foreach my $wit ( split( /\s+/, $rdg->getAttribute( 'wit' ) ) ) {
-			$wit .= $flag if $flag;
+        foreach my $wit ( @witlist ) {
+			$wit .= '_ac' if $flag->{'AC'};
             $wit_rdgs{$wit} = \@rdg_nodes;
         }
         		
@@ -347,6 +363,57 @@ sub _return_lemma {
     return @nodes;
 }
 
+# Make a best-effort attempt to attach a transposition farther down the line.
+# $lemmaseq contains the Reading objects of the lemma
+# $anchor contains the point at which we should start scanning for a match
+# $rdgseq contains the Reading objects of the transposed reading 
+# 	(should be identical to the lemma)
+# $witlist contains the list of applicable witnesses
+# $reftxt contains the text to match, after which the $rdgseq should go.
+sub _attach_transposition {
+	my( $c, $lemmaseq, $anchor, $rdgseq, $witlist, $reftxt ) = @_;
+	my @refwords = split( /\s+/, $reftxt );
+	my $checked = $c->reading( $anchor );
+	my $found;
+	while( $checked ne $c->end && !$found ) {
+		my $next = $c->next_reading( $checked, $c->baselabel );
+		if( $next->text eq $refwords[0] ) {
+			# See if the entire sequence of words matches.
+			$found = $next;
+			foreach my $w ( 1..$#refwords ) {
+				$found = $c->next_reading( $next, $c->baselabel );
+				unless( $found->text eq $refwords[$w] ) {
+					$found = undef;
+					last;
+				}
+			}
+		}
+		$checked = $next;
+	}
+	if( $found ) {
+		# The $found variable should now contain the reading after which we
+		# should stick the transposition.
+		my $fnext = $c->next_reading( $found, $c->baselabel );
+		my $aclabel = $c->ac_label;
+		foreach my $wit_id ( @$witlist ) {
+			my $witstr = _get_sigil( $wit_id, $aclabel );
+			_add_wit_path( $c, $rdgseq, $found->id, $fnext->id, $witstr );
+		}
+		# ...and add the transposition relationship between lemma and rdgseq.
+		if( @$lemmaseq == @$rdgseq ) {
+			foreach my $i ( 0..$#{$lemmaseq} ) {
+				$c->add_relationship( $lemmaseq->[$i], $rdgseq->[$i],
+					{ type => 'transposition', annotation => 'Detected by CTE' } );
+			}
+		} else {
+			say STDERR "ERROR: lemma and transposed sequence different lengths?!"
+		}
+	} else {
+		say STDERR "WARNING: Unable to find $reftxt in base text for transposition";
+		map { $c->del_reading( $_ ) } @$rdgseq;
+	}
+}
+
 =head2 interpret( $reading, $lemma )
 
 Given a string in $reading and a corresponding lemma in $lemma, interpret what
@@ -357,11 +424,11 @@ marking transpositions, prefixed or suffixed words, and the like.
 
 sub interpret {
 	# A utility function to change apparatus-ese into a full variant.
-	my( $reading, $lemma ) = @_;
+	my( $reading, $lemma, $anchor, $opts ) = @_;
 	return $reading if $reading eq $lemma;
 	my $oldreading = $reading;
 	# $lemma =~ s/\s+[[:punct:]]+$//;
-	my $flag;  # In case of p.c. indications
+	my $flag = {};  # To pass back extra info about the interpretation
 	my @words = split( /\s+/, $lemma );
 	$reading =~ s/[[:punct:]]?\bsic\b([[:punct:]]+)?//g;
 	if( $reading =~ /^(.*) praem.$/ ) {
@@ -394,7 +461,14 @@ sub interpret {
 	} elsif( $reading eq 'in marg.' ) {
 		# There was nothing before a correction.
 		$reading = '';
-		$flag = '_ac';
+		$flag->{'AC'} = 1;
+	} elsif( $reading =~ /^(.*?)\s*\(?sic([\s\w!.]+)?\)?$/ ) {
+		# Discard any 'sic' notation; indeed, indeed.
+		$reading = $1;
+		if( $reading =~ /^(\W+)$/ ) {
+			# Nothing left but punctuation, so effectively it's the lemma.
+			$reading = $lemma;
+		}
 	} elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) {
 		# The first and last N words captured should replace the first and
 		# last N words of the lemma.
@@ -408,17 +482,20 @@ sub interpret {
 			splice( @words, -(scalar @end), scalar @end, @end );
 			$reading = join( ' ', @words );
 		}
+	} elsif( $opts->{interpret_transposition} &&
+			 $reading =~ /^\s*post\s*(.*?)\s*(tr(ans(p)?)?)?\.?\s*$/ ) {
+		# Try to deal with transposed readings
+		## DEBUG
+		say STDERR "Will attempt transposition: $reading at $anchor";
+		$reading = $lemma;
+		$flag->{'TR'} = $1;
+	# Look for processed witStart and witEnd tags
 	} elsif( $reading =~ /^\#START\#\s*(.*)$/ ) {
 		$reading = $1;
-		$flag = 'START';
+		$flag->{'START'} = 1;
 	} elsif( $reading =~ /^(.*?)\s*\#END\#$/ ) {
 		$reading = $1;
-		$flag = 'END';
-	}
-	if( $oldreading ne $reading || $flag || $oldreading =~ /\./ ) {
-		my $int = $reading;
-		$int .= " ($flag)" if $flag;
-		# say STDERR "Interpreted $oldreading as $int given $lemma";
+		$flag->{'END'} = 1;
 	}
 	return( $reading, $flag );
 }