From: Tara L Andrews <tla\@mit.edu>
Date: Mon, 16 Apr 2012 09:27:27 +0000 (+0200)
Subject: make XML witness parsing work
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b39fb0b309212f41a85d6b4bb90af13aa03095c3;p=scpubgit%2Fstemmatology.git

make XML witness parsing work
---

diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index d7047ba..e9321f4 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -154,18 +154,30 @@ is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', 
 	"Found second JSON witness" );
 
-# # Test an XML witness via file
-# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', 
-# 	'file' => 't/data/witnesses/teiwit.xml' );
-# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
-# if( $xmlwit ) {
-# 	is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
-# 	ok( $xmlwit->is_layered, "Picked up correction layer" );
-# 	is( @{$xmlwit->path}, 185, "Got correct text length" );
-# 	is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" );
-# }
+# Test an XML witness via file
+my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', 
+	'file' => 't/data/witnesses/teiwit.xml' );
+is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
+if( $xmlwit ) {
+	is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
+	ok( $xmlwit->is_layered, "Picked up correction layer" );
+	is( @{$xmlwit->text}, 182, "Got correct text length" );
+	is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
+}
+my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
+is( @allwitwords, 184, "Reused appropriate readings" );
 
 ## Test use_text
+my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
+	'file' => 't/data/witnesses/group.xml',
+	'use_text' => '//tei:group/tei:text[2]' );
+is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
+if( $xpwit ) {
+	is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
+	ok( !$xpwit->is_layered, "Picked up no correction layer" );
+	is( @{$xpwit->text}, 157, "Got correct text length" );
+}
+
 
 =end testing 
 
@@ -198,6 +210,12 @@ has 'sigil' => (
 	writer => '_set_sigil',
 	);
 	
+has 'language' => (
+    is => 'ro',
+    isa => 'Str',
+    default => 'Default',
+    );
+
 # Other identifying information
 has 'identifier' => (
 	is => 'rw',
@@ -219,20 +237,14 @@ has 'idno' => (
 	isa => 'Str',
 	);
 
+# Source. Can be XML obj, JSON data struct, or string.
+# Not used if the witness is created by parsing a collation.
 has 'sourcetype' => (
 	is => 'ro',
 	isa => 'SourceType',
 	required => 1, 
 );
 
-has 'language' => (
-    is => 'ro',
-    isa => 'Str',
-    default => 'Default',
-    );
-
-# Source. Can be XML obj, JSON data struct, or string.
-# Not used if the witness is created by parsing a collation.
 has 'file' => (
 	is => 'ro',
 	isa => 'Str',
@@ -258,13 +270,6 @@ has 'use_text' => (
 	isa => 'Str',
 	);
 
-has 'msdesc' => (  # if we started with a TEI doc
-	is => 'ro',
-	isa => 'XML::LibXML::Element',
-	predicate => 'has_msdesc',
-	writer => '_save_msdesc',
-	);
-
 # Text.	 This is an array of strings (i.e. word tokens).
 # TODO Think about how to handle this for the case of pre-prepared
 # collations, where the tokens are in the graph already.
@@ -289,6 +294,7 @@ has 'path' => (
 	clearer => 'clear_path',
 	);		   
 
+## TODO change the name of this
 has 'uncorrected_path' => (
 	is => 'rw',
 	isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
@@ -365,7 +371,6 @@ sub _init_from_xmldesc {
 	# Get the identifier
 	if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 		my $descnode = $desc->get_node(1);
-		$self->_save_msdesc( $descnode );
 		# First try to use settlement/repository/idno.
 		my( $setNode, $reposNode, $idNode ) =
 			( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
@@ -422,13 +427,14 @@ sub _init_from_xmldesc {
 	           message => "No text element in document '" . $self->{'identifier'} . "!" );
 	}
 	
+	my @text = map { $_->text } @words;
+	my @layertext = map { $_->text } @layerwords;
 	$self->path( \@words );
-	my $a = join( ' ', map { $_->text } @words );
-	my $b = join( ' ', map { $_->text } @layerwords );
-	if( $a ne $b ) {
+	$self->text( \@text );
+	if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 		$self->uncorrected_path( \@layerwords );
+		$self->layertext( \@layertext );
 	}
-	# TODO set self->text
 }
 
 sub _tokenize_text {
@@ -471,7 +477,7 @@ sub _objectify_words {
 		# Hunt down each wrapped word/seg, and make an object (or two objects)
 		# of it, if necessary.
 		foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
-			my( $text, $uncorr ) = _get_word_object( $c );
+			my( $text, $uncorr ) = _get_word_strings( $c );
 # 			try {
 # 				( $text, $uncorr ) = _get_word_object( $c );
 # 			} catch( Text::Tradition::Error $e 
@@ -533,8 +539,8 @@ sub _get_word_strings {
 	my $word_excluded = 0;
 	my $xpc = _xpc_for_el( $node );
 	# TODO This does not cope with nested add/dels.
-	my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} );
-	my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} );
+	my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
+	my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 	foreach my $c ($node->childNodes() ) {
 		if( $c->nodeName eq 'num' 
 			&& defined $c->getAttribute( 'value' ) ) {
@@ -555,26 +561,28 @@ sub _get_word_strings {
 			$word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 			next;
 		} elsif( $c->nodeName eq 'add' ) {
-			my( $use, $discard ) = _get_text_from_node( $c );
+			my( $use, $discard ) = _get_word_strings( $c );
 			$text .= $use;
 		} elsif( $c->nodeName eq 'del' ) {
-			my( $discard, $use ) = _get_text_from_node( $c );
+			my( $discard, $use ) = _get_word_strings( $c );
 			$uncorrtext .= $use;
 		} else {
-			my $tagtxt;
+			my ( $tagtxt, $taguncorr );
 			if( ref( $c ) eq 'XML::LibXML::Text' ) {
 				# A text node.
 				$tagtxt = $c->textContent;
+				$taguncorr = $c->textContent;
 			} else {
-				$tagtxt = _get_text_from_node( $c );
+				( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 			}
 			if( $strip_leading_space ) {
 				$tagtxt =~ s/^[\s\n]+//s;
+				$taguncorr =~ s/^[\s\n]+//s;
 				# Unset the flag as soon as we see non-whitespace.
 				$strip_leading_space = 0 if $tagtxt;
 			}
 			$text .= $tagtxt;
-			$uncorrtext .= $tagtxt;
+			$uncorrtext .= $taguncorr;
 		} 
 	}
 	throw( ident => "text not found",
diff --git a/t/data/witnesses/group.xml b/t/data/witnesses/group.xml
new file mode 100644
index 0000000..feb971c
--- /dev/null
+++ b/t/data/witnesses/group.xml
@@ -0,0 +1,67 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:svg="http://www.w3.org/2000/svg"
+  xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns="http://www.tei-c.org/ns/1.0">
+  <teiHeader>
+    <fileDesc>
+      <titleStmt>
+        <title>Lorem ipsum</title>
+        <author>Cero</author>
+        <respStmt xml:id="tla">
+          <resp>Transcription by</resp>
+          <name> Tara L Andrews</name>
+        </respStmt>
+      </titleStmt>
+      <publicationStmt>
+        <p>Unpublished use case</p>
+      </publicationStmt>
+      <sourceDesc>
+        <msDesc xml:id="G">
+          <msIdentifier>
+            <msName>Lorem Ipsum Test</msName>
+          </msIdentifier>
+        </msDesc>
+      </sourceDesc>
+    </fileDesc>
+  </teiHeader>
+  <text>
+    <front>
+      <head>A selection of pseudo-Latin texts</head>
+    </front>
+    <group>
+      <text>
+        <body>
+          <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec nec mi et felis gravida 
+            hendrerit ornare eget lorem. Proin nisi sem, aliquam eget aliquet ut, lacinia sed erat. 
+            Morbi posuere euismod turpis et volutpat. Mauris suscipit, nisi eget dignissim con 
+sectetur, tortor felis tristique lorem, lacinia feugiat leo ante vel diam. Duis ac 
+            mauris libero, at suscipit risus. Curabitur orci nunc, commodo sed ornare sed, dictum et 
+            nunc. Nulla facilisi. Suspendisse vestibulum dignissim turpis, ut pellentesque orci 
+            convallis aliquet. Mauris metus purus, ullamcorper ut imperdiet et, tristique fermentum 
+            arcu. Donec congue blandit aliquet. Nunc semper mollis mollis. Nulla tempus, augue vitae 
+            iaculis vulputate, neque diam placerat risus, lacinia luctus purus mauris in ligula. 
+            Fusce vehicula eleifend pharetra. Cras nec libero diam, at semper lacus. Nulla 
+            tristique, ligula id lobortis volutpat, eros metus condimentum orci, in interdum lorem 
+            nisi ut justo. </p>
+        </body>
+      </text>
+      <text>
+        <body>
+          <p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec nec mi et felis gravida 
+            hendrerit ornare eget lorem. Proin nisi sem, aliquam eget aliquet ut, lacinia sed erat. 
+            Morbi posuere euismod turpis et volutpat. Mauris suscipit, nisi eget dignissim 
+            consectetur, tortor felis tristique lorem, lacinia feugiat leo ante vel diam. Duis ac 
+            mauris libero, at suscipit risus. Curabitur orci nunc, commodo sed ornare sed, dictum et 
+            nunc. Nulla facilisi. Suspendisse vestibulum dignissim turpis, ut pellentesque orci 
+            convallis aliquet. Mauris metus purus, ullamcorper ut imperdiet et, tristique fermentum 
+            arcu. Donec congue blandit aliquet. Nunc semper mollis mollis. Nulla tempus, augue vitae 
+            iaculis vulputate, neque diam placerat risus, lacinia luctus purus mauris in ligula. 
+            Fusce vehicula eleifend pharetra. Cras nec libero diam, at semper lacus. Nulla 
+            tristique, ligula id lobortis volutpat, eros metus condimentum orci, in interdum lorem 
+            nisi ut justo. Fusce felis ante, vestibulum condimentum aliquet eget, lobortis quis 
+            nibh. Quisque eget malesuada sem. Integer congue luctus rhoncus. </p>
+        </body>
+      </text>
+    </group>
+  </text>
+</TEI>
+
diff --git a/t/data/witnesses/teiwit.xml b/t/data/witnesses/teiwit.xml
index cc30d0c..3a0caa0 100644
--- a/t/data/witnesses/teiwit.xml
+++ b/t/data/witnesses/teiwit.xml
@@ -1,6 +1,5 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<?oxygen RNGSchema="tools/tei_ms_crit.rng" type="xml"?>
-<TEI xmlns:xi="http://www.w3.org/2001/XInclude" xmlns:svg="http://www.w3.org/2000/svg" xmlns:math="http://www.w3.org/1998/Math/MathML" xmlns="http://www.tei-c.org/ns/1.0">
+<TEI xmlns="http://www.tei-c.org/ns/1.0">
   <teiHeader>
     <fileDesc>
       <titleStmt>
@@ -35,7 +34,7 @@
 Õ¡ÖÕ¶ Õ¸ÖÖ Õ¡Õ¯Õ¡Õ¶Õ¡Õ¿Õ¥Õ½Ö Õ§Õ«Õ¶ Õ¬Õ¥Õ¡ÕÕ¬Ö <expan>Õ¡Õ´Õ¥Õ¶Õ¡ÕµÕ¶</expan> Õ¥Õ²Õ¥Õ¬Õ¸ÖÕ½, Ö Õ¶Õ¥Õ²<expan>Õ¸ÖÕ©Õ¥Õ¡Õ¶ÖÕ½</expan> Õ¡ÕµÕ½Õ¸ÖÕ«Õ¯ <lb/>
 Õ¦Õ¸Ö <expan>Õ¾Õ¡Õ½Õ¶</expan> Õ´Õ¥Õ²<ex resp="#tla">Õ¡</ex>Ö Õ¯ÖÕ¥<ex resp="#tla">Õ¡</ex>Ö Õ¿Õ¸ÖÕ¶Õ¶ Õ°Õ¡ÕµÕ¸ÕÖ Ö Õ¡Õ°<ex resp="#tla">Õ¡</ex>Ö<ex resp="#tla">Õ¸</ex>Ö Õ¦Õ¡ÕµÕ½ <expan>Õ¢Õ¡Õ¦Õ¸ÖÕ´</expan> <expan>Õ¡Õ¶Õ£Õ¡Õ´</expan> Õ¦Õ´Õ¿Õ¡Ö <lb/>
 Õ¡Õ®Õ¥<ex resp="#tla">Õ¡</ex>Õ¬ Õ¥Õ´ <expan>Õ¾Õ¡Õ½Õ¶</expan> Õ¡ÕµÕ½ ÕµÕ¥Õ¿Õ«Õ¶ <expan>ÕªÕ¡Õ´Õ¡Õ¶Õ¡Õ¯Õ«Õ¶</expan> Õ£ÖÕ¥Õ¬ Õ¦Õ¤Õ¡Õ¼Õ¶Õ¡Õ·Õ¸ÖÕ¶Õ¹ <abbr>Õ¯Õ¿ÖÕ®Õ½Õ¶</abbr>Ö <lb/></p>
-      <p><hi>Ô»Õ½Õ¯ Õ« Õ¬Õ«Õ¶Õ¥Õ¬ Õ©Õ¾Õ¡Õ¯Õ¡Õ¶<expan>Õ¸ÖÕ©Õ¥Õ¡Õ¶Õ½</expan> Õ°Õ¡ÕµÕ¸Ö Õ« ÕµÕ¡Õ´Õ½ <num value="502">ÔµÕÔ²</num> Õ¥Õ²Ö Õ¡Õ°</hi><lb/>
+      <p><hi>Ô»Õ½Õ¯ Õ« Õ¬Õ«Õ¶Õ¥Õ¬ Õ©Õ¾Õ¡Õ¯Õ¡Õ¶<expan>Õ¸ÖÕ©Õ¥Õ¡Õ¶Õ½</expan> Õ°Õ¡ÕµÕ¸Ö Õ« ÕµÕ¡Õ´Õ½ <num value="502">ÔµÕÔ²</num> Õ¥Õ²Ö</hi> Õ¡Õ°<lb/>
 Õ¡ÖÕ¸Ö Õ¶Õ·Õ¡Õ¶ Ö Õ½Õ¸Õ½Õ¯<ex resp="#tla">Õ¡</ex>Õ¬Õ«, Ö <abbr>Õ¯Õ¿ÖÕ®</abbr> Õ´Õ¥Õ® Õ« Õ¢<ex resp="#tla">Õ¡</ex>ÖÕ¯Õ¸ÖÕ©Õ¥Õ¶Õ§, Õ¥ÖÖÕ¥<ex resp="#tla">Õ¡</ex>Õ¬ Õ« Õ´Õ¥Õ® <lb/><expan>ÖÕ¡Õ²Õ¡ÖÕ¶</expan> Õ¡Õ¶Õ¿Õ«Õ¸Ö, Õ¦Õ¸Ö Õ« Õ¶Õ¥ÖÖÕ¸Õµ <expan>Õ¡ÖÕ¥Õ£Õ¡Õ¯Õ¡Õ¶Õ¶</expan> <expan>Õ£Õ¸ÖÕ®Õ¥ÖÕ¡ÖÕ</expan> Õ¡ÕµÕ½ Õ½ÖÕ¡Õ¶Õ¹Õ¥Õ¬Õ«Õ½ Õ¸Ö Õ§ÕÖ Õ¿Õ¥Õ½<lb/>
 Õ¸Õ²Õ¡ÖÕ¶ Õ¡Õ°Õ¡ÖÕ¸Ö Ö <expan>Õ°ÖÕ¡Õ·Õ¡Õ¬Õ«</expan>. Ö Õ¥Õ²Ö Õ¡ÕµÕ½ <subst><del>Õ«Õ·Õ­</del><add place="above">Õ¶Õ·</add></subst>Õ¡Õ¶Õ½ Õ¡ÕÕ° Ö Õ¤Õ¸Õ²Õ¸ÖÕ´Õ¶ <expan>Õ¡Õ´Õ¥Õ¶Õ¡ÕµÕ¶</expan> <ex>Õ°Õ¡ÖÕ¡Õ¿Õ¡Ö</ex><lb/>
 Õ¥Õ¬Õ¸Ö <expan>ÖÖÕ«Õ½Õ¿Õ¸Õ½Õ«</expan>, Õ¦Õ¸ÖÕ½ <del>Õµ</del>Õ¡Õµ<del>Õ¶</del>Õª<del>Õ¡</del>Õ´ Õ´Õ¥Õ®Õ¡Õ½Õ¡Õ½Õ¿ Õ½ÕºÕ¡Õ¼Õ¶Õ¡Õ¬Õ¥ÖÖ ÕµÕ¡ÕµÕ¿Õ¶Õ¥<ex resp="#tla">Õ¡</ex>Ö <expan>Õ¡Õ½Õ¿Õ¸ÖÕ¡Õ®</expan> <expan>Õ¦Õ¤Õ¡Õ¿Õ¡Õ½Õ¿Õ¡Õ¶</expan> <lb/>
diff --git a/t/text_tradition_witness.t b/t/text_tradition_witness.t
index 20e8f22..2457b2c 100644
--- a/t/text_tradition_witness.t
+++ b/t/text_tradition_witness.t
@@ -36,18 +36,29 @@ is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', 
 	"Found second JSON witness" );
 
-# # Test an XML witness via file
-# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', 
-# 	'file' => 't/data/witnesses/teiwit.xml' );
-# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
-# if( $xmlwit ) {
-# 	is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
-# 	ok( $xmlwit->is_layered, "Picked up correction layer" );
-# 	is( @{$xmlwit->path}, 185, "Got correct text length" );
-# 	is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" );
-# }
+# Test an XML witness via file
+my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', 
+	'file' => 't/data/witnesses/teiwit.xml' );
+is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
+if( $xmlwit ) {
+	is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
+	ok( $xmlwit->is_layered, "Picked up correction layer" );
+	is( @{$xmlwit->text}, 182, "Got correct text length" );
+	is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
+}
+my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
+is( @allwitwords, 184, "Reused appropriate readings" );
 
 ## Test use_text
+my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
+	'file' => 't/data/witnesses/group.xml',
+	'use_text' => '//tei:group/tei:text[2]' );
+is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
+if( $xpwit ) {
+	is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
+	ok( !$xpwit->is_layered, "Picked up no correction layer" );
+	is( @{$xpwit->text}, 157, "Got correct text length" );
+}
 }