X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FWitness.pm;h=ab5b2f42f5839d0ad350264d511234ff5562ef2e;hb=428bcf0bc79f77a7857b21ef881708faa792e33a;hp=410365100097517e0a62a79fcc882eaa76a65308;hpb=fae52efdaeb4b67f58e47ec8ebe110537f2535ea;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index 4103651..ab5b2f4 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -119,6 +119,11 @@ Accessor method for the witness identifier.
 
 Accessor method for the general witness description.
 
+=head2 has_source
+
+Boolean method that returns a true value if the witness was created with a
+data source (that is, a file, string, or object to be parsed).
+
 =head2 is_layered
 
 Boolean method to note whether the witness has layers (e.g. pre-correction 
@@ -143,29 +148,41 @@ if( $ptwit ) {
     is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
 }
 
-# # Test some JSON witnesses via object
-# open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
-# binmode( JSIN, ':encoding(UTF-8)' );
-# my @lines = <JSIN>;
-# close JSIN;
-# $trad->add_json_witnesses( join( '', @lines ) );
-# is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', 
-# 	"Found first JSON witness" );
-# is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', 
-# 	"Found second JSON witness" );
-# 
-# # Test an XML witness via file
-# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', 
-# 	'file' => 't/data/witnesses/teiwit.xml' );
-# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
-# if( $xmlwit ) {
-# 	is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
-# 	ok( $xmlwit->is_layered, "Picked up correction layer" );
-# 	is( @{$xmlwit->path}, 185, "Got correct text length" );
-# 	is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" );
-# }
+# Test some JSON witnesses via object
+open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
+binmode( JSIN, ':encoding(UTF-8)' );
+my @lines = <JSIN>;
+close JSIN;
+$trad->add_json_witnesses( join( '', @lines ) );
+is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', 
+	"Found first JSON witness" );
+is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', 
+	"Found second JSON witness" );
+
+# Test an XML witness via file
+my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', 
+	'file' => 't/data/witnesses/teiwit.xml' );
+is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
+if( $xmlwit ) {
+	is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
+	ok( $xmlwit->is_layered, "Picked up correction layer" );
+	is( @{$xmlwit->text}, 182, "Got correct text length" );
+	is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
+}
+my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
+is( @allwitwords, 184, "Reused appropriate readings" );
 
 ## Test use_text
+my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
+	'file' => 't/data/witnesses/group.xml',
+	'use_text' => '//tei:group/tei:text[2]' );
+is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
+if( $xpwit ) {
+	is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
+	ok( !$xpwit->is_layered, "Picked up no correction layer" );
+	is( @{$xpwit->text}, 157, "Got correct text length" );
+}
+
 
 =end testing 
 
@@ -198,6 +215,12 @@ has 'sigil' => (
 	writer => '_set_sigil',
 	);
 	
+has 'language' => (
+    is => 'ro',
+    isa => 'Str',
+    default => 'Default',
+    );
+
 # Other identifying information
 has 'identifier' => (
 	is => 'rw',
@@ -219,20 +242,14 @@ has 'idno' => (
 	isa => 'Str',
 	);
 
+# Source. Can be an XML object, a JSON data structure, or a string.
+# Not used if the witness is created by parsing a collation.
 has 'sourcetype' => (
 	is => 'ro',
 	isa => 'SourceType',
 	required => 1, 
 );
 
-has 'language' => (
-    is => 'ro',
-    isa => 'Str',
-    default => 'Default',
-    );
-
-# Source. Can be XML obj, JSON data struct, or string.
-# Not used if the witness is created by parsing a collation.
 has 'file' => (
 	is => 'ro',
 	isa => 'Str',
@@ -258,13 +275,6 @@ has 'use_text' => (
 	isa => 'Str',
 	);
 
-has 'msdesc' => (  # if we started with a TEI doc
-	is => 'ro',
-	isa => 'XML::LibXML::Element',
-	predicate => 'has_msdesc',
-	writer => '_save_msdesc',
-	);
-
 # Text.	 This is an array of strings (i.e. word tokens).
 # TODO Think about how to handle this for the case of pre-prepared
 # collations, where the tokens are in the graph already.
@@ -289,6 +299,7 @@ has 'path' => (
 	clearer => 'clear_path',
 	);		   
 
+## TODO change the name of this attribute
 has 'uncorrected_path' => (
 	is => 'rw',
 	isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
@@ -337,6 +348,7 @@ sub _init_from_xmldesc {
 		}
 		$xmlobj = $self->object;
 	} else {
+		require XML::LibXML;
 		my $parser = XML::LibXML->new();
 		my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
 		try {
@@ -365,7 +377,6 @@ sub _init_from_xmldesc {
 	# Get the identifier
 	if( my $desc = $xpc->find( $tags{msDesc} ) ) {
 		my $descnode = $desc->get_node(1);
-		$self->_save_msdesc( $descnode );
 		# First try to use settlement/repository/idno.
 		my( $setNode, $reposNode, $idNode ) =
 			( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
@@ -392,7 +403,8 @@ sub _init_from_xmldesc {
         if( $descnode->hasAttribute('xml:id') ) {
 			$self->_set_sigil( $descnode->getAttribute('xml:id') );
 		} elsif( !$self->has_sigil ) {
-			throw( 'Could not find xml:id witness sigil' );
+			throw( ident => 'missing sigil',
+				   message => 'Could not find xml:id witness sigil' );
 		}
 	} else {
 	    throw( ident => "bad source",
@@ -421,13 +433,14 @@ sub _init_from_xmldesc {
 	           message => "No text element in document '" . $self->{'identifier'} . "!" );
 	}
 	
+	my @text = map { $_->text } @words;
+	my @layertext = map { $_->text } @layerwords;
 	$self->path( \@words );
-	my $a = join( ' ', map { $_->text } @words );
-	my $b = join( ' ', map { $_->text } @layerwords );
-	if( $a ne $b ) {
+	$self->text( \@text );
+	if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
 		$self->uncorrected_path( \@layerwords );
+		$self->layertext( \@layertext );
 	}
-	# TODO set self->text
 }
 
 sub _tokenize_text {
@@ -470,7 +483,7 @@ sub _objectify_words {
 		# Hunt down each wrapped word/seg, and make an object (or two objects)
 		# of it, if necessary.
 		foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
-			my( $text, $uncorr ) = _get_word_object( $c );
+			my( $text, $uncorr ) = _get_word_strings( $c );
 # 			try {
 # 				( $text, $uncorr ) = _get_word_object( $c );
 # 			} catch( Text::Tradition::Error $e 
@@ -532,8 +545,8 @@ sub _get_word_strings {
 	my $word_excluded = 0;
 	my $xpc = _xpc_for_el( $node );
 	# TODO This does not cope with nested add/dels.
-	my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} );
-	my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} );
+	my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
+	my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
 	foreach my $c ($node->childNodes() ) {
 		if( $c->nodeName eq 'num' 
 			&& defined $c->getAttribute( 'value' ) ) {
@@ -554,26 +567,28 @@ sub _get_word_strings {
 			$word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
 			next;
 		} elsif( $c->nodeName eq 'add' ) {
-			my( $use, $discard ) = _get_text_from_node( $c );
+			my( $use, $discard ) = _get_word_strings( $c );
 			$text .= $use;
 		} elsif( $c->nodeName eq 'del' ) {
-			my( $discard, $use ) = _get_text_from_node( $c );
+			my( $discard, $use ) = _get_word_strings( $c );
 			$uncorrtext .= $use;
 		} else {
-			my $tagtxt;
+			my ( $tagtxt, $taguncorr );
 			if( ref( $c ) eq 'XML::LibXML::Text' ) {
 				# A text node.
 				$tagtxt = $c->textContent;
+				$taguncorr = $c->textContent;
 			} else {
-				$tagtxt = _get_text_from_node( $c );
+				( $tagtxt, $taguncorr ) = _get_word_strings( $c );
 			}
 			if( $strip_leading_space ) {
 				$tagtxt =~ s/^[\s\n]+//s;
+				$taguncorr =~ s/^[\s\n]+//s;
 				# Unset the flag as soon as we see non-whitespace.
 				$strip_leading_space = 0 if $tagtxt;
 			}
 			$text .= $tagtxt;
-			$uncorrtext .= $tagtxt;
+			$uncorrtext .= $taguncorr;
 		} 
 	}
 	throw( ident => "text not found",
@@ -603,14 +618,30 @@ sub _init_from_json {
 	my $wit;
 	if( $self->has_object ) {
 		$wit = $self->object;
-	} else {
-	
+	} elsif( $self->has_string ) {
+		$wit = from_json( $self->string );
+	} elsif( $self->has_file ) {
+    	my $ok = open( INPUT, $self->file );
+    	unless( $ok ) {
+			throw( ident => "bad source",
+				   message => 'Could not open ' . $self->file . ' for reading' );
+    	}
+    	binmode( INPUT, ':encoding(UTF-8)' );
+    	my @lines = <INPUT>;
+    	close INPUT;
+    	$wit = from_json( join( '', @lines ) );
 	}
 	
-	$self->sigil( $wit->{'id'} );
+	if( exists $wit->{'id'} ) {
+		$self->_set_sigil( $wit->{'id'} );
+	} elsif( !$self->has_sigil ) {
+		throw( ident => 'missing sigil',
+			   message => 'Could not find witness sigil (id) in JSON spec' );
+	}
 	$self->identifier( $wit->{'name'} );
 	my @words;
 	my @layerwords;
+	my( @text, @layertext );
 	if( exists $wit->{'content'} ) {
 		# We need to tokenize the text ourselves.
 		@words = _split_words( $self, $wit->{'content'} );
@@ -619,19 +650,22 @@ sub _init_from_json {
 		my $ctr = 0;
 		foreach my $token ( @{$wit->{'tokens'}} ) {
 			my $w_obj = $self->tradition->collation->add_reading({
-				'text' => $token, 'id' => $self->sigil . 'r' . $ctr++ });
+				'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 			push( @words, $w_obj );
+			push( @text, $token->{'t'} ); # TODO unless...?
 		}
 		## TODO rethink this JSOn mechanism
 		if( exists $wit->{'layertokens'} ) {
 			foreach my $token ( @{$wit->{'layertokens'}} ) {
 				my $w_obj = $self->tradition->collation->add_reading({
-					'text' => $token, 'id' => $self->sigil . 'r' . $ctr++ });
+					'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
 				push( @layerwords, $w_obj );
+				push( @layertext, $token->{'t'} );
 			}
 		}
 	}
-	# TODO set self->text
+	$self->text( \@text );
+	$self->layertext( \@layertext ) if @layertext;
 	$self->path( \@words );
 	$self->uncorrected_path( \@layerwords ) if @layerwords;
 }
@@ -754,6 +788,8 @@ __PACKAGE__->meta->make_immutable;
 
 =over
 
+=item * Figure out how to serialize a witness
+
 =item * Support encodings other than UTF-8
 
 =back