1 package Text::Tradition::Witness;
6 use Moose::Util::TypeConstraints;
7 use Text::TEI::Markup qw( word_tag_wrap );
9 use XML::Easy::Syntax qw( $xml10_name_rx );
13 Text::Tradition::Witness - a manuscript witness to a text tradition
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
20 'identifier' => 'Oxford MS Ex.1932',
25 Text::Tradition::Witness is an object representation of a manuscript
26 witness to a text tradition. A manuscript has a sigil (a short code that
27 represents it in the wider tradition), an identifier (e.g. the library ID),
34 Create a new witness. Options include:
38 =item * sigil - A short code to represent the manuscript. Required.
40 =item * sourcetype - What sort of witness data this is. Options are
41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42 used by Collation parsers.)
48 The data source for the witness. Use the appropriate option.
50 =item * use_text - An initialization option. If the witness is read from a
51 TEI document and more than one <text/> tag exists therein, the default
52 behavior is to use the first defined text. If this is not desired,
53 use_text should be set to an XPath expression that will select the correct
56 =item * identifier - The recognized name of the manuscript, e.g. a library
57 identifier. Taken from the msDesc element for a TEI file.
59 =item * other_info - A freeform string for any other description of the
66 The sigil by which to identify this manuscript, which must conform to the
67 specification for XML attribute strings (broadly speaking, it must begin
68 with a letter and can have only a few sorts of punctuation characters in
73 A freeform name by which to identify the manuscript, which may be longer
74 than the sigil. Defaults to 'Unidentified ms', but will be taken from the
75 TEI msName attribute, or constructed from the settlement and idno if
80 The city, town, etc. where the manuscript is held. Will be read from the
81 TEI msDesc element if supplied.
85 The institution that holds the manuscript. Will be read from the TEI msDesc
90 The identification or call number of the manuscript. Will be read from the
91 TEI msDesc element if supplied.
95 An array of strings (words) that contains the text of the
96 manuscript. This should not change after the witness has been
97 instantiated, and the path through the collation should always match it.
101 An array of strings (words) that contains the layered
102 text, if any, of the manuscript. This should not change after the witness
103 has been instantiated, and the path through the collation should always
108 Accessor method for the witness identifier.
112 Accessor method for the general witness description.
116 Boolean method that returns a true value if the witness was created with a
117 data source (that is, a file, string, or object to be parsed).
121 Boolean method to note whether the witness has layers (e.g. pre-correction
122 readings) in the collation.
127 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
128 my $c = $trad->collation;
130 # Test a plaintext witness via string
131 my $str = 'This is a line of text';
132 my $ptwit = $trad->add_witness(
134 'sourcetype' => 'plaintext',
137 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
139 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
140 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
143 # Test some JSON witnesses via object
144 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
145 binmode( JSIN, ':encoding(UTF-8)' );
148 $trad->add_json_witnesses( join( '', @lines ) );
149 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
150 "Found first JSON witness" );
151 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
152 "Found second JSON witness" );
154 # Test an XML witness via file
155 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
156 'file' => 't/data/witnesses/teiwit.xml' );
157 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
159 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
160 ok( $xmlwit->is_layered, "Picked up correction layer" );
161 is( @{$xmlwit->text}, 182, "Got correct text length" );
162 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
164 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
165 is( @allwitwords, 184, "Reused appropriate readings" );
168 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
169 'file' => 't/data/witnesses/group.xml',
170 'use_text' => '//tei:group/tei:text[2]' );
171 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
173 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
174 ok( !$xpwit->is_layered, "Picked up no correction layer" );
175 is( @{$xpwit->text}, 157, "Got correct text length" );
183 # Enable plugin(s) if available
184 eval { with 'Text::Tradition::WitLanguage'; };
186 subtype 'SourceType',
188 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
189 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
193 where { $_ =~ /\A$xml10_name_rx\z/ },
194 message { 'Sigil must be a valid XML attribute string' };
196 no Moose::Util::TypeConstraints;
200 'isa' => 'Text::Tradition',
205 # Sigil. Required identifier for a witness, but may be found inside
210 predicate => 'has_sigil',
211 writer => '_set_sigil',
214 # Other identifying information
215 has 'identifier' => (
220 has 'settlement' => (
225 has 'repository' => (
235 # Source. Can be XML obj, JSON data struct, or string.
236 # Not used if the witness is created by parsing a collation.
237 has 'sourcetype' => (
246 predicate => 'has_file',
252 predicate => 'has_string',
255 has 'object' => ( # could be anything.
257 predicate => 'has_object',
258 clearer => 'clear_object',
261 # In the case of a TEI document with multiple texts, specify
262 # which text is the root. Should be an XPath expression.
268 # Text. This is an array of strings (i.e. word tokens).
269 # TODO Think about how to handle this for the case of pre-prepared
270 # collations, where the tokens are in the graph already.
273 isa => 'ArrayRef[Str]',
274 predicate => 'has_text',
279 isa => 'ArrayRef[Str]',
280 predicate => 'has_layertext',
283 has 'is_collated' => (
288 # Path. This is an array of Reading nodes that can be saved during
289 # initialization, but should be cleared before saving in a DB.
292 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
293 predicate => 'has_path',
294 clearer => 'clear_path',
297 ## TODO change the name of this
298 has 'uncorrected_path' => (
300 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
301 clearer => 'clear_uncorrected_path',
304 has 'is_layered' => (
309 # If we set an uncorrected path, ever, remember that we did so.
310 around 'uncorrected_path' => sub {
314 $self->is_layered( 1 );
320 if( $self->has_source ) {
321 my $init_sub = '_init_from_' . $self->sourcetype;
323 # Remove our XML / source objects; we no longer need them.
324 $self->clear_object if $self->has_object;
325 # $self->tradition->collation->make_witness_path( $self );
327 if( $self->sourcetype eq 'collation' ) {
328 $self->is_collated( 1 );
335 return $self->has_file || $self->has_string || $self->has_object;
338 sub _init_from_xmldesc {
341 if( $self->has_object ) {
342 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
343 throw( ident => "bad source",
344 message => "Source object must be an XML::LibXML::Element (this is "
345 . ref( $self->object ) . ");" );
347 $xmlobj = $self->object;
350 my $parser = XML::LibXML->new();
351 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
353 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
354 } catch( XML::LibXML::Error $e ) {
355 throw( ident => "bad source",
356 message => "XML parsing error: " . $e->as_string );
360 unless( $xmlobj->nodeName eq 'TEI' ) {
361 throw( ident => "bad source",
362 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
365 # Set up the tags we need, with or without namespaces.
366 map { $tags{$_} = "//$_" }
367 qw/ msDesc msName settlement repository idno p lg w seg add del /;
368 # Set up our XPath object
369 my $xpc = _xpc_for_el( $xmlobj );
370 # Use namespace-aware tags if we have to
371 if( $xmlobj->namespaceURI ) {
372 map { $tags{$_} = "//tei:$_" } keys %tags;
376 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
377 my $descnode = $desc->get_node(1);
378 # First try to use settlement/repository/idno.
379 my( $setNode, $reposNode, $idNode ) =
380 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
381 $xpc->find( $tags{repository}, $descnode )->get_node(1),
382 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
383 $self->settlement( $setNode ? $setNode->textContent : '' );
384 $self->repository( $reposNode ? $reposNode->textContent : '' );
385 $self->idno( $idNode ? $idNode->textContent : '' );
386 if( $self->settlement && $self->idno ) {
387 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
389 # Look for an msName.
390 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
392 $self->identifier( $msNameNode->textContent );
394 # We have an msDesc but who knows what is in it?
395 my $desc = $descnode->textContent;
398 $self->identifier( $desc );
401 if( $descnode->hasAttribute('xml:id') ) {
402 $self->_set_sigil( $descnode->getAttribute('xml:id') );
403 } elsif( !$self->has_sigil ) {
404 throw( ident => 'missing sigil',
405 message => 'Could not find xml:id witness sigil' );
408 throw( ident => "bad source",
409 message => "Could not find manuscript description element in TEI header" );
412 # Now get the words out.
414 my @layerwords; # if the witness has layers
415 # First, make sure all the words are wrapped in tags.
416 # TODO Make this not necessarily dependent upon whitespace...
417 word_tag_wrap( $xmlobj );
418 # Now go text hunting.
420 if( $self->use_text ) {
421 @textnodes = $xpc->findnodes( $self->use_text );
423 # Use the first 'text' node in the document.
424 @textnodes = $xmlobj->getElementsByTagName( 'text' );
426 my $teitext = $textnodes[0];
428 _tokenize_text( $self, $teitext, \@words, \@layerwords );
430 throw( ident => "bad source",
431 message => "No text element in document '" . $self->{'identifier'} . "!" );
434 my @text = map { $_->text } @words;
435 my @layertext = map { $_->text } @layerwords;
436 $self->path( \@words );
437 $self->text( \@text );
438 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
439 $self->uncorrected_path( \@layerwords );
440 $self->layertext( \@layertext );
445 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
446 # Strip out the words.
447 my $xpc = _xpc_for_el( $teitext );
448 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
451 if( my $n = $_->getAttribute( 'n' ) ) {
452 $place_str = '#DIV_' . $n . '#';
454 $place_str = '#DIV#';
456 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
459 # But maybe we don't have any divs. Just paragraphs.
461 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
465 sub _objectify_words {
466 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
468 my $xpc = _xpc_for_el( $element );
469 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
470 my @pgraphs = $xpc->findnodes( $xpexpr );
471 return () unless @pgraphs;
472 # Set up an expression to look for words and segs
473 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
474 foreach my $pg ( @pgraphs ) {
475 # If this paragraph is the descendant of a note element,
477 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
478 next if scalar @noop_container;
479 # Get the text of each node
481 # Hunt down each wrapped word/seg, and make an object (or two objects)
482 # of it, if necessary.
483 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
484 my( $text, $uncorr ) = _get_word_strings( $c );
486 # ( $text, $uncorr ) = _get_word_object( $c );
487 # } catch( Text::Tradition::Error $e
488 # where { $_->has_tag( 'lb' ) } ) {
491 unless( defined $text || defined $uncorr ) {
492 print STDERR "WARNING: no text in node " . $c->nodeName
493 . "\n" unless $c->nodeName eq 'lb';
496 print STDERR "DEBUG: space found in element node "
497 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
499 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
500 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
503 my $id = $self->sigil . 'r' . $ctr;
504 my( $word, $acword );
506 $word = $self->tradition->collation->add_reading(
507 { 'id' => $id, 'text' => $text });
509 if( $uncorr && $uncorr ne $text ) {
511 $acword = $self->tradition->collation->add_reading(
512 { 'id' => $id, 'text' => $uncorr });
517 # if( $first_word ) {
519 # # Set the relevant sectioning markers
521 # $w->add_placeholder( $divmarker );
522 # $divmarker = undef;
524 # $w->add_placeholder( '#PG#' );
526 push( @$wordlist, $word ) if $word;
527 push( @$uncorrlist, $acword ) if $acword;
532 # Given a word or segment node, make a Reading object for the word
533 # therein. Make two Reading objects if there is an 'uncorrected' vs.
536 sub _get_word_strings {
538 my( $text, $uncorrtext );
539 # We can have an lb or pb in the middle of a word; if we do, the
540 # whitespace (including \n) after the break becomes insignificant
541 # and we want to nuke it.
542 my $strip_leading_space = 0;
543 my $word_excluded = 0;
544 my $xpc = _xpc_for_el( $node );
545 # TODO This does not cope with nested add/dels.
546 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
547 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
548 foreach my $c ($node->childNodes() ) {
549 if( $c->nodeName eq 'num'
550 && defined $c->getAttribute( 'value' ) ) {
552 $text .= $c->getAttribute( 'value' ) unless @deletion;
553 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
554 # If this is just after a line/page break, return to normal behavior.
555 $strip_leading_space = 0;
556 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
557 # Set a flag that strips leading whitespace until we
558 # get to the next bit of non-whitespace.
559 $strip_leading_space = 1;
560 } elsif ( $c->nodeName eq 'fw' # for catchwords
561 || $c->nodeName eq 'sic'
562 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
563 || $c->textContent eq ''
564 || ref( $c ) eq 'XML::LibXML::Comment' ) {
565 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
567 } elsif( $c->nodeName eq 'add' ) {
568 my( $use, $discard ) = _get_word_strings( $c );
570 } elsif( $c->nodeName eq 'del' ) {
571 my( $discard, $use ) = _get_word_strings( $c );
574 my ( $tagtxt, $taguncorr );
575 if( ref( $c ) eq 'XML::LibXML::Text' ) {
577 $tagtxt = $c->textContent;
578 $taguncorr = $c->textContent;
580 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
582 if( $strip_leading_space ) {
583 $tagtxt =~ s/^[\s\n]+//s;
584 $taguncorr =~ s/^[\s\n]+//s;
585 # Unset the flag as soon as we see non-whitespace.
586 $strip_leading_space = 0 if $tagtxt;
589 $uncorrtext .= $taguncorr;
592 throw( ident => "text not found",
593 tags => [ $node->nodeName ],
594 message => "No text found in node " . $node->toString(0) )
595 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
596 return( $text, $uncorrtext );
600 my( $self, $string, $c ) = @_;
601 my @raw_words = split( /\s+/, $string );
603 foreach my $w ( @raw_words ) {
604 my $id = $self->sigil . 'r'. $c++;
605 my %opts = ( 'text' => $w, 'id' => $id );
606 my $w_obj = $self->tradition->collation->add_reading( \%opts );
607 # Skip any words that have been canonized out of existence.
608 next if( length( $w_obj->text ) == 0 );
609 push( @words, $w_obj );
614 sub _init_from_json {
617 if( $self->has_object ) {
618 $wit = $self->object;
619 } elsif( $self->has_string ) {
620 $wit = from_json( $self->string );
621 } elsif( $self->has_file ) {
622 my $ok = open( INPUT, $self->file );
624 throw( ident => "bad source",
625 message => 'Could not open ' . $self->file . ' for reading' );
627 binmode( INPUT, ':encoding(UTF-8)' );
630 $wit = from_json( join( '', @lines ) );
633 if( exists $wit->{'id'} ) {
634 $self->_set_sigil( $wit->{'id'} );
635 } elsif( !$self->has_sigil ) {
636 throw( ident => 'missing sigil',
637 message => 'Could not find witness sigil (id) in JSON spec' );
639 $self->identifier( $wit->{'name'} );
642 my( @text, @layertext );
643 if( exists $wit->{'content'} ) {
644 # We need to tokenize the text ourselves.
645 @words = _split_words( $self, $wit->{'content'} );
646 } elsif( exists $wit->{'tokens'} ) {
647 # We have a bunch of pretokenized words.
649 foreach my $token ( @{$wit->{'tokens'}} ) {
650 my $w_obj = $self->tradition->collation->add_reading({
651 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
652 push( @words, $w_obj );
653 push( @text, $token->{'t'} ); # TODO unless...?
655 ## TODO rethink this JSOn mechanism
656 if( exists $wit->{'layertokens'} ) {
657 foreach my $token ( @{$wit->{'layertokens'}} ) {
658 my $w_obj = $self->tradition->collation->add_reading({
659 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
660 push( @layerwords, $w_obj );
661 push( @layertext, $token->{'t'} );
665 $self->text( \@text );
666 $self->layertext( \@layertext ) if @layertext;
667 $self->path( \@words );
668 $self->uncorrected_path( \@layerwords ) if @layerwords;
671 sub _init_from_plaintext {
673 unless( $self->has_sigil ) {
674 throw( "No sigil defined for the plaintext witness" );
677 if( $self->has_file ) {
678 my $ok = open( INPUT, $self->file );
680 throw( ident => "bad source",
681 message => 'Could not open ' . $self->file . ' for reading' );
683 binmode( INPUT, ':encoding(UTF-8)' );
686 $str = join( '', @lines );
687 } elsif( $self->has_object ) { # ...seriously?
688 $str = ${$self->object};
690 $str = $self->string;
693 # TODO allow a different word separation expression
694 my @text = split( /\s+/, $str );
695 $self->text( \@text );
696 my @words = _split_words( $self, $str );
697 $self->path( \@words );
701 Text::Tradition::Error->throw(
702 'ident' => 'Witness parsing error',
709 my $xpc = XML::LibXML::XPathContext->new( $el );
710 if( $el->namespaceURI ) {
711 $xpc->registerNs( 'tei', $el->namespaceURI );
716 =head2 export_as_json
718 Exports the witness as a JSON structure, with the following keys:
722 =item * id - The witness sigil
724 =item * name - The witness identifier
726 =item * tokens - An array of hashes of the form { "t":"WORD" }
733 my $trad = Text::Tradition->new();
735 my @text = qw/ Thhis is a line of text /;
736 my $wit = $trad->add_witness(
738 'string' => join( ' ', @text ),
739 'sourcetype' => 'plaintext',
740 'identifier' => 'test witness',
742 my $jsonstruct = $wit->export_as_json;
743 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
744 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
745 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
746 foreach my $idx ( 0 .. $#text ) {
747 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
750 my @ctext = qw( when april with his showers sweet with fruit the drought of march
751 has pierced unto the root );
752 $trad = Text::Tradition->new(
753 'input' => 'CollateX',
754 'file' => 't/data/Collatex-16.xml' );
756 $jsonstruct = $trad->witness('A')->export_as_json;
757 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
758 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
759 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
760 foreach my $idx ( 0 .. $#ctext ) {
761 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
764 ## TODO test layertext export
772 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
774 'id' => $self->sigil,
775 'tokens' => \@wordlist,
776 'name' => $self->identifier,
778 if( $self->is_layered ) {
779 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
780 $obj->{'layertokens'} = \@lwlist;
786 __PACKAGE__->meta->make_immutable;
792 =item * Figure out how to serialize a witness
794 =item * Support encodings other than UTF-8
800 This package is free software and is provided "as is" without express
801 or implied warranty. You can redistribute it and/or modify it under
802 the same terms as Perl itself.
806 Tara L Andrews E<lt>aurum@cpan.orgE<gt>