1 package Text::Tradition::Witness;
6 use Moose::Util::TypeConstraints;
7 use Text::TEI::Markup qw( word_tag_wrap );
9 use XML::Easy::Syntax qw( $xml10_name_rx );
13 Text::Tradition::Witness - a manuscript witness to a text tradition
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
20 'identifier' => 'Oxford MS Ex.1932',
25 Text::Tradition::Witness is an object representation of a manuscript
26 witness to a text tradition. A manuscript has a sigil (a short code that
27 represents it in the wider tradition), an identifier (e.g. the library ID),
34 Create a new witness. Options include:
38 =item * sigil - A short code to represent the manuscript. Required.
40 =item * sourcetype - What sort of witness data this is. Options are
41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42 used by Collation parsers.)
48 The data source for the witness. Use the appropriate option.
50 =item * use_text - An initialization option. If the witness is read from a
51 TEI document and more than one <text/> tag exists therein, the default
52 behavior is to use the first defined text. If this is not desired,
53 use_text should be set to an XPath expression that will select the correct
56 =item * language - The name of the applicable L<Text::Tradition::Lang>
57 module for language handling. Usually inherited from the language set in
58 the L<Text::Tradition> object, and defaults to Default.
60 =item * identifier - The recognized name of the manuscript, e.g. a library
61 identifier. Taken from the msDesc element for a TEI file.
63 =item * other_info - A freeform string for any other description of the
70 The sigil by which to identify this manuscript, which must conform to the
71 specification for XML attribute strings (broadly speaking, it must begin
72 with a letter and can have only a few sorts of punctuation characters in
77 A freeform name by which to identify the manuscript, which may be longer
78 than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79 TEI msName attribute, or constructed from the settlement and idno if
84 The city, town, etc. where the manuscript is held. Will be read from the
85 TEI msDesc element if supplied.
89 The institution that holds the manuscript. Will be read from the TEI msDesc
94 The identification or call number of the manuscript. Will be read from the
95 TEI msDesc element if supplied.
99 An array of strings (words) that contains the text of the
100 manuscript. This should not change after the witness has been
101 instantiated, and the path through the collation should always match it.
105 An array of strings (words) that contains the layered
106 text, if any, of the manuscript. This should not change after the witness
107 has been instantiated, and the path through the collation should always
112 Accessor method to get the witness language.
116 Accessor method for the witness identifier.
120 Accessor method for the general witness description.
124 Boolean method that returns a true value if the witness was created with a
125 data source (that is, a file, string, or object to be parsed).
129 Boolean method to note whether the witness has layers (e.g. pre-correction
130 readings) in the collation.
135 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
136 my $c = $trad->collation;
138 # Test a plaintext witness via string
139 my $str = 'This is a line of text';
140 my $ptwit = $trad->add_witness(
142 'sourcetype' => 'plaintext',
145 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
147 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
148 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
151 # Test some JSON witnesses via object
152 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
153 binmode( JSIN, ':encoding(UTF-8)' );
156 $trad->add_json_witnesses( join( '', @lines ) );
157 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
158 "Found first JSON witness" );
159 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
160 "Found second JSON witness" );
162 # Test an XML witness via file
163 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
164 'file' => 't/data/witnesses/teiwit.xml' );
165 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
167 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
168 ok( $xmlwit->is_layered, "Picked up correction layer" );
169 is( @{$xmlwit->text}, 182, "Got correct text length" );
170 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
172 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
173 is( @allwitwords, 184, "Reused appropriate readings" );
176 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
177 'file' => 't/data/witnesses/group.xml',
178 'use_text' => '//tei:group/tei:text[2]' );
179 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
181 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
182 ok( !$xpwit->is_layered, "Picked up no correction layer" );
183 is( @{$xpwit->text}, 157, "Got correct text length" );
191 subtype 'SourceType',
193 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
194 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
198 where { $_ =~ /\A$xml10_name_rx\z/ },
199 message { 'Sigil must be a valid XML attribute string' };
201 no Moose::Util::TypeConstraints;
205 'isa' => 'Text::Tradition',
209 # Sigil. Required identifier for a witness, but may be found inside
214 predicate => 'has_sigil',
215 writer => '_set_sigil',
221 default => 'Default',
224 # Other identifying information
225 has 'identifier' => (
230 has 'settlement' => (
235 has 'repository' => (
245 # Source. Can be XML obj, JSON data struct, or string.
246 # Not used if the witness is created by parsing a collation.
247 has 'sourcetype' => (
256 predicate => 'has_old_source',
257 clearer => 'clear_old_source',
263 predicate => 'has_file',
269 predicate => 'has_string',
272 has 'object' => ( # could be anything.
274 predicate => 'has_object',
275 clearer => 'clear_object',
278 # In the case of a TEI document with multiple texts, specify
279 # which text is the root. Should be an XPath expression.
285 # Text. This is an array of strings (i.e. word tokens).
286 # TODO Think about how to handle this for the case of pre-prepared
287 # collations, where the tokens are in the graph already.
290 isa => 'ArrayRef[Str]',
291 predicate => 'has_text',
296 isa => 'ArrayRef[Str]',
297 predicate => 'has_layertext',
300 # Path. This is an array of Reading nodes that can be saved during
301 # initialization, but should be cleared before saving in a DB.
304 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
305 predicate => 'has_path',
306 clearer => 'clear_path',
309 ## TODO change the name of this
310 has 'uncorrected_path' => (
312 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
313 clearer => 'clear_uncorrected_path',
316 has 'is_layered' => (
321 # If we set an uncorrected path, ever, remember that we did so.
322 around 'uncorrected_path' => sub {
326 $self->is_layered( 1 );
332 if( $self->has_source ) {
333 my $init_sub = '_init_from_' . $self->sourcetype;
335 # Remove our XML / source objects; we no longer need them.
336 $self->clear_object if $self->has_object;
337 $self->tradition->collation->make_witness_path( $self );
344 return $self->has_file || $self->has_string || $self->has_object;
347 sub _init_from_xmldesc {
350 if( $self->has_object ) {
351 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
352 throw( ident => "bad source",
353 message => "Source object must be an XML::LibXML::Element (this is "
354 . ref( $self->object ) . ");" );
356 $xmlobj = $self->object;
358 my $parser = XML::LibXML->new();
359 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
361 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
362 } catch( XML::LibXML::Error $e ) {
363 throw( ident => "bad source",
364 message => "XML parsing error: " . $e->as_string );
368 unless( $xmlobj->nodeName eq 'TEI' ) {
369 throw( ident => "bad source",
370 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
373 # Set up the tags we need, with or without namespaces.
374 map { $tags{$_} = "//$_" }
375 qw/ msDesc msName settlement repository idno p lg w seg add del /;
376 # Set up our XPath object
377 my $xpc = _xpc_for_el( $xmlobj );
378 # Use namespace-aware tags if we have to
379 if( $xmlobj->namespaceURI ) {
380 map { $tags{$_} = "//tei:$_" } keys %tags;
384 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
385 my $descnode = $desc->get_node(1);
386 # First try to use settlement/repository/idno.
387 my( $setNode, $reposNode, $idNode ) =
388 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
389 $xpc->find( $tags{repository}, $descnode )->get_node(1),
390 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
391 $self->settlement( $setNode ? $setNode->textContent : '' );
392 $self->repository( $reposNode ? $reposNode->textContent : '' );
393 $self->idno( $idNode ? $idNode->textContent : '' );
394 if( $self->settlement && $self->idno ) {
395 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
397 # Look for an msName.
398 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
400 $self->identifier( $msNameNode->textContent );
402 # We have an msDesc but who knows what is in it?
403 my $desc = $descnode->textContent;
406 $self->identifier( $desc );
409 if( $descnode->hasAttribute('xml:id') ) {
410 $self->_set_sigil( $descnode->getAttribute('xml:id') );
411 } elsif( !$self->has_sigil ) {
412 throw( ident => 'missing sigil',
413 message => 'Could not find xml:id witness sigil' );
416 throw( ident => "bad source",
417 message => "Could not find manuscript description element in TEI header" );
420 # Now get the words out.
422 my @layerwords; # if the witness has layers
423 # First, make sure all the words are wrapped in tags.
424 # TODO Make this not necessarily dependent upon whitespace...
425 word_tag_wrap( $xmlobj );
426 # Now go text hunting.
428 if( $self->use_text ) {
429 @textnodes = $xpc->findnodes( $self->use_text );
431 # Use the first 'text' node in the document.
432 @textnodes = $xmlobj->getElementsByTagName( 'text' );
434 my $teitext = $textnodes[0];
436 _tokenize_text( $self, $teitext, \@words, \@layerwords );
438 throw( ident => "bad source",
439 message => "No text element in document '" . $self->{'identifier'} . "!" );
442 my @text = map { $_->text } @words;
443 my @layertext = map { $_->text } @layerwords;
444 $self->path( \@words );
445 $self->text( \@text );
446 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
447 $self->uncorrected_path( \@layerwords );
448 $self->layertext( \@layertext );
453 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
454 # Strip out the words.
455 my $xpc = _xpc_for_el( $teitext );
456 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
459 if( my $n = $_->getAttribute( 'n' ) ) {
460 $place_str = '#DIV_' . $n . '#';
462 $place_str = '#DIV#';
464 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
467 # But maybe we don't have any divs. Just paragraphs.
469 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
473 sub _objectify_words {
474 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
476 my $xpc = _xpc_for_el( $element );
477 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
478 my @pgraphs = $xpc->findnodes( $xpexpr );
479 return () unless @pgraphs;
480 # Set up an expression to look for words and segs
481 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
482 foreach my $pg ( @pgraphs ) {
483 # If this paragraph is the descendant of a note element,
485 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
486 next if scalar @noop_container;
487 # Get the text of each node
489 # Hunt down each wrapped word/seg, and make an object (or two objects)
490 # of it, if necessary.
491 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
492 my( $text, $uncorr ) = _get_word_strings( $c );
494 # ( $text, $uncorr ) = _get_word_object( $c );
495 # } catch( Text::Tradition::Error $e
496 # where { $_->has_tag( 'lb' ) } ) {
499 unless( defined $text || defined $uncorr ) {
500 print STDERR "WARNING: no text in node " . $c->nodeName
501 . "\n" unless $c->nodeName eq 'lb';
504 print STDERR "DEBUG: space found in element node "
505 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
507 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
508 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
511 my $id = $self->sigil . 'r' . $ctr;
512 my( $word, $acword );
514 $word = $self->tradition->collation->add_reading(
515 { 'id' => $id, 'text' => $text });
517 if( $uncorr && $uncorr ne $text ) {
519 $acword = $self->tradition->collation->add_reading(
520 { 'id' => $id, 'text' => $uncorr });
525 # if( $first_word ) {
527 # # Set the relevant sectioning markers
529 # $w->add_placeholder( $divmarker );
530 # $divmarker = undef;
532 # $w->add_placeholder( '#PG#' );
534 push( @$wordlist, $word ) if $word;
535 push( @$uncorrlist, $acword ) if $acword;
540 # Given a word or segment node, make a Reading object for the word
541 # therein. Make two Reading objects if there is an 'uncorrected' vs.
544 sub _get_word_strings {
546 my( $text, $uncorrtext );
547 # We can have an lb or pb in the middle of a word; if we do, the
548 # whitespace (including \n) after the break becomes insignificant
549 # and we want to nuke it.
550 my $strip_leading_space = 0;
551 my $word_excluded = 0;
552 my $xpc = _xpc_for_el( $node );
553 # TODO This does not cope with nested add/dels.
554 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
555 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
556 foreach my $c ($node->childNodes() ) {
557 if( $c->nodeName eq 'num'
558 && defined $c->getAttribute( 'value' ) ) {
560 $text .= $c->getAttribute( 'value' ) unless @deletion;
561 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
562 # If this is just after a line/page break, return to normal behavior.
563 $strip_leading_space = 0;
564 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
565 # Set a flag that strips leading whitespace until we
566 # get to the next bit of non-whitespace.
567 $strip_leading_space = 1;
568 } elsif ( $c->nodeName eq 'fw' # for catchwords
569 || $c->nodeName eq 'sic'
570 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
571 || $c->textContent eq ''
572 || ref( $c ) eq 'XML::LibXML::Comment' ) {
573 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
575 } elsif( $c->nodeName eq 'add' ) {
576 my( $use, $discard ) = _get_word_strings( $c );
578 } elsif( $c->nodeName eq 'del' ) {
579 my( $discard, $use ) = _get_word_strings( $c );
582 my ( $tagtxt, $taguncorr );
583 if( ref( $c ) eq 'XML::LibXML::Text' ) {
585 $tagtxt = $c->textContent;
586 $taguncorr = $c->textContent;
588 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
590 if( $strip_leading_space ) {
591 $tagtxt =~ s/^[\s\n]+//s;
592 $taguncorr =~ s/^[\s\n]+//s;
593 # Unset the flag as soon as we see non-whitespace.
594 $strip_leading_space = 0 if $tagtxt;
597 $uncorrtext .= $taguncorr;
600 throw( ident => "text not found",
601 tags => [ $node->nodeName ],
602 message => "No text found in node " . $node->toString(0) )
603 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
604 return( $text, $uncorrtext );
608 my( $self, $string, $c ) = @_;
609 my @raw_words = split( /\s+/, $string );
611 foreach my $w ( @raw_words ) {
612 my $id = $self->sigil . 'r'. $c++;
613 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
614 my $w_obj = $self->tradition->collation->add_reading( \%opts );
615 # Skip any words that have been canonized out of existence.
616 next if( length( $w_obj->text ) == 0 );
617 push( @words, $w_obj );
622 sub _init_from_json {
625 if( $self->has_object ) {
626 $wit = $self->object;
627 } elsif( $self->has_string ) {
628 $wit = from_json( $self->string );
629 } elsif( $self->has_file ) {
630 my $ok = open( INPUT, $self->file );
632 throw( ident => "bad source",
633 message => 'Could not open ' . $self->file . ' for reading' );
635 binmode( INPUT, ':encoding(UTF-8)' );
638 $wit = from_json( join( '', @lines ) );
641 if( exists $wit->{'id'} ) {
642 $self->_set_sigil( $wit->{'id'} );
643 } elsif( !$self->has_sigil ) {
644 throw( ident => 'missing sigil',
645 message => 'Could not find witness sigil (id) in JSON spec' );
647 $self->identifier( $wit->{'name'} );
650 my( @text, @layertext );
651 if( exists $wit->{'content'} ) {
652 # We need to tokenize the text ourselves.
653 @words = _split_words( $self, $wit->{'content'} );
654 } elsif( exists $wit->{'tokens'} ) {
655 # We have a bunch of pretokenized words.
657 foreach my $token ( @{$wit->{'tokens'}} ) {
658 my $w_obj = $self->tradition->collation->add_reading({
659 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
660 push( @words, $w_obj );
661 push( @text, $token->{'t'} ); # TODO unless...?
663 ## TODO rethink this JSOn mechanism
664 if( exists $wit->{'layertokens'} ) {
665 foreach my $token ( @{$wit->{'layertokens'}} ) {
666 my $w_obj = $self->tradition->collation->add_reading({
667 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
668 push( @layerwords, $w_obj );
669 push( @layertext, $token->{'t'} );
673 $self->text( \@text );
674 $self->layertext( \@layertext ) if @layertext;
675 $self->path( \@words );
676 $self->uncorrected_path( \@layerwords ) if @layerwords;
679 sub _init_from_plaintext {
682 if( $self->has_file ) {
683 my $ok = open( INPUT, $self->file );
685 throw( ident => "bad source",
686 message => 'Could not open ' . $self->file . ' for reading' );
688 binmode( INPUT, ':encoding(UTF-8)' );
691 $str = join( '', @lines );
692 } elsif( $self->has_object ) { # ...seriously?
693 $str = ${$self->object};
695 $str = $self->string;
698 # TODO allow a different word separation expression
699 my @text = split( /\s+/, $str );
700 $self->text( \@text );
701 my @words = _split_words( $self, $str );
702 $self->path( \@words );
706 Text::Tradition::Error->throw(
707 'ident' => 'Witness parsing error',
714 my $xpc = XML::LibXML::XPathContext->new( $el );
715 if( $el->namespaceURI ) {
716 $xpc->registerNs( 'tei', $el->namespaceURI );
721 =head2 export_as_json
723 Exports the witness as a JSON structure, with the following keys:
727 =item * id - The witness sigil
729 =item * name - The witness identifier
731 =item * tokens - An array of hashes of the form { "t":"WORD" }
738 my $trad = Text::Tradition->new();
740 my @text = qw/ Thhis is a line of text /;
741 my $wit = $trad->add_witness(
743 'string' => join( ' ', @text ),
744 'sourcetype' => 'plaintext',
745 'identifier' => 'test witness',
747 my $jsonstruct = $wit->export_as_json;
748 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
749 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
750 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
751 foreach my $idx ( 0 .. $#text ) {
752 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
755 my @ctext = qw( when april with his showers sweet with fruit the drought of march
756 has pierced unto the root );
757 $trad = Text::Tradition->new(
758 'input' => 'CollateX',
759 'file' => 't/data/Collatex-16.xml' );
761 $jsonstruct = $trad->witness('A')->export_as_json;
762 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
763 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
764 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
765 foreach my $idx ( 0 .. $#ctext ) {
766 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
769 ## TODO test layertext export
777 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
779 'id' => $self->sigil,
780 'tokens' => \@wordlist,
781 'name' => $self->identifier,
783 if( $self->is_layered ) {
784 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
785 $obj->{'layertokens'} = \@lwlist;
791 __PACKAGE__->meta->make_immutable;
797 =item * Figure out how to serialize a witness
799 =item * Support encodings other than UTF-8
805 This package is free software and is provided "as is" without express
806 or implied warranty. You can redistribute it and/or modify it under
807 the same terms as Perl itself.
811 Tara L Andrews E<lt>aurum@cpan.orgE<gt>