1 package Text::Tradition::Witness;
6 use Moose::Util::TypeConstraints;
7 use Text::TEI::Markup qw( word_tag_wrap );
9 use XML::Easy::Syntax qw( $xml10_name_rx );
13 Text::Tradition::Witness - a manuscript witness to a text tradition
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
20 'identifier' => 'Oxford MS Ex.1932',
25 Text::Tradition::Witness is an object representation of a manuscript
26 witness to a text tradition. A manuscript has a sigil (a short code that
27 represents it in the wider tradition), an identifier (e.g. the library ID),
34 Create a new witness. Options include:
38 =item * sigil - A short code to represent the manuscript. Required.
40 =item * sourcetype - What sort of witness data this is. Options are
41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42 used by Collation parsers.)
48 The data source for the witness. Use the appropriate option.
50 =item * use_text - An initialization option. If the witness is read from a
51 TEI document and more than one <text/> tag exists therein, the default
52 behavior is to use the first defined text. If this is not desired,
53 use_text should be set to an XPath expression that will select the correct
56 =item * identifier - The recognized name of the manuscript, e.g. a library
57 identifier. Taken from the msDesc element for a TEI file.
59 =item * other_info - A freeform string for any other description of the
66 The sigil by which to identify this manuscript, which must conform to the
67 specification for XML attribute strings (broadly speaking, it must begin
68 with a letter and can have only a few sorts of punctuation characters in
73 A freeform name by which to identify the manuscript, which may be longer
74 than the sigil. Defaults to 'Unidentified ms', but will be taken from the
75 TEI msName attribute, or constructed from the settlement and idno if
80 The city, town, etc. where the manuscript is held. Will be read from the
81 TEI msDesc element if supplied.
85 The institution that holds the manuscript. Will be read from the TEI msDesc
90 The identification or call number of the manuscript. Will be read from the
91 TEI msDesc element if supplied.
95 An array of strings (words) that contains the text of the
96 manuscript. This should not change after the witness has been
97 instantiated, and the path through the collation should always match it.
101 An array of strings (words) that contains the layered
102 text, if any, of the manuscript. This should not change after the witness
103 has been instantiated, and the path through the collation should always
108 Accessor method for the witness identifier.
112 Accessor method for the general witness description.
116 Boolean method that returns a true value if the witness was created with a
117 data source (that is, a file, string, or object to be parsed).
121 Boolean method to note whether the witness has layers (e.g. pre-correction
122 readings) in the collation.
127 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
128 my $c = $trad->collation;
130 # Test a plaintext witness via string
131 my $str = 'This is a line of text';
132 my $ptwit = $trad->add_witness(
134 'sourcetype' => 'plaintext',
137 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
139 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
140 $c->make_witness_path( $ptwit );
141 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
144 # Test some JSON witnesses via object
145 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
146 binmode( JSIN, ':encoding(UTF-8)' );
149 $trad->add_json_witnesses( join( '', @lines ) );
150 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
151 "Found first JSON witness" );
152 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
153 "Found second JSON witness" );
155 # Test an XML witness via file
156 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
157 'file' => 't/data/witnesses/teiwit.xml' );
158 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
160 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
161 ok( $xmlwit->is_layered, "Picked up correction layer" );
162 is( @{$xmlwit->text}, 182, "Got correct text length" );
163 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
165 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
166 is( @allwitwords, 184, "Reused appropriate readings" );
169 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
170 'file' => 't/data/witnesses/group.xml',
171 'use_text' => '//tei:group/tei:text[2]' );
172 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
174 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
175 ok( !$xpwit->is_layered, "Picked up no correction layer" );
176 is( @{$xpwit->text}, 157, "Got correct text length" );
184 # Enable plugin(s) if available
185 eval { with 'Text::Tradition::WitLanguage'; };
187 subtype 'SourceType',
189 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
190 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
194 where { $_ =~ /\A$xml10_name_rx\z/ },
195 message { 'Sigil must be a valid XML attribute string' };
197 no Moose::Util::TypeConstraints;
201 isa => 'Text::Tradition',
206 # Sigil. Required identifier for a witness, but may be found inside
211 predicate => 'has_sigil',
212 writer => '_set_sigil',
215 # Other identifying information
216 has 'identifier' => (
221 has 'settlement' => (
226 has 'repository' => (
236 # Source. Can be XML obj, JSON data struct, or string.
237 # Not used if the witness is created by parsing a collation.
238 has 'sourcetype' => (
247 predicate => 'has_file',
253 predicate => 'has_string',
256 has 'object' => ( # could be anything.
258 predicate => 'has_object',
259 clearer => 'clear_object',
262 # In the case of a TEI document with multiple texts, specify
263 # which text is the root. Should be an XPath expression.
269 # Text. This is an array of strings (i.e. word tokens).
270 # TODO Think about how to handle this for the case of pre-prepared
271 # collations, where the tokens are in the graph already.
274 isa => 'ArrayRef[Str]',
275 predicate => 'has_text',
280 isa => 'ArrayRef[Str]',
281 predicate => 'has_layertext',
284 has 'is_collated' => (
289 # Path. This is an array of Reading nodes that can be saved during
290 # initialization, but should be cleared before saving in a DB.
293 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
294 predicate => 'has_path',
295 clearer => 'clear_path',
298 ## TODO change the name of this
299 has 'uncorrected_path' => (
301 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
302 clearer => 'clear_uncorrected_path',
305 has 'is_layered' => (
310 # If we set an uncorrected path, ever, remember that we did so.
311 around 'uncorrected_path' => sub {
315 $self->is_layered( 1 );
321 if( $self->has_source ) {
322 my $init_sub = '_init_from_' . $self->sourcetype;
324 # Remove our XML / source objects; we no longer need them.
325 $self->clear_object if $self->has_object;
326 # $self->tradition->collation->make_witness_path( $self );
328 if( $self->sourcetype eq 'collation' ) {
329 $self->is_collated( 1 );
336 return $self->has_file || $self->has_string || $self->has_object;
339 sub _init_from_xmldesc {
342 if( $self->has_object ) {
343 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
344 throw( ident => "bad source",
345 message => "Source object must be an XML::LibXML::Element (this is "
346 . ref( $self->object ) . ");" );
348 $xmlobj = $self->object;
351 my $parser = XML::LibXML->new();
352 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
354 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
355 } catch( XML::LibXML::Error $e ) {
356 throw( ident => "bad source",
357 message => "XML parsing error: " . $e->as_string );
361 unless( $xmlobj->nodeName eq 'TEI' ) {
362 throw( ident => "bad source",
363 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
366 # Set up the tags we need, with or without namespaces.
367 map { $tags{$_} = "//$_" }
368 qw/ msDesc msName settlement repository idno p lg w seg add del /;
369 # Set up our XPath object
370 my $xpc = _xpc_for_el( $xmlobj );
371 # Use namespace-aware tags if we have to
372 if( $xmlobj->namespaceURI ) {
373 map { $tags{$_} = "//tei:$_" } keys %tags;
377 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
378 my $descnode = $desc->get_node(1);
379 # First try to use settlement/repository/idno.
380 my( $setNode, $reposNode, $idNode ) =
381 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
382 $xpc->find( $tags{repository}, $descnode )->get_node(1),
383 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
384 $self->settlement( $setNode ? $setNode->textContent : '' );
385 $self->repository( $reposNode ? $reposNode->textContent : '' );
386 $self->idno( $idNode ? $idNode->textContent : '' );
387 if( $self->settlement && $self->idno ) {
388 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
390 # Look for an msName.
391 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
393 $self->identifier( $msNameNode->textContent );
395 # We have an msDesc but who knows what is in it?
396 my $desc = $descnode->textContent;
399 $self->identifier( $desc );
402 if( $descnode->hasAttribute('xml:id') ) {
403 $self->_set_sigil( $descnode->getAttribute('xml:id') );
404 } elsif( !$self->has_sigil ) {
405 throw( ident => 'missing sigil',
406 message => 'Could not find xml:id witness sigil' );
409 throw( ident => "bad source",
410 message => "Could not find manuscript description element in TEI header" );
413 # Now get the words out.
415 my @layerwords; # if the witness has layers
416 # First, make sure all the words are wrapped in tags.
417 # TODO Make this not necessarily dependent upon whitespace...
418 word_tag_wrap( $xmlobj );
419 # Now go text hunting.
421 if( $self->use_text ) {
422 @textnodes = $xpc->findnodes( $self->use_text );
424 # Use the first 'text' node in the document.
425 @textnodes = $xmlobj->getElementsByTagName( 'text' );
427 my $teitext = $textnodes[0];
429 _tokenize_text( $self, $teitext, \@words, \@layerwords );
431 throw( ident => "bad source",
432 message => "No text element in document '" . $self->{'identifier'} . "!" );
435 my @text = map { $_->text } @words;
436 my @layertext = map { $_->text } @layerwords;
437 $self->path( \@words );
438 $self->text( \@text );
439 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
440 $self->uncorrected_path( \@layerwords );
441 $self->layertext( \@layertext );
446 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
447 # Strip out the words.
448 my $xpc = _xpc_for_el( $teitext );
449 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
452 if( my $n = $_->getAttribute( 'n' ) ) {
453 $place_str = '#DIV_' . $n . '#';
455 $place_str = '#DIV#';
457 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
460 # But maybe we don't have any divs. Just paragraphs.
462 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
466 sub _objectify_words {
467 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
469 my $xpc = _xpc_for_el( $element );
470 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
471 my @pgraphs = $xpc->findnodes( $xpexpr );
472 return () unless @pgraphs;
473 # Set up an expression to look for words and segs
474 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
475 foreach my $pg ( @pgraphs ) {
476 # If this paragraph is the descendant of a note element,
478 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
479 next if scalar @noop_container;
480 # Get the text of each node
482 # Hunt down each wrapped word/seg, and make an object (or two objects)
483 # of it, if necessary.
484 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
485 my( $text, $uncorr ) = _get_word_strings( $c );
487 # ( $text, $uncorr ) = _get_word_object( $c );
488 # } catch( Text::Tradition::Error $e
489 # where { $_->has_tag( 'lb' ) } ) {
492 unless( defined $text || defined $uncorr ) {
493 print STDERR "WARNING: no text in node " . $c->nodeName
494 . "\n" unless $c->nodeName eq 'lb';
497 print STDERR "DEBUG: space found in element node "
498 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
500 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
501 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
504 my $id = $self->sigil . 'r' . $ctr;
505 my( $word, $acword );
507 $word = $self->tradition->collation->add_reading(
508 { 'id' => $id, 'text' => $text });
510 if( $uncorr && $uncorr ne $text ) {
512 $acword = $self->tradition->collation->add_reading(
513 { 'id' => $id, 'text' => $uncorr });
518 # if( $first_word ) {
520 # # Set the relevant sectioning markers
522 # $w->add_placeholder( $divmarker );
523 # $divmarker = undef;
525 # $w->add_placeholder( '#PG#' );
527 push( @$wordlist, $word ) if $word;
528 push( @$uncorrlist, $acword ) if $acword;
533 # Given a word or segment node, make a Reading object for the word
534 # therein. Make two Reading objects if there is an 'uncorrected' vs.
537 sub _get_word_strings {
539 my( $text, $uncorrtext );
540 # We can have an lb or pb in the middle of a word; if we do, the
541 # whitespace (including \n) after the break becomes insignificant
542 # and we want to nuke it.
543 my $strip_leading_space = 0;
544 my $word_excluded = 0;
545 my $xpc = _xpc_for_el( $node );
546 # TODO This does not cope with nested add/dels.
547 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
548 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
549 foreach my $c ($node->childNodes() ) {
550 if( $c->nodeName eq 'num'
551 && defined $c->getAttribute( 'value' ) ) {
553 $text .= $c->getAttribute( 'value' ) unless @deletion;
554 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
555 # If this is just after a line/page break, return to normal behavior.
556 $strip_leading_space = 0;
557 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
558 # Set a flag that strips leading whitespace until we
559 # get to the next bit of non-whitespace.
560 $strip_leading_space = 1;
561 } elsif ( $c->nodeName eq 'fw' # for catchwords
562 || $c->nodeName eq 'sic'
563 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
564 || $c->textContent eq ''
565 || ref( $c ) eq 'XML::LibXML::Comment' ) {
566 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
568 } elsif( $c->nodeName eq 'add' ) {
569 my( $use, $discard ) = _get_word_strings( $c );
571 } elsif( $c->nodeName eq 'del' ) {
572 my( $discard, $use ) = _get_word_strings( $c );
575 my ( $tagtxt, $taguncorr );
576 if( ref( $c ) eq 'XML::LibXML::Text' ) {
578 $tagtxt = $c->textContent;
579 $taguncorr = $c->textContent;
581 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
583 if( $strip_leading_space ) {
584 $tagtxt =~ s/^[\s\n]+//s;
585 $taguncorr =~ s/^[\s\n]+//s;
586 # Unset the flag as soon as we see non-whitespace.
587 $strip_leading_space = 0 if $tagtxt;
590 $uncorrtext .= $taguncorr;
593 throw( ident => "text not found",
594 tags => [ $node->nodeName ],
595 message => "No text found in node " . $node->toString(0) )
596 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
597 return( $text, $uncorrtext );
601 my( $self, $string, $c ) = @_;
602 my @raw_words = split( /\s+/, $string );
604 foreach my $w ( @raw_words ) {
605 my $id = $self->sigil . 'r'. $c++;
606 my %opts = ( 'text' => $w, 'id' => $id );
607 my $w_obj = $self->tradition->collation->add_reading( \%opts );
608 # Skip any words that have been canonized out of existence.
609 next if( length( $w_obj->text ) == 0 );
610 push( @words, $w_obj );
615 sub _init_from_json {
618 if( $self->has_object ) {
619 $wit = $self->object;
620 } elsif( $self->has_string ) {
621 $wit = from_json( $self->string );
622 } elsif( $self->has_file ) {
623 my $ok = open( INPUT, $self->file );
625 throw( ident => "bad source",
626 message => 'Could not open ' . $self->file . ' for reading' );
628 binmode( INPUT, ':encoding(UTF-8)' );
631 $wit = from_json( join( '', @lines ) );
634 if( exists $wit->{'id'} ) {
635 $self->_set_sigil( $wit->{'id'} );
636 } elsif( !$self->has_sigil ) {
637 throw( ident => 'missing sigil',
638 message => 'Could not find witness sigil (id) in JSON spec' );
640 $self->identifier( $wit->{'name'} );
643 my( @text, @layertext );
644 if( exists $wit->{'content'} ) {
645 # We need to tokenize the text ourselves.
646 @words = _split_words( $self, $wit->{'content'} );
647 } elsif( exists $wit->{'tokens'} ) {
648 # We have a bunch of pretokenized words.
650 foreach my $token ( @{$wit->{'tokens'}} ) {
651 my $w_obj = $self->tradition->collation->add_reading({
652 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
653 push( @words, $w_obj );
654 push( @text, $token->{'t'} ); # TODO unless...?
656 ## TODO rethink this JSOn mechanism
657 if( exists $wit->{'layertokens'} ) {
658 foreach my $token ( @{$wit->{'layertokens'}} ) {
659 my $w_obj = $self->tradition->collation->add_reading({
660 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
661 push( @layerwords, $w_obj );
662 push( @layertext, $token->{'t'} );
666 $self->text( \@text );
667 $self->layertext( \@layertext ) if @layertext;
668 $self->path( \@words );
669 $self->uncorrected_path( \@layerwords ) if @layerwords;
672 sub _init_from_plaintext {
674 unless( $self->has_sigil ) {
675 throw( "No sigil defined for the plaintext witness" );
678 if( $self->has_file ) {
679 my $ok = open( INPUT, $self->file );
681 throw( ident => "bad source",
682 message => 'Could not open ' . $self->file . ' for reading' );
684 binmode( INPUT, ':encoding(UTF-8)' );
687 $str = join( '', @lines );
688 } elsif( $self->has_object ) { # ...seriously?
689 $str = ${$self->object};
691 $str = $self->string;
694 # TODO allow a different word separation expression
695 my @text = split( /\s+/, $str );
696 $self->text( \@text );
697 my @words = _split_words( $self, $str );
698 $self->path( \@words );
702 Text::Tradition::Error->throw(
703 'ident' => 'Witness parsing error',
710 my $xpc = XML::LibXML::XPathContext->new( $el );
711 if( $el->namespaceURI ) {
712 $xpc->registerNs( 'tei', $el->namespaceURI );
717 =head2 export_as_json
719 Exports the witness as a JSON structure, with the following keys:
723 =item * id - The witness sigil
725 =item * name - The witness identifier
727 =item * tokens - An array of hashes of the form { "t":"WORD" }
734 my $trad = Text::Tradition->new();
736 my @text = qw/ Thhis is a line of text /;
737 my $wit = $trad->add_witness(
739 'string' => join( ' ', @text ),
740 'sourcetype' => 'plaintext',
741 'identifier' => 'test witness',
743 my $jsonstruct = $wit->export_as_json;
744 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
745 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
746 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
747 foreach my $idx ( 0 .. $#text ) {
748 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
751 my @ctext = qw( when april with his showers sweet with fruit the drought of march
752 has pierced unto the root );
753 $trad = Text::Tradition->new(
754 'input' => 'CollateX',
755 'file' => 't/data/Collatex-16.xml' );
757 $jsonstruct = $trad->witness('A')->export_as_json;
758 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
759 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
760 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
761 foreach my $idx ( 0 .. $#ctext ) {
762 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
765 ## TODO test layertext export
773 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
775 'id' => $self->sigil,
776 'tokens' => \@wordlist,
777 'name' => $self->identifier,
779 if( $self->is_layered ) {
780 my @lwlist = map { { 't' => $_ || '' } } @{$self->layertext};
781 $obj->{'layertokens'} = \@lwlist;
787 __PACKAGE__->meta->make_immutable;
793 =item * Figure out how to serialize a witness
795 =item * Support encodings other than UTF-8
801 This package is free software and is provided "as is" without express
802 or implied warranty. You can redistribute it and/or modify it under
803 the same terms as Perl itself.
807 Tara L Andrews E<lt>aurum@cpan.orgE<gt>