1 package Text::Tradition::Witness;
6 use Moose::Util::TypeConstraints;
7 use Text::TEI::Markup qw( word_tag_wrap );
9 use XML::Easy::Syntax qw( $xml10_name_rx );
13 Text::Tradition::Witness - a manuscript witness to a text tradition
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
20 'identifier' => 'Oxford MS Ex.1932',
25 Text::Tradition::Witness is an object representation of a manuscript
26 witness to a text tradition. A manuscript has a sigil (a short code that
27 represents it in the wider tradition), an identifier (e.g. the library ID),
34 Create a new witness. Options include:
38 =item * sigil - A short code to represent the manuscript. Required.
40 =item * sourcetype - What sort of witness data this is. Options are
41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42 used by Collation parsers.)
48 The data source for the witness. Use the appropriate option.
50 =item * use_text - An initialization option. If the witness is read from a
51 TEI document and more than one <text/> tag exists therein, the default
52 behavior is to use the first defined text. If this is not desired,
53 use_text should be set to an XPath expression that will select the correct
56 =item * language - The name of the applicable L<Text::Tradition::Lang>
57 module for language handling. Usually inherited from the language set in
58 the L<Text::Tradition> object, and defaults to Default.
60 =item * identifier - The recognized name of the manuscript, e.g. a library
61 identifier. Taken from the msDesc element for a TEI file.
63 =item * other_info - A freeform string for any other description of the
70 The sigil by which to identify this manuscript, which must conform to the
71 specification for XML attribute strings (broadly speaking, it must begin
72 with a letter and can have only a few sorts of punctuation characters in
77 A freeform name by which to identify the manuscript, which may be longer
78 than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79 TEI msName attribute, or constructed from the settlement and idno if
84 The city, town, etc. where the manuscript is held. Will be read from the
85 TEI msDesc element if supplied.
89 The institution that holds the manuscript. Will be read from the TEI msDesc
94 The identification or call number of the manuscript. Will be read from the
95 TEI msDesc element if supplied.
99 An array of strings (words) that contains the text of the
100 manuscript. This should not change after the witness has been
101 instantiated, and the path through the collation should always match it.
105 An array of strings (words) that contains the layered
106 text, if any, of the manuscript. This should not change after the witness
107 has been instantiated, and the path through the collation should always
112 Accessor method to get the witness language.
116 Accessor method for the witness identifier.
120 Accessor method for the general witness description.
124 Boolean method to note whether the witness has layers (e.g. pre-correction
125 readings) in the collation.
130 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
131 my $c = $trad->collation;
133 # Test a plaintext witness via string
134 my $str = 'This is a line of text';
135 my $ptwit = $trad->add_witness(
137 'sourcetype' => 'plaintext',
140 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
142 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
143 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
146 # Test some JSON witnesses via object
147 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
148 binmode( JSIN, ':encoding(UTF-8)' );
151 $trad->add_json_witnesses( join( '', @lines ) );
152 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
153 "Found first JSON witness" );
154 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
155 "Found second JSON witness" );
157 # Test an XML witness via file
158 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
159 'file' => 't/data/witnesses/teiwit.xml' );
160 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
162 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
163 ok( $xmlwit->is_layered, "Picked up correction layer" );
164 is( @{$xmlwit->text}, 182, "Got correct text length" );
165 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
167 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
168 is( @allwitwords, 184, "Reused appropriate readings" );
171 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
172 'file' => 't/data/witnesses/group.xml',
173 'use_text' => '//tei:group/tei:text[2]' );
174 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
176 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
177 ok( !$xpwit->is_layered, "Picked up no correction layer" );
178 is( @{$xpwit->text}, 157, "Got correct text length" );
186 subtype 'SourceType',
188 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
189 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
193 where { $_ =~ /\A$xml10_name_rx\z/ },
194 message { 'Sigil must be a valid XML attribute string' };
196 no Moose::Util::TypeConstraints;
200 'isa' => 'Text::Tradition',
204 # Sigil. Required identifier for a witness, but may be found inside
209 predicate => 'has_sigil',
210 writer => '_set_sigil',
216 default => 'Default',
219 # Other identifying information
220 has 'identifier' => (
225 has 'settlement' => (
230 has 'repository' => (
240 # Source. Can be XML obj, JSON data struct, or string.
241 # Not used if the witness is created by parsing a collation.
242 has 'sourcetype' => (
251 predicate => 'has_file',
257 predicate => 'has_string',
260 has 'object' => ( # could be anything.
262 predicate => 'has_object',
263 clearer => 'clear_object',
266 # In the case of a TEI document with multiple texts, specify
267 # which text is the root. Should be an XPath expression.
273 # Text. This is an array of strings (i.e. word tokens).
274 # TODO Think about how to handle this for the case of pre-prepared
275 # collations, where the tokens are in the graph already.
278 isa => 'ArrayRef[Str]',
279 predicate => 'has_text',
284 isa => 'ArrayRef[Str]',
285 predicate => 'has_layertext',
288 # Path. This is an array of Reading nodes that can be saved during
289 # initialization, but should be cleared before saving in a DB.
292 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
293 predicate => 'has_path',
294 clearer => 'clear_path',
297 ## TODO change the name of this
298 has 'uncorrected_path' => (
300 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
301 clearer => 'clear_uncorrected_path',
304 has 'is_layered' => (
309 # If we set an uncorrected path, ever, remember that we did so.
310 around 'uncorrected_path' => sub {
314 $self->is_layered( 1 );
320 if( $self->has_source ) {
321 my $init_sub = '_init_from_' . $self->sourcetype;
323 # Remove our XML / source objects; we no longer need them.
324 $self->clear_object if $self->has_object;
325 $self->tradition->collation->make_witness_path( $self );
332 return $self->has_file || $self->has_string || $self->has_object;
335 sub _init_from_xmldesc {
338 if( $self->has_object ) {
339 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
340 throw( ident => "bad source",
341 message => "Source object must be an XML::LibXML::Element (this is "
342 . ref( $self->object ) . ");" );
344 $xmlobj = $self->object;
346 my $parser = XML::LibXML->new();
347 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
349 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
350 } catch( XML::LibXML::Error $e ) {
351 throw( ident => "bad source",
352 message => "XML parsing error: " . $e->as_string );
356 unless( $xmlobj->nodeName eq 'TEI' ) {
357 throw( ident => "bad source",
358 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
361 # Set up the tags we need, with or without namespaces.
362 map { $tags{$_} = "//$_" }
363 qw/ msDesc msName settlement repository idno p lg w seg add del /;
364 # Set up our XPath object
365 my $xpc = _xpc_for_el( $xmlobj );
366 # Use namespace-aware tags if we have to
367 if( $xmlobj->namespaceURI ) {
368 map { $tags{$_} = "//tei:$_" } keys %tags;
372 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
373 my $descnode = $desc->get_node(1);
374 # First try to use settlement/repository/idno.
375 my( $setNode, $reposNode, $idNode ) =
376 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
377 $xpc->find( $tags{repository}, $descnode )->get_node(1),
378 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
379 $self->settlement( $setNode ? $setNode->textContent : '' );
380 $self->repository( $reposNode ? $reposNode->textContent : '' );
381 $self->idno( $idNode ? $idNode->textContent : '' );
382 if( $self->settlement && $self->idno ) {
383 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
385 # Look for an msName.
386 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
388 $self->identifier( $msNameNode->textContent );
390 # We have an msDesc but who knows what is in it?
391 my $desc = $descnode->textContent;
394 $self->identifier( $desc );
397 if( $descnode->hasAttribute('xml:id') ) {
398 $self->_set_sigil( $descnode->getAttribute('xml:id') );
399 } elsif( !$self->has_sigil ) {
400 throw( ident => 'missing sigil',
401 message => 'Could not find xml:id witness sigil' );
404 throw( ident => "bad source",
405 message => "Could not find manuscript description element in TEI header" );
408 # Now get the words out.
410 my @layerwords; # if the witness has layers
411 # First, make sure all the words are wrapped in tags.
412 # TODO Make this not necessarily dependent upon whitespace...
413 word_tag_wrap( $xmlobj );
414 # Now go text hunting.
416 if( $self->use_text ) {
417 @textnodes = $xpc->findnodes( $self->use_text );
419 # Use the first 'text' node in the document.
420 @textnodes = $xmlobj->getElementsByTagName( 'text' );
422 my $teitext = $textnodes[0];
424 _tokenize_text( $self, $teitext, \@words, \@layerwords );
426 throw( ident => "bad source",
427 message => "No text element in document '" . $self->{'identifier'} . "!" );
430 my @text = map { $_->text } @words;
431 my @layertext = map { $_->text } @layerwords;
432 $self->path( \@words );
433 $self->text( \@text );
434 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
435 $self->uncorrected_path( \@layerwords );
436 $self->layertext( \@layertext );
441 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
442 # Strip out the words.
443 my $xpc = _xpc_for_el( $teitext );
444 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
447 if( my $n = $_->getAttribute( 'n' ) ) {
448 $place_str = '#DIV_' . $n . '#';
450 $place_str = '#DIV#';
452 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
455 # But maybe we don't have any divs. Just paragraphs.
457 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
461 sub _objectify_words {
462 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
464 my $xpc = _xpc_for_el( $element );
465 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
466 my @pgraphs = $xpc->findnodes( $xpexpr );
467 return () unless @pgraphs;
468 # Set up an expression to look for words and segs
469 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
470 foreach my $pg ( @pgraphs ) {
471 # If this paragraph is the descendant of a note element,
473 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
474 next if scalar @noop_container;
475 # Get the text of each node
477 # Hunt down each wrapped word/seg, and make an object (or two objects)
478 # of it, if necessary.
479 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
480 my( $text, $uncorr ) = _get_word_strings( $c );
482 # ( $text, $uncorr ) = _get_word_object( $c );
483 # } catch( Text::Tradition::Error $e
484 # where { $_->has_tag( 'lb' ) } ) {
487 unless( defined $text || defined $uncorr ) {
488 print STDERR "WARNING: no text in node " . $c->nodeName
489 . "\n" unless $c->nodeName eq 'lb';
492 print STDERR "DEBUG: space found in element node "
493 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
495 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
496 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
499 my $id = $self->sigil . 'r' . $ctr;
500 my( $word, $acword );
502 $word = $self->tradition->collation->add_reading(
503 { 'id' => $id, 'text' => $text });
505 if( $uncorr && $uncorr ne $text ) {
507 $acword = $self->tradition->collation->add_reading(
508 { 'id' => $id, 'text' => $uncorr });
513 # if( $first_word ) {
515 # # Set the relevant sectioning markers
517 # $w->add_placeholder( $divmarker );
518 # $divmarker = undef;
520 # $w->add_placeholder( '#PG#' );
522 push( @$wordlist, $word ) if $word;
523 push( @$uncorrlist, $acword ) if $acword;
528 # Given a word or segment node, make a Reading object for the word
529 # therein. Make two Reading objects if there is an 'uncorrected' vs.
532 sub _get_word_strings {
534 my( $text, $uncorrtext );
535 # We can have an lb or pb in the middle of a word; if we do, the
536 # whitespace (including \n) after the break becomes insignificant
537 # and we want to nuke it.
538 my $strip_leading_space = 0;
539 my $word_excluded = 0;
540 my $xpc = _xpc_for_el( $node );
541 # TODO This does not cope with nested add/dels.
542 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
543 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
544 foreach my $c ($node->childNodes() ) {
545 if( $c->nodeName eq 'num'
546 && defined $c->getAttribute( 'value' ) ) {
548 $text .= $c->getAttribute( 'value' ) unless @deletion;
549 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
550 # If this is just after a line/page break, return to normal behavior.
551 $strip_leading_space = 0;
552 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
553 # Set a flag that strips leading whitespace until we
554 # get to the next bit of non-whitespace.
555 $strip_leading_space = 1;
556 } elsif ( $c->nodeName eq 'fw' # for catchwords
557 || $c->nodeName eq 'sic'
558 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
559 || $c->textContent eq ''
560 || ref( $c ) eq 'XML::LibXML::Comment' ) {
561 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
563 } elsif( $c->nodeName eq 'add' ) {
564 my( $use, $discard ) = _get_word_strings( $c );
566 } elsif( $c->nodeName eq 'del' ) {
567 my( $discard, $use ) = _get_word_strings( $c );
570 my ( $tagtxt, $taguncorr );
571 if( ref( $c ) eq 'XML::LibXML::Text' ) {
573 $tagtxt = $c->textContent;
574 $taguncorr = $c->textContent;
576 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
578 if( $strip_leading_space ) {
579 $tagtxt =~ s/^[\s\n]+//s;
580 $taguncorr =~ s/^[\s\n]+//s;
581 # Unset the flag as soon as we see non-whitespace.
582 $strip_leading_space = 0 if $tagtxt;
585 $uncorrtext .= $taguncorr;
588 throw( ident => "text not found",
589 tags => [ $node->nodeName ],
590 message => "No text found in node " . $node->toString(0) )
591 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
592 return( $text, $uncorrtext );
596 my( $self, $string, $c ) = @_;
597 my @raw_words = split( /\s+/, $string );
599 foreach my $w ( @raw_words ) {
600 my $id = $self->sigil . 'r'. $c++;
601 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
602 my $w_obj = $self->tradition->collation->add_reading( \%opts );
603 # Skip any words that have been canonized out of existence.
604 next if( length( $w_obj->text ) == 0 );
605 push( @words, $w_obj );
610 sub _init_from_json {
613 if( $self->has_object ) {
614 $wit = $self->object;
615 } elsif( $self->has_string ) {
616 $wit = from_json( $self->string );
617 } elsif( $self->has_file ) {
618 my $ok = open( INPUT, $self->file );
620 throw( ident => "bad source",
621 message => 'Could not open ' . $self->file . ' for reading' );
623 binmode( INPUT, ':encoding(UTF-8)' );
626 $wit = from_json( join( '', @lines ) );
629 if( exists $wit->{'id'} ) {
630 $self->_set_sigil( $wit->{'id'} );
631 } elsif( !$self->has_sigil ) {
632 throw( ident => 'missing sigil',
633 message => 'Could not find witness sigil (id) in JSON spec' );
635 $self->identifier( $wit->{'name'} );
638 my( @text, @layertext );
639 if( exists $wit->{'content'} ) {
640 # We need to tokenize the text ourselves.
641 @words = _split_words( $self, $wit->{'content'} );
642 } elsif( exists $wit->{'tokens'} ) {
643 # We have a bunch of pretokenized words.
645 foreach my $token ( @{$wit->{'tokens'}} ) {
646 my $w_obj = $self->tradition->collation->add_reading({
647 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
648 push( @words, $w_obj );
649 push( @text, $token->{'t'} ); # TODO unless...?
651 ## TODO rethink this JSOn mechanism
652 if( exists $wit->{'layertokens'} ) {
653 foreach my $token ( @{$wit->{'layertokens'}} ) {
654 my $w_obj = $self->tradition->collation->add_reading({
655 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
656 push( @layerwords, $w_obj );
657 push( @layertext, $token->{'t'} );
661 $self->text( \@text );
662 $self->layertext( \@layertext ) if @layertext;
663 $self->path( \@words );
664 $self->uncorrected_path( \@layerwords ) if @layerwords;
667 sub _init_from_plaintext {
670 if( $self->has_file ) {
671 my $ok = open( INPUT, $self->file );
673 throw( ident => "bad source",
674 message => 'Could not open ' . $self->file . ' for reading' );
676 binmode( INPUT, ':encoding(UTF-8)' );
679 $str = join( '', @lines );
680 } elsif( $self->has_object ) { # ...seriously?
681 $str = ${$self->object};
683 $str = $self->string;
686 # TODO allow a different word separation expression
687 my @text = split( /\s+/, $str );
688 $self->text( \@text );
689 my @words = _split_words( $self, $str );
690 $self->path( \@words );
694 Text::Tradition::Error->throw(
695 'ident' => 'Witness parsing error',
702 my $xpc = XML::LibXML::XPathContext->new( $el );
703 if( $el->namespaceURI ) {
704 $xpc->registerNs( 'tei', $el->namespaceURI );
709 =head2 export_as_json
711 Exports the witness as a JSON structure, with the following keys:
715 =item * id - The witness sigil
717 =item * name - The witness identifier
719 =item * tokens - An array of hashes of the form { "t":"WORD" }
726 my $trad = Text::Tradition->new();
728 my @text = qw/ Thhis is a line of text /;
729 my $wit = $trad->add_witness(
731 'string' => join( ' ', @text ),
732 'sourcetype' => 'plaintext',
733 'identifier' => 'test witness',
735 my $jsonstruct = $wit->export_as_json;
736 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
737 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
738 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
739 foreach my $idx ( 0 .. $#text ) {
740 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
743 my @ctext = qw( when april with his showers sweet with fruit the drought of march
744 has pierced unto the root );
745 $trad = Text::Tradition->new(
746 'input' => 'CollateX',
747 'file' => 't/data/Collatex-16.xml' );
749 $jsonstruct = $trad->witness('A')->export_as_json;
750 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
751 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
752 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
753 foreach my $idx ( 0 .. $#ctext ) {
754 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
757 ## TODO test layertext export
765 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
767 'id' => $self->sigil,
768 'tokens' => \@wordlist,
769 'name' => $self->identifier,
771 if( $self->is_layered ) {
772 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
773 $obj->{'layertokens'} = \@lwlist;
779 __PACKAGE__->meta->make_immutable;
785 =item * Support encodings other than UTF-8
791 This package is free software and is provided "as is" without express
792 or implied warranty. You can redistribute it and/or modify it under
793 the same terms as Perl itself.
797 Tara L Andrews E<lt>aurum@cpan.orgE<gt>