1 package Text::Tradition::Witness;
6 use Moose::Util::TypeConstraints;
7 use Text::TEI::Markup qw( word_tag_wrap );
9 use XML::Easy::Syntax qw( $xml10_name_rx );
13 Text::Tradition::Witness - a manuscript witness to a text tradition
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
20 'identifier' => 'Oxford MS Ex.1932',
25 Text::Tradition::Witness is an object representation of a manuscript
26 witness to a text tradition. A manuscript has a sigil (a short code that
27 represents it in the wider tradition), an identifier (e.g. the library ID),
34 Create a new witness. Options include:
38 =item * sigil - A short code to represent the manuscript. Required.
40 =item * sourcetype - What sort of witness data this is. Options are
41 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42 used by Collation parsers.)
48 The data source for the witness. Use the appropriate option.
50 =item * use_text - An initialization option. If the witness is read from a
51 TEI document and more than one <text/> tag exists therein, the default
52 behavior is to use the first defined text. If this is not desired,
53 use_text should be set to an XPath expression that will select the correct
56 =item * language - The name of the applicable L<Text::Tradition::Lang>
57 module for language handling. Usually inherited from the language set in
58 the L<Text::Tradition> object, and defaults to Default.
60 =item * identifier - The recognized name of the manuscript, e.g. a library
61 identifier. Taken from the msDesc element for a TEI file.
63 =item * other_info - A freeform string for any other description of the
70 The sigil by which to identify this manuscript, which must conform to the
71 specification for XML attribute strings (broadly speaking, it must begin
72 with a letter and can have only a few sorts of punctuation characters in
77 A freeform name by which to identify the manuscript, which may be longer
78 than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79 TEI msName attribute, or constructed from the settlement and idno if
84 The city, town, etc. where the manuscript is held. Will be read from the
85 TEI msDesc element if supplied.
89 The institution that holds the manuscript. Will be read from the TEI msDesc
94 The identification or call number of the manuscript. Will be read from the
95 TEI msDesc element if supplied.
99 An array of strings (words) that contains the text of the
100 manuscript. This should not change after the witness has been
101 instantiated, and the path through the collation should always match it.
105 An array of strings (words) that contains the layered
106 text, if any, of the manuscript. This should not change after the witness
107 has been instantiated, and the path through the collation should always
112 Accessor method to get the witness language.
116 Accessor method for the witness identifier.
120 Accessor method for the general witness description.
124 Boolean method to note whether the witness has layers (e.g. pre-correction
125 readings) in the collation.
130 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
131 my $c = $trad->collation;
133 # Test a plaintext witness via string
134 my $str = 'This is a line of text';
135 my $ptwit = $trad->add_witness(
137 'sourcetype' => 'plaintext',
140 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
142 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
143 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
146 # Test some JSON witnesses via object
147 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
148 binmode( JSIN, ':encoding(UTF-8)' );
151 $trad->add_json_witnesses( join( '', @lines ) );
152 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
153 "Found first JSON witness" );
154 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
155 "Found second JSON witness" );
157 # # Test an XML witness via file
158 # my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
159 # 'file' => 't/data/witnesses/teiwit.xml' );
160 # is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
162 # is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
163 # ok( $xmlwit->is_layered, "Picked up correction layer" );
164 # is( @{$xmlwit->path}, 185, "Got correct text length" );
165 # is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" );
174 subtype 'SourceType',
176 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
177 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
181 where { $_ =~ /\A$xml10_name_rx\z/ },
182 message { 'Sigil must be a valid XML attribute string' };
184 no Moose::Util::TypeConstraints;
188 'isa' => 'Text::Tradition',
192 # Sigil. Required identifier for a witness, but may be found inside
197 predicate => 'has_sigil',
198 writer => '_set_sigil',
201 # Other identifying information
202 has 'identifier' => (
207 has 'settlement' => (
212 has 'repository' => (
222 has 'sourcetype' => (
231 default => 'Default',
234 # Source. Can be XML obj, JSON data struct, or string.
235 # Not used if the witness is created by parsing a collation.
239 predicate => 'has_file',
245 predicate => 'has_string',
248 has 'object' => ( # could be anything.
250 predicate => 'has_object',
251 clearer => 'clear_object',
254 # In the case of a TEI document with multiple texts, specify
255 # which text is the root. Should be an XPath expression.
261 has 'msdesc' => ( # if we started with a TEI doc
263 isa => 'XML::LibXML::Element',
264 predicate => 'has_msdesc',
265 writer => '_save_msdesc',
268 # Text. This is an array of strings (i.e. word tokens).
269 # TODO Think about how to handle this for the case of pre-prepared
270 # collations, where the tokens are in the graph already.
273 isa => 'ArrayRef[Str]',
274 predicate => 'has_text',
279 isa => 'ArrayRef[Str]',
280 predicate => 'has_layertext',
283 # Path. This is an array of Reading nodes that can be saved during
284 # initialization, but should be cleared before saving in a DB.
287 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
288 predicate => 'has_path',
289 clearer => 'clear_path',
292 has 'uncorrected_path' => (
294 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
295 clearer => 'clear_uncorrected_path',
298 has 'is_layered' => (
303 # If we set an uncorrected path, ever, remember that we did so.
304 around 'uncorrected_path' => sub {
308 $self->is_layered( 1 );
314 if( $self->has_source ) {
315 my $init_sub = '_init_from_' . $self->sourcetype;
317 # Remove our XML / source objects; we no longer need them.
318 $self->clear_object if $self->has_object;
319 $self->tradition->collation->make_witness_path( $self );
326 return $self->has_file || $self->has_string || $self->has_object;
329 sub _init_from_xmldesc {
332 if( $self->has_object ) {
333 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
334 throw( ident => "bad source",
335 message => "Source object must be an XML::LibXML::Element (this is "
336 . ref( $self->object ) . ");" );
338 $xmlobj = $self->object;
340 my $parser = XML::LibXML->new();
341 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
343 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
344 } catch( XML::LibXML::Error $e ) {
345 throw( ident => "bad source",
346 message => "XML parsing error: " . $e->as_string );
350 unless( $xmlobj->nodeName eq 'TEI' ) {
351 throw( ident => "bad source",
352 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
355 # Set up the tags we need, with or without namespaces.
356 map { $tags{$_} = "//$_" }
357 qw/ msDesc msName settlement repository idno p lg w seg add del /;
358 # Set up our XPath object
359 my $xpc = _xpc_for_el( $xmlobj );
360 # Use namespace-aware tags if we have to
361 if( $xmlobj->namespaceURI ) {
362 map { $tags{$_} = "//tei:$_" } keys %tags;
366 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
367 my $descnode = $desc->get_node(1);
368 $self->_save_msdesc( $descnode );
369 # First try to use settlement/repository/idno.
370 my( $setNode, $reposNode, $idNode ) =
371 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
372 $xpc->find( $tags{repository}, $descnode )->get_node(1),
373 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
374 $self->settlement( $setNode ? $setNode->textContent : '' );
375 $self->repository( $reposNode ? $reposNode->textContent : '' );
376 $self->idno( $idNode ? $idNode->textContent : '' );
377 if( $self->settlement && $self->idno ) {
378 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
380 # Look for an msName.
381 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
383 $self->identifier( $msNameNode->textContent );
385 # We have an msDesc but who knows what is in it?
386 my $desc = $descnode->textContent;
389 $self->identifier( $desc );
392 if( $descnode->hasAttribute('xml:id') ) {
393 $self->_set_sigil( $descnode->getAttribute('xml:id') );
394 } elsif( !$self->has_sigil ) {
395 throw( ident => 'missing sigil',
396 message => 'Could not find xml:id witness sigil' );
399 throw( ident => "bad source",
400 message => "Could not find manuscript description element in TEI header" );
403 # Now get the words out.
405 my @layerwords; # if the witness has layers
406 # First, make sure all the words are wrapped in tags.
407 # TODO Make this not necessarily dependent upon whitespace...
408 word_tag_wrap( $xmlobj );
409 # Now go text hunting.
411 if( $self->use_text ) {
412 @textnodes = $xpc->findnodes( $self->use_text );
414 # Use the first 'text' node in the document.
415 @textnodes = $xmlobj->getElementsByTagName( 'text' );
417 my $teitext = $textnodes[0];
419 _tokenize_text( $self, $teitext, \@words, \@layerwords );
421 throw( ident => "bad source",
422 message => "No text element in document '" . $self->{'identifier'} . "!" );
425 $self->path( \@words );
426 my $a = join( ' ', map { $_->text } @words );
427 my $b = join( ' ', map { $_->text } @layerwords );
429 $self->uncorrected_path( \@layerwords );
431 # TODO set self->text
435 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
436 # Strip out the words.
437 my $xpc = _xpc_for_el( $teitext );
438 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
441 if( my $n = $_->getAttribute( 'n' ) ) {
442 $place_str = '#DIV_' . $n . '#';
444 $place_str = '#DIV#';
446 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
449 # But maybe we don't have any divs. Just paragraphs.
451 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
455 sub _objectify_words {
456 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
458 my $xpc = _xpc_for_el( $element );
459 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
460 my @pgraphs = $xpc->findnodes( $xpexpr );
461 return () unless @pgraphs;
462 # Set up an expression to look for words and segs
463 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
464 foreach my $pg ( @pgraphs ) {
465 # If this paragraph is the descendant of a note element,
467 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
468 next if scalar @noop_container;
469 # Get the text of each node
471 # Hunt down each wrapped word/seg, and make an object (or two objects)
472 # of it, if necessary.
473 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
474 my( $text, $uncorr ) = _get_word_object( $c );
476 # ( $text, $uncorr ) = _get_word_object( $c );
477 # } catch( Text::Tradition::Error $e
478 # where { $_->has_tag( 'lb' ) } ) {
481 unless( defined $text || defined $uncorr ) {
482 print STDERR "WARNING: no text in node " . $c->nodeName
483 . "\n" unless $c->nodeName eq 'lb';
486 print STDERR "DEBUG: space found in element node "
487 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
489 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
490 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
493 my $id = $self->sigil . 'r' . $ctr;
494 my( $word, $acword );
496 $word = $self->tradition->collation->add_reading(
497 { 'id' => $id, 'text' => $text });
499 if( $uncorr && $uncorr ne $text ) {
501 $acword = $self->tradition->collation->add_reading(
502 { 'id' => $id, 'text' => $uncorr });
507 # if( $first_word ) {
509 # # Set the relevant sectioning markers
511 # $w->add_placeholder( $divmarker );
512 # $divmarker = undef;
514 # $w->add_placeholder( '#PG#' );
516 push( @$wordlist, $word ) if $word;
517 push( @$uncorrlist, $acword ) if $acword;
522 # Given a word or segment node, make a Reading object for the word
523 # therein. Make two Reading objects if there is an 'uncorrected' vs.
526 sub _get_word_strings {
528 my( $text, $uncorrtext );
529 # We can have an lb or pb in the middle of a word; if we do, the
530 # whitespace (including \n) after the break becomes insignificant
531 # and we want to nuke it.
532 my $strip_leading_space = 0;
533 my $word_excluded = 0;
534 my $xpc = _xpc_for_el( $node );
535 # TODO This does not cope with nested add/dels.
536 my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} );
537 my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} );
538 foreach my $c ($node->childNodes() ) {
539 if( $c->nodeName eq 'num'
540 && defined $c->getAttribute( 'value' ) ) {
542 $text .= $c->getAttribute( 'value' ) unless @deletion;
543 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
544 # If this is just after a line/page break, return to normal behavior.
545 $strip_leading_space = 0;
546 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
547 # Set a flag that strips leading whitespace until we
548 # get to the next bit of non-whitespace.
549 $strip_leading_space = 1;
550 } elsif ( $c->nodeName eq 'fw' # for catchwords
551 || $c->nodeName eq 'sic'
552 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
553 || $c->textContent eq ''
554 || ref( $c ) eq 'XML::LibXML::Comment' ) {
555 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
557 } elsif( $c->nodeName eq 'add' ) {
558 my( $use, $discard ) = _get_text_from_node( $c );
560 } elsif( $c->nodeName eq 'del' ) {
561 my( $discard, $use ) = _get_text_from_node( $c );
565 if( ref( $c ) eq 'XML::LibXML::Text' ) {
567 $tagtxt = $c->textContent;
569 $tagtxt = _get_text_from_node( $c );
571 if( $strip_leading_space ) {
572 $tagtxt =~ s/^[\s\n]+//s;
573 # Unset the flag as soon as we see non-whitespace.
574 $strip_leading_space = 0 if $tagtxt;
577 $uncorrtext .= $tagtxt;
580 throw( ident => "text not found",
581 tags => [ $node->nodeName ],
582 message => "No text found in node " . $node->toString(0) )
583 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
584 return( $text, $uncorrtext );
588 my( $self, $string, $c ) = @_;
589 my @raw_words = split( /\s+/, $string );
591 foreach my $w ( @raw_words ) {
592 my $id = $self->sigil . 'r'. $c++;
593 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
594 my $w_obj = $self->tradition->collation->add_reading( \%opts );
595 # Skip any words that have been canonized out of existence.
596 next if( length( $w_obj->text ) == 0 );
597 push( @words, $w_obj );
602 sub _init_from_json {
605 if( $self->has_object ) {
606 $wit = $self->object;
607 } elsif( $self->has_string ) {
608 $wit = from_json( $self->string );
609 } elsif( $self->has_file ) {
610 my $ok = open( INPUT, $self->file );
612 throw( ident => "bad source",
613 message => 'Could not open ' . $self->file . ' for reading' );
615 binmode( INPUT, ':encoding(UTF-8)' );
618 $wit = from_json( join( '', @lines ) );
621 if( exists $wit->{'id'} ) {
622 $self->_set_sigil( $wit->{'id'} );
623 } elsif( !$self->has_sigil ) {
624 throw( ident => 'missing sigil',
625 message => 'Could not find witness sigil (id) in JSON spec' );
627 $self->identifier( $wit->{'name'} );
630 my( @text, @layertext );
631 if( exists $wit->{'content'} ) {
632 # We need to tokenize the text ourselves.
633 @words = _split_words( $self, $wit->{'content'} );
634 } elsif( exists $wit->{'tokens'} ) {
635 # We have a bunch of pretokenized words.
637 foreach my $token ( @{$wit->{'tokens'}} ) {
638 my $w_obj = $self->tradition->collation->add_reading({
639 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
640 push( @words, $w_obj );
641 push( @text, $token->{'t'} ); # TODO unless...?
643 ## TODO rethink this JSOn mechanism
644 if( exists $wit->{'layertokens'} ) {
645 foreach my $token ( @{$wit->{'layertokens'}} ) {
646 my $w_obj = $self->tradition->collation->add_reading({
647 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
648 push( @layerwords, $w_obj );
649 push( @layertext, $token->{'t'} );
653 $self->text( \@text );
654 $self->layertext( \@layertext ) if @layertext;
655 $self->path( \@words );
656 $self->uncorrected_path( \@layerwords ) if @layerwords;
659 sub _init_from_plaintext {
662 if( $self->has_file ) {
663 my $ok = open( INPUT, $self->file );
665 throw( ident => "bad source",
666 message => 'Could not open ' . $self->file . ' for reading' );
668 binmode( INPUT, ':encoding(UTF-8)' );
671 $str = join( '', @lines );
672 } elsif( $self->has_object ) { # ...seriously?
673 $str = ${$self->object};
675 $str = $self->string;
678 # TODO allow a different word separation expression
679 my @text = split( /\s+/, $str );
680 $self->text( \@text );
681 my @words = _split_words( $self, $str );
682 $self->path( \@words );
686 Text::Tradition::Error->throw(
687 'ident' => 'Witness parsing error',
694 my $xpc = XML::LibXML::XPathContext->new( $el );
695 if( $el->namespaceURI ) {
696 $xpc->registerNs( 'tei', $el->namespaceURI );
701 =head2 export_as_json
703 Exports the witness as a JSON structure, with the following keys:
707 =item * id - The witness sigil
709 =item * name - The witness identifier
711 =item * tokens - An array of hashes of the form { "t":"WORD" }
718 my $trad = Text::Tradition->new();
720 my @text = qw/ Thhis is a line of text /;
721 my $wit = $trad->add_witness(
723 'string' => join( ' ', @text ),
724 'sourcetype' => 'plaintext',
725 'identifier' => 'test witness',
727 my $jsonstruct = $wit->export_as_json;
728 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
729 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
730 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
731 foreach my $idx ( 0 .. $#text ) {
732 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
735 my @ctext = qw( when april with his showers sweet with fruit the drought of march
736 has pierced unto the root );
737 $trad = Text::Tradition->new(
738 'input' => 'CollateX',
739 'file' => 't/data/Collatex-16.xml' );
741 $jsonstruct = $trad->witness('A')->export_as_json;
742 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
743 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
744 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
745 foreach my $idx ( 0 .. $#ctext ) {
746 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
749 ## TODO test layertext export
757 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
759 'id' => $self->sigil,
760 'tokens' => \@wordlist,
761 'name' => $self->identifier,
763 if( $self->is_layered ) {
764 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
765 $obj->{'layertokens'} = \@lwlist;
771 __PACKAGE__->meta->make_immutable;
777 =item * Support encodings other than UTF-8
783 This package is free software and is provided "as is" without express
784 or implied warranty. You can redistribute it and/or modify it under
785 the same terms as Perl itself.
789 Tara L Andrews E<lt>aurum@cpan.orgE<gt>