1 package Text::Tradition::Witness;
6 use Text::Tradition::Datatypes;
7 use Text::TEI::Markup qw( word_tag_wrap );
12 Text::Tradition::Witness - a manuscript witness to a text tradition
16 use Text::Tradition::Witness;
17 my $w = Text::Tradition::Witness->new(
19 'identifier' => 'Oxford MS Ex.1932',
24 Text::Tradition::Witness is an object representation of a manuscript
25 witness to a text tradition. A manuscript has a sigil (a short code that
26 represents it in the wider tradition), an identifier (e.g. the library ID),
33 Create a new witness. Options include:
37 =item * sigil - A short code to represent the manuscript. Required.
39 =item * sourcetype - What sort of witness data this is. Options are
40 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
41 used by Collation parsers.)
47 The data source for the witness. Use the appropriate option.
49 =item * use_text - An initialization option. If the witness is read from a
50 TEI document and more than one <text/> tag exists therein, the default
51 behavior is to use the first defined text. If this is not desired,
52 use_text should be set to an XPath expression that will select the correct
55 =item * identifier - The recognized name of the manuscript, e.g. a library
56 identifier. Taken from the msDesc element for a TEI file.
58 =item * other_info - A freeform string for any other description of the
65 The sigil by which to identify this manuscript, which must conform to the
66 specification for XML attribute strings (broadly speaking, it must begin
67 with a letter and can have only a few sorts of punctuation characters in
72 A freeform name by which to identify the manuscript, which may be longer
73 than the sigil. Defaults to 'Unidentified ms', but will be taken from the
74 TEI msName attribute, or constructed from the settlement and idno if
79 The city, town, etc. where the manuscript is held. Will be read from the
80 TEI msDesc element if supplied.
84 The institution that holds the manuscript. Will be read from the TEI msDesc
89 The identification or call number of the manuscript. Will be read from the
90 TEI msDesc element if supplied.
94 An array of strings (words) that contains the text of the
95 manuscript. This should not change after the witness has been
96 instantiated, and the path through the collation should always match it.
100 An array of strings (words) that contains the layered
101 text, if any, of the manuscript. This should not change after the witness
102 has been instantiated, and the path through the collation should always
107 Accessor method for the witness identifier.
111 Accessor method for the general witness description.
115 Boolean method that returns a true value if the witness was created with a
116 data source (that is, a file, string, or object to be parsed).
120 Boolean method to note whether the witness has layers (e.g. pre-correction
121 readings) in the collation.
126 my $trad = Text::Tradition->new( 'name' => 'test tradition' );
127 my $c = $trad->collation;
129 # Test a plaintext witness via string
130 my $str = 'This is a line of text';
131 my $ptwit = $trad->add_witness(
133 'sourcetype' => 'plaintext',
136 is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
138 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
139 $c->make_witness_path( $ptwit );
140 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
143 # Test some JSON witnesses via object
144 open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
145 binmode( JSIN, ':encoding(UTF-8)' );
148 $trad->add_json_witnesses( join( '', @lines ) );
149 is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
150 "Found first JSON witness" );
151 is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
152 "Found second JSON witness" );
154 # Test an XML witness via file
155 my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
156 'file' => 't/data/witnesses/teiwit.xml' );
157 is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
159 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
160 ok( $xmlwit->is_layered, "Picked up correction layer" );
161 is( @{$xmlwit->text}, 182, "Got correct text length" );
162 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
164 my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
165 is( @allwitwords, 184, "Reused appropriate readings" );
168 my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
169 'file' => 't/data/witnesses/group.xml',
170 'use_text' => '//tei:group/tei:text[2]' );
171 is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
173 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
174 ok( !$xpwit->is_layered, "Picked up no correction layer" );
175 is( @{$xpwit->text}, 157, "Got correct text length" );
183 # Enable plugin(s) if available
184 eval { with 'Text::Tradition::WitLanguage'; };
188 isa => 'Text::Tradition',
193 # Sigil. Required identifier for a witness, but may be found inside
198 predicate => 'has_sigil',
199 writer => '_set_sigil',
202 # Other identifying information
203 has 'identifier' => (
208 has 'settlement' => (
213 has 'repository' => (
223 # Source. Can be XML obj, JSON data struct, or string.
224 # Not used if the witness is created by parsing a collation.
225 has 'sourcetype' => (
234 predicate => 'has_file',
240 predicate => 'has_string',
243 has 'object' => ( # could be anything.
245 predicate => 'has_object',
246 clearer => 'clear_object',
249 # In the case of a TEI document with multiple texts, specify
250 # which text is the root. Should be an XPath expression.
256 # Text. This is an array of strings (i.e. word tokens).
257 # TODO Think about how to handle this for the case of pre-prepared
258 # collations, where the tokens are in the graph already.
261 isa => 'ArrayRef[Str]',
262 predicate => 'has_text',
267 isa => 'ArrayRef[Str]',
268 predicate => 'has_layertext',
271 has 'is_collated' => (
276 # Path. This is an array of Reading nodes that can be saved during
277 # initialization, but should be cleared before saving in a DB.
280 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
281 predicate => 'has_path',
282 clearer => 'clear_path',
285 ## TODO change the name of this
286 has 'uncorrected_path' => (
288 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
289 clearer => 'clear_uncorrected_path',
292 has 'is_layered' => (
297 # If we set an uncorrected path, ever, remember that we did so.
298 around 'uncorrected_path' => sub {
302 $self->is_layered( 1 );
308 if( $self->has_source ) {
309 my $init_sub = '_init_from_' . $self->sourcetype;
311 # Remove our XML / source objects; we no longer need them.
312 $self->clear_object if $self->has_object;
313 # $self->tradition->collation->make_witness_path( $self );
315 if( $self->sourcetype eq 'collation' ) {
316 $self->is_collated( 1 );
323 return $self->has_file || $self->has_string || $self->has_object;
326 sub _init_from_xmldesc {
329 if( $self->has_object ) {
330 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
331 throw( ident => "bad source",
332 message => "Source object must be an XML::LibXML::Element (this is "
333 . ref( $self->object ) . ");" );
335 $xmlobj = $self->object;
338 my $parser = XML::LibXML->new();
339 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
341 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
342 } catch( XML::LibXML::Error $e ) {
343 throw( ident => "bad source",
344 message => "XML parsing error: " . $e->as_string );
348 unless( $xmlobj->nodeName eq 'TEI' ) {
349 throw( ident => "bad source",
350 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
353 # Set up the tags we need, with or without namespaces.
354 map { $tags{$_} = "//$_" }
355 qw/ msDesc msName settlement repository idno p lg w seg add del /;
356 # Set up our XPath object
357 my $xpc = _xpc_for_el( $xmlobj );
358 # Use namespace-aware tags if we have to
359 if( $xmlobj->namespaceURI ) {
360 map { $tags{$_} = "//tei:$_" } keys %tags;
364 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
365 my $descnode = $desc->get_node(1);
366 # First try to use settlement/repository/idno.
367 my( $setNode, $reposNode, $idNode ) =
368 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
369 $xpc->find( $tags{repository}, $descnode )->get_node(1),
370 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
371 $self->settlement( $setNode ? $setNode->textContent : '' );
372 $self->repository( $reposNode ? $reposNode->textContent : '' );
373 $self->idno( $idNode ? $idNode->textContent : '' );
374 if( $self->settlement && $self->idno ) {
375 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
377 # Look for an msName.
378 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
380 $self->identifier( $msNameNode->textContent );
382 # We have an msDesc but who knows what is in it?
383 my $desc = $descnode->textContent;
386 $self->identifier( $desc );
389 if( $descnode->hasAttribute('xml:id') ) {
390 $self->_set_sigil( $descnode->getAttribute('xml:id') );
391 } elsif( !$self->has_sigil ) {
392 throw( ident => 'missing sigil',
393 message => 'Could not find xml:id witness sigil' );
396 throw( ident => "bad source",
397 message => "Could not find manuscript description element in TEI header" );
400 # Now get the words out.
402 my @layerwords; # if the witness has layers
403 # First, make sure all the words are wrapped in tags.
404 # TODO Make this not necessarily dependent upon whitespace...
405 word_tag_wrap( $xmlobj );
406 # Now go text hunting.
408 if( $self->use_text ) {
409 @textnodes = $xpc->findnodes( $self->use_text );
411 # Use the first 'text' node in the document.
412 @textnodes = $xmlobj->getElementsByTagName( 'text' );
414 my $teitext = $textnodes[0];
416 _tokenize_text( $self, $teitext, \@words, \@layerwords );
418 throw( ident => "bad source",
419 message => "No text element in document '" . $self->{'identifier'} . "!" );
422 my @text = map { $_->text } @words;
423 my @layertext = map { $_->text } @layerwords;
424 $self->path( \@words );
425 $self->text( \@text );
426 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
427 $self->uncorrected_path( \@layerwords );
428 $self->layertext( \@layertext );
433 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
434 # Strip out the words.
435 my $xpc = _xpc_for_el( $teitext );
436 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
439 if( my $n = $_->getAttribute( 'n' ) ) {
440 $place_str = '#DIV_' . $n . '#';
442 $place_str = '#DIV#';
444 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
447 # But maybe we don't have any divs. Just paragraphs.
449 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
453 sub _objectify_words {
454 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
456 my $xpc = _xpc_for_el( $element );
457 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
458 my @pgraphs = $xpc->findnodes( $xpexpr );
459 return () unless @pgraphs;
460 # Set up an expression to look for words and segs
461 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
462 foreach my $pg ( @pgraphs ) {
463 # If this paragraph is the descendant of a note element,
465 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
466 next if scalar @noop_container;
467 # Get the text of each node
469 # Hunt down each wrapped word/seg, and make an object (or two objects)
470 # of it, if necessary.
471 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
472 my( $text, $uncorr ) = _get_word_strings( $c );
474 # ( $text, $uncorr ) = _get_word_object( $c );
475 # } catch( Text::Tradition::Error $e
476 # where { $_->has_tag( 'lb' ) } ) {
479 unless( defined $text || defined $uncorr ) {
480 print STDERR "WARNING: no text in node " . $c->nodeName
481 . "\n" unless $c->nodeName eq 'lb';
484 print STDERR "DEBUG: space found in element node "
485 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
487 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
488 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
491 my $id = $self->sigil . 'r' . $ctr;
492 my( $word, $acword );
494 $word = $self->tradition->collation->add_reading(
495 { 'id' => $id, 'text' => $text });
497 if( $uncorr && $uncorr ne $text ) {
499 $acword = $self->tradition->collation->add_reading(
500 { 'id' => $id, 'text' => $uncorr });
505 # if( $first_word ) {
507 # # Set the relevant sectioning markers
509 # $w->add_placeholder( $divmarker );
510 # $divmarker = undef;
512 # $w->add_placeholder( '#PG#' );
514 push( @$wordlist, $word ) if $word;
515 push( @$uncorrlist, $acword ) if $acword;
520 # Given a word or segment node, make a Reading object for the word
521 # therein. Make two Reading objects if there is an 'uncorrected' vs.
524 sub _get_word_strings {
526 my( $text, $uncorrtext );
527 # We can have an lb or pb in the middle of a word; if we do, the
528 # whitespace (including \n) after the break becomes insignificant
529 # and we want to nuke it.
530 my $strip_leading_space = 0;
531 my $word_excluded = 0;
532 my $xpc = _xpc_for_el( $node );
533 # TODO This does not cope with nested add/dels.
534 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
535 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
536 foreach my $c ($node->childNodes() ) {
537 if( $c->nodeName eq 'num'
538 && defined $c->getAttribute( 'value' ) ) {
540 $text .= $c->getAttribute( 'value' ) unless @deletion;
541 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
542 # If this is just after a line/page break, return to normal behavior.
543 $strip_leading_space = 0;
544 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
545 # Set a flag that strips leading whitespace until we
546 # get to the next bit of non-whitespace.
547 $strip_leading_space = 1;
548 } elsif ( $c->nodeName eq 'fw' # for catchwords
549 || $c->nodeName eq 'sic'
550 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
551 || $c->textContent eq ''
552 || ref( $c ) eq 'XML::LibXML::Comment' ) {
553 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
555 } elsif( $c->nodeName eq 'add' ) {
556 my( $use, $discard ) = _get_word_strings( $c );
558 } elsif( $c->nodeName eq 'del' ) {
559 my( $discard, $use ) = _get_word_strings( $c );
562 my ( $tagtxt, $taguncorr );
563 if( ref( $c ) eq 'XML::LibXML::Text' ) {
565 $tagtxt = $c->textContent;
566 $taguncorr = $c->textContent;
568 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
570 if( $strip_leading_space ) {
571 $tagtxt =~ s/^[\s\n]+//s;
572 $taguncorr =~ s/^[\s\n]+//s;
573 # Unset the flag as soon as we see non-whitespace.
574 $strip_leading_space = 0 if $tagtxt;
577 $uncorrtext .= $taguncorr;
580 throw( ident => "text not found",
581 tags => [ $node->nodeName ],
582 message => "No text found in node " . $node->toString(0) )
583 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
584 return( $text, $uncorrtext );
588 my( $self, $string, $c ) = @_;
589 my @raw_words = split( /\s+/, $string );
591 foreach my $w ( @raw_words ) {
592 my $id = $self->sigil . 'r'. $c++;
593 my %opts = ( 'text' => $w, 'id' => $id );
594 my $w_obj = $self->tradition->collation->add_reading( \%opts );
595 # Skip any words that have been canonized out of existence.
596 next if( length( $w_obj->text ) == 0 );
597 push( @words, $w_obj );
602 sub _init_from_json {
605 if( $self->has_object ) {
606 $wit = $self->object;
607 } elsif( $self->has_string ) {
608 $wit = from_json( $self->string );
609 } elsif( $self->has_file ) {
610 my $ok = open( INPUT, $self->file );
612 throw( ident => "bad source",
613 message => 'Could not open ' . $self->file . ' for reading' );
615 binmode( INPUT, ':encoding(UTF-8)' );
618 $wit = from_json( join( '', @lines ) );
621 if( exists $wit->{'id'} ) {
622 $self->_set_sigil( $wit->{'id'} );
623 } elsif( !$self->has_sigil ) {
624 throw( ident => 'missing sigil',
625 message => 'Could not find witness sigil (id) in JSON spec' );
627 $self->identifier( $wit->{'name'} );
630 my( @text, @layertext );
631 if( exists $wit->{'content'} ) {
632 # We need to tokenize the text ourselves.
633 @words = _split_words( $self, $wit->{'content'} );
634 } elsif( exists $wit->{'tokens'} ) {
635 # We have a bunch of pretokenized words.
637 foreach my $token ( @{$wit->{'tokens'}} ) {
638 my $w_obj = $self->tradition->collation->add_reading({
639 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
640 push( @words, $w_obj );
641 push( @text, $token->{'t'} ); # TODO unless...?
643 ## TODO rethink this JSOn mechanism
644 if( exists $wit->{'layertokens'} ) {
645 foreach my $token ( @{$wit->{'layertokens'}} ) {
646 my $w_obj = $self->tradition->collation->add_reading({
647 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
648 push( @layerwords, $w_obj );
649 push( @layertext, $token->{'t'} );
653 $self->text( \@text );
654 $self->layertext( \@layertext ) if @layertext;
655 $self->path( \@words );
656 $self->uncorrected_path( \@layerwords ) if @layerwords;
659 sub _init_from_plaintext {
661 unless( $self->has_sigil ) {
662 throw( "No sigil defined for the plaintext witness" );
665 if( $self->has_file ) {
666 my $ok = open( INPUT, $self->file );
668 throw( ident => "bad source",
669 message => 'Could not open ' . $self->file . ' for reading' );
671 binmode( INPUT, ':encoding(UTF-8)' );
674 $str = join( '', @lines );
675 } elsif( $self->has_object ) { # ...seriously?
676 $str = ${$self->object};
678 $str = $self->string;
681 # TODO allow a different word separation expression
682 my @text = split( /\s+/, $str );
683 $self->text( \@text );
684 my @words = _split_words( $self, $str );
685 $self->path( \@words );
689 Text::Tradition::Error->throw(
690 'ident' => 'Witness parsing error',
697 my $xpc = XML::LibXML::XPathContext->new( $el );
698 if( $el->namespaceURI ) {
699 $xpc->registerNs( 'tei', $el->namespaceURI );
704 =head2 export_as_json
706 Exports the witness as a JSON structure, with the following keys:
710 =item * id - The witness sigil
712 =item * name - The witness identifier
714 =item * tokens - An array of hashes of the form { "t":"WORD" }
721 my $trad = Text::Tradition->new();
723 my @text = qw/ Thhis is a line of text /;
724 my $wit = $trad->add_witness(
726 'string' => join( ' ', @text ),
727 'sourcetype' => 'plaintext',
728 'identifier' => 'test witness',
730 my $jsonstruct = $wit->export_as_json;
731 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
732 is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
733 is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
734 foreach my $idx ( 0 .. $#text ) {
735 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
738 my @ctext = qw( when april with his showers sweet with fruit the drought of march
739 has pierced unto the root );
740 $trad = Text::Tradition->new(
741 'input' => 'CollateX',
742 'file' => 't/data/Collatex-16.xml' );
744 $jsonstruct = $trad->witness('A')->export_as_json;
745 is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
746 is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
747 is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
748 foreach my $idx ( 0 .. $#ctext ) {
749 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
752 ## TODO test layertext export
760 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
762 'id' => $self->sigil,
763 'tokens' => \@wordlist,
764 'name' => $self->identifier,
766 if( $self->is_layered ) {
767 my @lwlist = map { { 't' => $_ || '' } } @{$self->layertext};
768 $obj->{'layertokens'} = \@lwlist;
774 __PACKAGE__->meta->make_immutable;
780 =item * Figure out how to serialize a witness
782 =item * Support encodings other than UTF-8
788 This package is free software and is provided "as is" without express
789 or implied warranty. You can redistribute it and/or modify it under
790 the same terms as Perl itself.
794 Tara L Andrews E<lt>aurum@cpan.orgE<gt>