make the rest of the tests work with the new Witness
[scpubgit/stemmatology.git] / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
fae52efd 56=item * language - The name of the applicable L<Text::Tradition::Lang>
57module for language handling. Usually inherited from the language set in
58the L<Text::Tradition> object, and defaults to Default.
7158714d 59
60=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 61identifier. Taken from the msDesc element for a TEI file.
7158714d 62
63=item * other_info - A freeform string for any other description of the
fae52efd 64manuscript.
7158714d 65
66=back
67
68=head2 sigil
69
fae52efd 70The sigil by which to identify this manuscript, which must conform to the
71specification for XML attribute strings (broadly speaking, it must begin
72with a letter and can have only a few sorts of punctuation characters in
73it.)
74
75=head2 identifier
76
77A freeform name by which to identify the manuscript, which may be longer
78than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79TEI msName attribute, or constructed from the settlement and idno if
80supplied.
81
82=head2 settlement
83
84The city, town, etc. where the manuscript is held. Will be read from the
85TEI msDesc element if supplied.
86
87=head2 repository
88
89The institution that holds the manuscript. Will be read from the TEI msDesc
90element if supplied.
91
92=head2 idno
93
94The identification or call number of the manuscript. Will be read from the
95TEI msDesc element if supplied.
7158714d 96
97=head2 text
98
fae52efd 99An array of strings (words) that contains the text of the
100manuscript. This should not change after the witness has been
101instantiated, and the path through the collation should always match it.
102
103=head2 layertext
7158714d 104
fae52efd 105An array of strings (words) that contains the layered
106text, if any, of the manuscript. This should not change after the witness
107has been instantiated, and the path through the collation should always
108match it.
7158714d 109
fae52efd 110=head2 language
111
112Accessor method to get the witness language.
7158714d 113
114=head2 identifier
115
116Accessor method for the witness identifier.
117
118=head2 other_info
119
120Accessor method for the general witness description.
121
1f7aa795 122=head2 is_layered
7158714d 123
1f7aa795 124Boolean method to note whether the witness has layers (e.g. pre-correction
125readings) in the collation.
7158714d 126
127=begin testing
128
fae52efd 129use Text::Tradition;
130my $trad = Text::Tradition->new( 'name' => 'test tradition' );
131my $c = $trad->collation;
7158714d 132
fae52efd 133# Test a plaintext witness via string
134my $str = 'This is a line of text';
135my $ptwit = $trad->add_witness(
7158714d 136 'sigil' => 'A',
fae52efd 137 'sourcetype' => 'plaintext',
138 'string' => $str
7158714d 139 );
fae52efd 140is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
141if( $ptwit ) {
142 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
143 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 144}
145
65ed66b9 146# Test some JSON witnesses via object
147open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
148binmode( JSIN, ':encoding(UTF-8)' );
149my @lines = <JSIN>;
150close JSIN;
151$trad->add_json_witnesses( join( '', @lines ) );
152is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
153 "Found first JSON witness" );
154is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
155 "Found second JSON witness" );
156
fae52efd 157# # Test an XML witness via file
158# my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
159# 'file' => 't/data/witnesses/teiwit.xml' );
160# is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
161# if( $xmlwit ) {
162# is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
163# ok( $xmlwit->is_layered, "Picked up correction layer" );
164# is( @{$xmlwit->path}, 185, "Got correct text length" );
165# is( @{$xmlwit->uncorrected_path}, 185, "Got correct a.c. text length" );
166# }
167
168## Test use_text
169
7158714d 170=end testing
171
172=cut
173
fae52efd 174subtype 'SourceType',
175 as 'Str',
176 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
177 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
178
179subtype 'Sigil',
180 as 'Str',
181 where { $_ =~ /\A$xml10_name_rx\z/ },
182 message { 'Sigil must be a valid XML attribute string' };
183
184no Moose::Util::TypeConstraints;
185
186has 'tradition' => (
187 'is' => 'ro',
188 'isa' => 'Text::Tradition',
189 'required' => 1,
190 );
191
192# Sigil. Required identifier for a witness, but may be found inside
193# the XML file.
dd3b58b0 194has 'sigil' => (
7158714d 195 is => 'ro',
fae52efd 196 isa => 'Sigil',
197 predicate => 'has_sigil',
198 writer => '_set_sigil',
199 );
200
201# Other identifying information
202has 'identifier' => (
203 is => 'rw',
204 isa => 'Str',
205 );
206
207has 'settlement' => (
208 is => 'rw',
209 isa => 'Str',
210 );
211
212has 'repository' => (
213 is => 'rw',
214 isa => 'Str',
215 );
216
217has 'idno' => (
218 is => 'rw',
219 isa => 'Str',
220 );
221
222has 'sourcetype' => (
223 is => 'ro',
224 isa => 'SourceType',
225 required => 1,
226);
227
228has 'language' => (
229 is => 'ro',
230 isa => 'Str',
231 default => 'Default',
232 );
233
234# Source. Can be XML obj, JSON data struct, or string.
235# Not used if the witness is created by parsing a collation.
236has 'file' => (
237 is => 'ro',
238 isa => 'Str',
239 predicate => 'has_file',
240);
241
242has 'string' => (
243 is => 'ro',
244 isa => 'Str',
245 predicate => 'has_string',
246);
247
248has 'object' => ( # could be anything.
249 is => 'ro',
250 predicate => 'has_object',
251 clearer => 'clear_object',
252);
253
254# In the case of a TEI document with multiple texts, specify
255# which text is the root. Should be an XPath expression.
256has 'use_text' => (
257 is => 'ro',
7158714d 258 isa => 'Str',
fae52efd 259 );
260
261has 'msdesc' => ( # if we started with a TEI doc
262 is => 'ro',
263 isa => 'XML::LibXML::Element',
264 predicate => 'has_msdesc',
265 writer => '_save_msdesc',
7158714d 266 );
dd3b58b0 267
7158714d 268# Text. This is an array of strings (i.e. word tokens).
d047cd52 269# TODO Think about how to handle this for the case of pre-prepared
270# collations, where the tokens are in the graph already.
dd3b58b0 271has 'text' => (
7158714d 272 is => 'rw',
273 isa => 'ArrayRef[Str]',
274 predicate => 'has_text',
275 );
b0b4421a 276
277has 'layertext' => (
278 is => 'rw',
279 isa => 'ArrayRef[Str]',
280 predicate => 'has_layertext',
281 );
fae52efd 282
1f7aa795 283# Path. This is an array of Reading nodes that can be saved during
284# initialization, but should be cleared before saving in a DB.
4a8828f0 285has 'path' => (
7158714d 286 is => 'rw',
287 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
288 predicate => 'has_path',
1f7aa795 289 clearer => 'clear_path',
7158714d 290 );
4a8828f0 291
b15511bf 292has 'uncorrected_path' => (
7158714d 293 is => 'rw',
294 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 295 clearer => 'clear_uncorrected_path',
296 );
297
298has 'is_layered' => (
299 is => 'rw',
300 isa => 'Bool',
7158714d 301 );
f6066bac 302
1f7aa795 303# If we set an uncorrected path, ever, remember that we did so.
304around 'uncorrected_path' => sub {
305 my $orig = shift;
306 my $self = shift;
307
308 $self->is_layered( 1 );
309 $self->$orig( @_ );
310};
e2902068 311
784877d9 312sub BUILD {
7158714d 313 my $self = shift;
314 if( $self->has_source ) {
fae52efd 315 my $init_sub = '_init_from_' . $self->sourcetype;
316 $self->$init_sub();
317 # Remove our XML / source objects; we no longer need them.
318 $self->clear_object if $self->has_object;
319 $self->tradition->collation->make_witness_path( $self );
320 }
321 return $self;
322}
323
324sub has_source {
325 my $self = shift;
326 return $self->has_file || $self->has_string || $self->has_object;
327}
328
329sub _init_from_xmldesc {
330 my $self = shift;
331 my $xmlobj;
332 if( $self->has_object ) {
333 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
334 throw( ident => "bad source",
335 message => "Source object must be an XML::LibXML::Element (this is "
336 . ref( $self->object ) . ");" );
337 }
338 $xmlobj = $self->object;
339 } else {
340 my $parser = XML::LibXML->new();
341 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
342 try {
343 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
344 } catch( XML::LibXML::Error $e ) {
345 throw( ident => "bad source",
346 message => "XML parsing error: " . $e->as_string );
347 }
348 }
349
350 unless( $xmlobj->nodeName eq 'TEI' ) {
351 throw( ident => "bad source",
352 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
353 }
354
355 # Set up the tags we need, with or without namespaces.
356 map { $tags{$_} = "//$_" }
357 qw/ msDesc msName settlement repository idno p lg w seg add del /;
358 # Set up our XPath object
359 my $xpc = _xpc_for_el( $xmlobj );
360 # Use namespace-aware tags if we have to
361 if( $xmlobj->namespaceURI ) {
362 map { $tags{$_} = "//tei:$_" } keys %tags;
363 }
364
365 # Get the identifier
366 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
367 my $descnode = $desc->get_node(1);
368 $self->_save_msdesc( $descnode );
369 # First try to use settlement/repository/idno.
370 my( $setNode, $reposNode, $idNode ) =
371 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
372 $xpc->find( $tags{repository}, $descnode )->get_node(1),
373 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
374 $self->settlement( $setNode ? $setNode->textContent : '' );
375 $self->repository( $reposNode ? $reposNode->textContent : '' );
376 $self->idno( $idNode ? $idNode->textContent : '' );
377 if( $self->settlement && $self->idno ) {
378 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
379 } else {
380 # Look for an msName.
381 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
382 if( $msNameNode ) {
383 $self->identifier( $msNameNode->textContent );
384 } else {
385 # We have an msDesc but who knows what is in it?
386 my $desc = $descnode->textContent;
387 $desc =~ s/\n/ /gs;
388 $desc =~ s/\s+/ /g;
389 $self->identifier( $desc );
390 }
391 }
392 if( $descnode->hasAttribute('xml:id') ) {
393 $self->_set_sigil( $descnode->getAttribute('xml:id') );
394 } elsif( !$self->has_sigil ) {
65ed66b9 395 throw( ident => 'missing sigil',
396 message => 'Could not find xml:id witness sigil' );
fae52efd 397 }
398 } else {
399 throw( ident => "bad source",
400 message => "Could not find manuscript description element in TEI header" );
401 }
402
403 # Now get the words out.
404 my @words;
405 my @layerwords; # if the witness has layers
406 # First, make sure all the words are wrapped in tags.
407 # TODO Make this not necessarily dependent upon whitespace...
408 word_tag_wrap( $xmlobj );
409 # Now go text hunting.
410 my @textnodes;
411 if( $self->use_text ) {
412 @textnodes = $xpc->findnodes( $self->use_text );
413 } else {
414 # Use the first 'text' node in the document.
415 @textnodes = $xmlobj->getElementsByTagName( 'text' );
416 }
417 my $teitext = $textnodes[0];
418 if( $teitext ) {
419 _tokenize_text( $self, $teitext, \@words, \@layerwords );
420 } else {
421 throw( ident => "bad source",
422 message => "No text element in document '" . $self->{'identifier'} . "!" );
423 }
424
425 $self->path( \@words );
426 my $a = join( ' ', map { $_->text } @words );
427 my $b = join( ' ', map { $_->text } @layerwords );
428 if( $a ne $b ) {
429 $self->uncorrected_path( \@layerwords );
430 }
431 # TODO set self->text
432}
433
434sub _tokenize_text {
435 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
436 # Strip out the words.
437 my $xpc = _xpc_for_el( $teitext );
438 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
439 foreach( @divs ) {
440 my $place_str;
441 if( my $n = $_->getAttribute( 'n' ) ) {
442 $place_str = '#DIV_' . $n . '#';
443 } else {
444 $place_str = '#DIV#';
445 }
446 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
447 } # foreach <div/>
448
449 # But maybe we don't have any divs. Just paragraphs.
450 unless( @divs ) {
451 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
452 }
453}
454
455sub _objectify_words {
456 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
457
458 my $xpc = _xpc_for_el( $element );
459 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
460 my @pgraphs = $xpc->findnodes( $xpexpr );
461 return () unless @pgraphs;
462 # Set up an expression to look for words and segs
463 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
464 foreach my $pg ( @pgraphs ) {
465 # If this paragraph is the descendant of a note element,
466 # skip it.
467 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
468 next if scalar @noop_container;
469 # Get the text of each node
470 my $first_word = 1;
471 # Hunt down each wrapped word/seg, and make an object (or two objects)
472 # of it, if necessary.
473 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
474 my( $text, $uncorr ) = _get_word_object( $c );
475# try {
476# ( $text, $uncorr ) = _get_word_object( $c );
477# } catch( Text::Tradition::Error $e
478# where { $_->has_tag( 'lb' ) } ) {
479# next;
480# }
481 unless( defined $text || defined $uncorr ) {
482 print STDERR "WARNING: no text in node " . $c->nodeName
483 . "\n" unless $c->nodeName eq 'lb';
484 next;
485 }
486 print STDERR "DEBUG: space found in element node "
487 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
488
489 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
490 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
491 $ctr++;
492 }
493 my $id = $self->sigil . 'r' . $ctr;
494 my( $word, $acword );
495 if( $text ) {
496 $word = $self->tradition->collation->add_reading(
497 { 'id' => $id, 'text' => $text });
498 }
499 if( $uncorr && $uncorr ne $text ) {
500 $id .= '_ac';
501 $acword = $self->tradition->collation->add_reading(
502 { 'id' => $id, 'text' => $uncorr });
503 } elsif( $uncorr ) {
504 $acword = $word;
505 }
506
507# if( $first_word ) {
508# $first_word = 0;
509# # Set the relevant sectioning markers
510# if( $divmarker ) {
511# $w->add_placeholder( $divmarker );
512# $divmarker = undef;
513# }
514# $w->add_placeholder( '#PG#' );
515# }
516 push( @$wordlist, $word ) if $word;
517 push( @$uncorrlist, $acword ) if $acword;
518 }
519 }
520}
521
522# Given a word or segment node, make a Reading object for the word
523# therein. Make two Reading objects if there is an 'uncorrected' vs.
524# 'corrected' state.
525
526sub _get_word_strings {
527 my( $node ) = @_;
528 my( $text, $uncorrtext );
529 # We can have an lb or pb in the middle of a word; if we do, the
530 # whitespace (including \n) after the break becomes insignificant
531 # and we want to nuke it.
532 my $strip_leading_space = 0;
533 my $word_excluded = 0;
534 my $xpc = _xpc_for_el( $node );
535 # TODO This does not cope with nested add/dels.
536 my @addition = $xpc->findnodes( 'ancestor::' . $tags{add} );
537 my @deletion = $xpc->findnodes( 'ancestor::' . $tags{del} );
538 foreach my $c ($node->childNodes() ) {
539 if( $c->nodeName eq 'num'
540 && defined $c->getAttribute( 'value' ) ) {
541 # Push the number.
542 $text .= $c->getAttribute( 'value' ) unless @deletion;
543 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
544 # If this is just after a line/page break, return to normal behavior.
545 $strip_leading_space = 0;
546 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
547 # Set a flag that strips leading whitespace until we
548 # get to the next bit of non-whitespace.
549 $strip_leading_space = 1;
550 } elsif ( $c->nodeName eq 'fw' # for catchwords
551 || $c->nodeName eq 'sic'
552 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
553 || $c->textContent eq ''
554 || ref( $c ) eq 'XML::LibXML::Comment' ) {
555 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
556 next;
557 } elsif( $c->nodeName eq 'add' ) {
558 my( $use, $discard ) = _get_text_from_node( $c );
559 $text .= $use;
560 } elsif( $c->nodeName eq 'del' ) {
561 my( $discard, $use ) = _get_text_from_node( $c );
562 $uncorrtext .= $use;
563 } else {
564 my $tagtxt;
565 if( ref( $c ) eq 'XML::LibXML::Text' ) {
566 # A text node.
567 $tagtxt = $c->textContent;
568 } else {
569 $tagtxt = _get_text_from_node( $c );
570 }
571 if( $strip_leading_space ) {
572 $tagtxt =~ s/^[\s\n]+//s;
573 # Unset the flag as soon as we see non-whitespace.
574 $strip_leading_space = 0 if $tagtxt;
575 }
576 $text .= $tagtxt;
577 $uncorrtext .= $tagtxt;
578 }
579 }
580 throw( ident => "text not found",
581 tags => [ $node->nodeName ],
582 message => "No text found in node " . $node->toString(0) )
583 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
584 return( $text, $uncorrtext );
585}
586
587sub _split_words {
588 my( $self, $string, $c ) = @_;
589 my @raw_words = split( /\s+/, $string );
590 my @words;
591 foreach my $w ( @raw_words ) {
592 my $id = $self->sigil . 'r'. $c++;
593 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
594 my $w_obj = $self->tradition->collation->add_reading( \%opts );
595 # Skip any words that have been canonized out of existence.
596 next if( length( $w_obj->text ) == 0 );
597 push( @words, $w_obj );
598 }
599 return @words;
600}
601
602sub _init_from_json {
603 my( $self ) = shift;
604 my $wit;
605 if( $self->has_object ) {
606 $wit = $self->object;
65ed66b9 607 } elsif( $self->has_string ) {
608 $wit = from_json( $self->string );
609 } elsif( $self->has_file ) {
610 my $ok = open( INPUT, $self->file );
611 unless( $ok ) {
612 throw( ident => "bad source",
613 message => 'Could not open ' . $self->file . ' for reading' );
614 }
615 binmode( INPUT, ':encoding(UTF-8)' );
616 my @lines = <INPUT>;
617 close INPUT;
618 $wit = from_json( join( '', @lines ) );
fae52efd 619 }
620
65ed66b9 621 if( exists $wit->{'id'} ) {
622 $self->_set_sigil( $wit->{'id'} );
623 } elsif( !$self->has_sigil ) {
624 throw( ident => 'missing sigil',
625 message => 'Could not find witness sigil (id) in JSON spec' );
626 }
fae52efd 627 $self->identifier( $wit->{'name'} );
628 my @words;
629 my @layerwords;
038d6b50 630 my( @text, @layertext );
fae52efd 631 if( exists $wit->{'content'} ) {
632 # We need to tokenize the text ourselves.
633 @words = _split_words( $self, $wit->{'content'} );
634 } elsif( exists $wit->{'tokens'} ) {
635 # We have a bunch of pretokenized words.
636 my $ctr = 0;
637 foreach my $token ( @{$wit->{'tokens'}} ) {
638 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 639 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 640 push( @words, $w_obj );
038d6b50 641 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 642 }
643 ## TODO rethink this JSOn mechanism
644 if( exists $wit->{'layertokens'} ) {
645 foreach my $token ( @{$wit->{'layertokens'}} ) {
646 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 647 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 648 push( @layerwords, $w_obj );
038d6b50 649 push( @layertext, $token->{'t'} );
7158714d 650 }
fae52efd 651 }
7158714d 652 }
038d6b50 653 $self->text( \@text );
654 $self->layertext( \@layertext ) if @layertext;
fae52efd 655 $self->path( \@words );
656 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 657}
658
fae52efd 659sub _init_from_plaintext {
660 my( $self ) = @_;
661 my $str;
662 if( $self->has_file ) {
663 my $ok = open( INPUT, $self->file );
664 unless( $ok ) {
665 throw( ident => "bad source",
666 message => 'Could not open ' . $self->file . ' for reading' );
667 }
668 binmode( INPUT, ':encoding(UTF-8)' );
669 my @lines = <INPUT>;
670 close INPUT;
671 $str = join( '', @lines );
672 } elsif( $self->has_object ) { # ...seriously?
673 $str = ${$self->object};
674 } else {
675 $str = $self->string;
676 }
677
678 # TODO allow a different word separation expression
679 my @text = split( /\s+/, $str );
680 $self->text( \@text );
681 my @words = _split_words( $self, $str );
682 $self->path( \@words );
683}
684
685sub throw {
686 Text::Tradition::Error->throw(
687 'ident' => 'Witness parsing error',
688 'message' => $_[0],
689 );
690}
691
692sub _xpc_for_el {
693 my $el = shift;
694 my $xpc = XML::LibXML::XPathContext->new( $el );
695 if( $el->namespaceURI ) {
696 $xpc->registerNs( 'tei', $el->namespaceURI );
697 }
698 return $xpc;
699}
700
f025e303 701=head2 export_as_json
702
703Exports the witness as a JSON structure, with the following keys:
704
705=over 4
706
707=item * id - The witness sigil
708
709=item * name - The witness identifier
710
711=item * tokens - An array of hashes of the form { "t":"WORD" }
712
713=back
714
715=begin testing
716
717use Text::Tradition;
fae52efd 718my $trad = Text::Tradition->new();
f025e303 719
fae52efd 720my @text = qw/ Thhis is a line of text /;
721my $wit = $trad->add_witness(
f025e303 722 'sigil' => 'A',
fae52efd 723 'string' => join( ' ', @text ),
724 'sourcetype' => 'plaintext',
f025e303 725 'identifier' => 'test witness',
726 );
727my $jsonstruct = $wit->export_as_json;
728is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
729is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
730is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
731foreach my $idx ( 0 .. $#text ) {
732 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
733}
734
735my @ctext = qw( when april with his showers sweet with fruit the drought of march
736 has pierced unto the root );
fae52efd 737$trad = Text::Tradition->new(
f025e303 738 'input' => 'CollateX',
739 'file' => 't/data/Collatex-16.xml' );
740
741$jsonstruct = $trad->witness('A')->export_as_json;
742is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
743is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
744is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
745foreach my $idx ( 0 .. $#ctext ) {
746 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
747}
748
fae52efd 749## TODO test layertext export
750
f025e303 751=end testing
752
753=cut
754
755sub export_as_json {
756 my $self = shift;
757 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 758 my $obj = {
f025e303 759 'id' => $self->sigil,
760 'tokens' => \@wordlist,
761 'name' => $self->identifier,
762 };
fae52efd 763 if( $self->is_layered ) {
764 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
765 $obj->{'layertokens'} = \@lwlist;
766 }
767 return $obj;
f025e303 768}
769
dd3b58b0 770no Moose;
771__PACKAGE__->meta->make_immutable;
7158714d 772
773=head1 BUGS / TODO
774
775=over
776
fae52efd 777=item * Support encodings other than UTF-8
7158714d 778
779=back
780
781=head1 LICENSE
782
783This package is free software and is provided "as is" without express
784or implied warranty. You can redistribute it and/or modify it under
785the same terms as Perl itself.
786
787=head1 AUTHOR
788
789Tara L Andrews E<lt>aurum@cpan.orgE<gt>