change calling of enum for RT ticket 90694
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
7158714d 56=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 57identifier. Taken from the msDesc element for a TEI file.
7158714d 58
59=item * other_info - A freeform string for any other description of the
fae52efd 60manuscript.
7158714d 61
62=back
63
64=head2 sigil
65
fae52efd 66The sigil by which to identify this manuscript, which must conform to the
67specification for XML attribute strings (broadly speaking, it must begin
68with a letter and can have only a few sorts of punctuation characters in
69it.)
70
71=head2 identifier
72
73A freeform name by which to identify the manuscript, which may be longer
74than the sigil. Defaults to 'Unidentified ms', but will be taken from the
75TEI msName attribute, or constructed from the settlement and idno if
76supplied.
77
78=head2 settlement
79
80The city, town, etc. where the manuscript is held. Will be read from the
81TEI msDesc element if supplied.
82
83=head2 repository
84
85The institution that holds the manuscript. Will be read from the TEI msDesc
86element if supplied.
87
88=head2 idno
89
90The identification or call number of the manuscript. Will be read from the
91TEI msDesc element if supplied.
7158714d 92
93=head2 text
94
fae52efd 95An array of strings (words) that contains the text of the
96manuscript. This should not change after the witness has been
97instantiated, and the path through the collation should always match it.
98
99=head2 layertext
7158714d 100
fae52efd 101An array of strings (words) that contains the layered
102text, if any, of the manuscript. This should not change after the witness
103has been instantiated, and the path through the collation should always
104match it.
7158714d 105
7158714d 106=head2 identifier
107
108Accessor method for the witness identifier.
109
110=head2 other_info
111
112Accessor method for the general witness description.
113
06e7cbc7 114=head2 has_source
115
116Boolean method that returns a true value if the witness was created with a
117data source (that is, a file, string, or object to be parsed).
118
1f7aa795 119=head2 is_layered
7158714d 120
1f7aa795 121Boolean method to note whether the witness has layers (e.g. pre-correction
122readings) in the collation.
7158714d 123
124=begin testing
125
fae52efd 126use Text::Tradition;
127my $trad = Text::Tradition->new( 'name' => 'test tradition' );
128my $c = $trad->collation;
7158714d 129
fae52efd 130# Test a plaintext witness via string
131my $str = 'This is a line of text';
132my $ptwit = $trad->add_witness(
7158714d 133 'sigil' => 'A',
fae52efd 134 'sourcetype' => 'plaintext',
135 'string' => $str
7158714d 136 );
fae52efd 137is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
138if( $ptwit ) {
139 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
248276a2 140 $c->make_witness_path( $ptwit );
fae52efd 141 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 142}
143
65ed66b9 144# Test some JSON witnesses via object
145open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
146binmode( JSIN, ':encoding(UTF-8)' );
147my @lines = <JSIN>;
148close JSIN;
149$trad->add_json_witnesses( join( '', @lines ) );
150is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
151 "Found first JSON witness" );
152is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
153 "Found second JSON witness" );
154
b39fb0b3 155# Test an XML witness via file
156my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
157 'file' => 't/data/witnesses/teiwit.xml' );
158is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
159if( $xmlwit ) {
160 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
161 ok( $xmlwit->is_layered, "Picked up correction layer" );
162 is( @{$xmlwit->text}, 182, "Got correct text length" );
163 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
164}
165my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
166is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 167
168## Test use_text
b39fb0b3 169my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
170 'file' => 't/data/witnesses/group.xml',
171 'use_text' => '//tei:group/tei:text[2]' );
172is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
173if( $xpwit ) {
174 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
175 ok( !$xpwit->is_layered, "Picked up no correction layer" );
176 is( @{$xpwit->text}, 157, "Got correct text length" );
177}
178
fae52efd 179
7158714d 180=end testing
181
182=cut
183
bf7d52b5 184# Enable plugin(s) if available
185eval { with 'Text::Tradition::WitLanguage'; };
186
fae52efd 187subtype 'SourceType',
188 as 'Str',
189 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
190 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
191
192subtype 'Sigil',
193 as 'Str',
194 where { $_ =~ /\A$xml10_name_rx\z/ },
195 message { 'Sigil must be a valid XML attribute string' };
196
197no Moose::Util::TypeConstraints;
198
199has 'tradition' => (
248276a2 200 is => 'ro',
201 isa => 'Text::Tradition',
202 required => 1,
203 weak_ref => 1
fae52efd 204 );
205
206# Sigil. Required identifier for a witness, but may be found inside
207# the XML file.
dd3b58b0 208has 'sigil' => (
7158714d 209 is => 'ro',
fae52efd 210 isa => 'Sigil',
211 predicate => 'has_sigil',
212 writer => '_set_sigil',
213 );
214
215# Other identifying information
216has 'identifier' => (
217 is => 'rw',
218 isa => 'Str',
219 );
220
221has 'settlement' => (
222 is => 'rw',
223 isa => 'Str',
224 );
225
226has 'repository' => (
227 is => 'rw',
228 isa => 'Str',
229 );
230
231has 'idno' => (
232 is => 'rw',
233 isa => 'Str',
234 );
235
b39fb0b3 236# Source. Can be XML obj, JSON data struct, or string.
237# Not used if the witness is created by parsing a collation.
fae52efd 238has 'sourcetype' => (
239 is => 'ro',
240 isa => 'SourceType',
241 required => 1,
242);
243
fae52efd 244has 'file' => (
245 is => 'ro',
246 isa => 'Str',
247 predicate => 'has_file',
248);
249
250has 'string' => (
251 is => 'ro',
252 isa => 'Str',
253 predicate => 'has_string',
254);
255
256has 'object' => ( # could be anything.
257 is => 'ro',
258 predicate => 'has_object',
259 clearer => 'clear_object',
260);
261
262# In the case of a TEI document with multiple texts, specify
263# which text is the root. Should be an XPath expression.
264has 'use_text' => (
265 is => 'ro',
7158714d 266 isa => 'Str',
fae52efd 267 );
268
7158714d 269# Text. This is an array of strings (i.e. word tokens).
d047cd52 270# TODO Think about how to handle this for the case of pre-prepared
271# collations, where the tokens are in the graph already.
dd3b58b0 272has 'text' => (
7158714d 273 is => 'rw',
274 isa => 'ArrayRef[Str]',
275 predicate => 'has_text',
276 );
b0b4421a 277
278has 'layertext' => (
279 is => 'rw',
280 isa => 'ArrayRef[Str]',
281 predicate => 'has_layertext',
282 );
fae52efd 283
4889be4f 284has 'is_collated' => (
285 is => 'rw',
286 isa => 'Bool'
287 );
288
1f7aa795 289# Path. This is an array of Reading nodes that can be saved during
290# initialization, but should be cleared before saving in a DB.
4a8828f0 291has 'path' => (
7158714d 292 is => 'rw',
293 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
294 predicate => 'has_path',
1f7aa795 295 clearer => 'clear_path',
7158714d 296 );
4a8828f0 297
b39fb0b3 298## TODO change the name of this
b15511bf 299has 'uncorrected_path' => (
7158714d 300 is => 'rw',
301 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 302 clearer => 'clear_uncorrected_path',
303 );
304
305has 'is_layered' => (
306 is => 'rw',
307 isa => 'Bool',
7158714d 308 );
f6066bac 309
1f7aa795 310# If we set an uncorrected path, ever, remember that we did so.
311around 'uncorrected_path' => sub {
312 my $orig = shift;
313 my $self = shift;
314
315 $self->is_layered( 1 );
316 $self->$orig( @_ );
317};
e2902068 318
784877d9 319sub BUILD {
7158714d 320 my $self = shift;
321 if( $self->has_source ) {
fae52efd 322 my $init_sub = '_init_from_' . $self->sourcetype;
323 $self->$init_sub();
324 # Remove our XML / source objects; we no longer need them.
325 $self->clear_object if $self->has_object;
4889be4f 326 # $self->tradition->collation->make_witness_path( $self );
327 }
328 if( $self->sourcetype eq 'collation' ) {
329 $self->is_collated( 1 );
fae52efd 330 }
331 return $self;
332}
333
334sub has_source {
335 my $self = shift;
336 return $self->has_file || $self->has_string || $self->has_object;
337}
338
339sub _init_from_xmldesc {
340 my $self = shift;
341 my $xmlobj;
342 if( $self->has_object ) {
343 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
344 throw( ident => "bad source",
345 message => "Source object must be an XML::LibXML::Element (this is "
346 . ref( $self->object ) . ");" );
347 }
348 $xmlobj = $self->object;
349 } else {
428bcf0b 350 require XML::LibXML;
fae52efd 351 my $parser = XML::LibXML->new();
352 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
353 try {
354 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
355 } catch( XML::LibXML::Error $e ) {
356 throw( ident => "bad source",
357 message => "XML parsing error: " . $e->as_string );
358 }
359 }
360
361 unless( $xmlobj->nodeName eq 'TEI' ) {
362 throw( ident => "bad source",
363 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
364 }
365
366 # Set up the tags we need, with or without namespaces.
367 map { $tags{$_} = "//$_" }
368 qw/ msDesc msName settlement repository idno p lg w seg add del /;
369 # Set up our XPath object
370 my $xpc = _xpc_for_el( $xmlobj );
371 # Use namespace-aware tags if we have to
372 if( $xmlobj->namespaceURI ) {
373 map { $tags{$_} = "//tei:$_" } keys %tags;
374 }
375
376 # Get the identifier
377 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
378 my $descnode = $desc->get_node(1);
fae52efd 379 # First try to use settlement/repository/idno.
380 my( $setNode, $reposNode, $idNode ) =
381 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
382 $xpc->find( $tags{repository}, $descnode )->get_node(1),
383 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
384 $self->settlement( $setNode ? $setNode->textContent : '' );
385 $self->repository( $reposNode ? $reposNode->textContent : '' );
386 $self->idno( $idNode ? $idNode->textContent : '' );
387 if( $self->settlement && $self->idno ) {
388 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
389 } else {
390 # Look for an msName.
391 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
392 if( $msNameNode ) {
393 $self->identifier( $msNameNode->textContent );
394 } else {
395 # We have an msDesc but who knows what is in it?
396 my $desc = $descnode->textContent;
397 $desc =~ s/\n/ /gs;
398 $desc =~ s/\s+/ /g;
399 $self->identifier( $desc );
400 }
401 }
402 if( $descnode->hasAttribute('xml:id') ) {
403 $self->_set_sigil( $descnode->getAttribute('xml:id') );
404 } elsif( !$self->has_sigil ) {
65ed66b9 405 throw( ident => 'missing sigil',
406 message => 'Could not find xml:id witness sigil' );
fae52efd 407 }
408 } else {
409 throw( ident => "bad source",
410 message => "Could not find manuscript description element in TEI header" );
411 }
412
413 # Now get the words out.
414 my @words;
415 my @layerwords; # if the witness has layers
416 # First, make sure all the words are wrapped in tags.
417 # TODO Make this not necessarily dependent upon whitespace...
418 word_tag_wrap( $xmlobj );
419 # Now go text hunting.
420 my @textnodes;
421 if( $self->use_text ) {
422 @textnodes = $xpc->findnodes( $self->use_text );
423 } else {
424 # Use the first 'text' node in the document.
425 @textnodes = $xmlobj->getElementsByTagName( 'text' );
426 }
427 my $teitext = $textnodes[0];
428 if( $teitext ) {
429 _tokenize_text( $self, $teitext, \@words, \@layerwords );
430 } else {
431 throw( ident => "bad source",
432 message => "No text element in document '" . $self->{'identifier'} . "!" );
433 }
434
b39fb0b3 435 my @text = map { $_->text } @words;
436 my @layertext = map { $_->text } @layerwords;
fae52efd 437 $self->path( \@words );
b39fb0b3 438 $self->text( \@text );
439 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 440 $self->uncorrected_path( \@layerwords );
b39fb0b3 441 $self->layertext( \@layertext );
fae52efd 442 }
fae52efd 443}
444
445sub _tokenize_text {
446 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
447 # Strip out the words.
448 my $xpc = _xpc_for_el( $teitext );
449 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
450 foreach( @divs ) {
451 my $place_str;
452 if( my $n = $_->getAttribute( 'n' ) ) {
453 $place_str = '#DIV_' . $n . '#';
454 } else {
455 $place_str = '#DIV#';
456 }
457 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
458 } # foreach <div/>
459
460 # But maybe we don't have any divs. Just paragraphs.
461 unless( @divs ) {
462 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
463 }
464}
465
466sub _objectify_words {
467 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
468
469 my $xpc = _xpc_for_el( $element );
470 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
471 my @pgraphs = $xpc->findnodes( $xpexpr );
472 return () unless @pgraphs;
473 # Set up an expression to look for words and segs
474 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
475 foreach my $pg ( @pgraphs ) {
476 # If this paragraph is the descendant of a note element,
477 # skip it.
478 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
479 next if scalar @noop_container;
480 # Get the text of each node
481 my $first_word = 1;
482 # Hunt down each wrapped word/seg, and make an object (or two objects)
483 # of it, if necessary.
484 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 485 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 486# try {
487# ( $text, $uncorr ) = _get_word_object( $c );
488# } catch( Text::Tradition::Error $e
489# where { $_->has_tag( 'lb' ) } ) {
490# next;
491# }
492 unless( defined $text || defined $uncorr ) {
493 print STDERR "WARNING: no text in node " . $c->nodeName
494 . "\n" unless $c->nodeName eq 'lb';
495 next;
496 }
497 print STDERR "DEBUG: space found in element node "
498 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
499
500 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
501 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
502 $ctr++;
503 }
504 my $id = $self->sigil . 'r' . $ctr;
505 my( $word, $acword );
506 if( $text ) {
507 $word = $self->tradition->collation->add_reading(
508 { 'id' => $id, 'text' => $text });
509 }
510 if( $uncorr && $uncorr ne $text ) {
511 $id .= '_ac';
512 $acword = $self->tradition->collation->add_reading(
513 { 'id' => $id, 'text' => $uncorr });
514 } elsif( $uncorr ) {
515 $acword = $word;
516 }
517
518# if( $first_word ) {
519# $first_word = 0;
520# # Set the relevant sectioning markers
521# if( $divmarker ) {
522# $w->add_placeholder( $divmarker );
523# $divmarker = undef;
524# }
525# $w->add_placeholder( '#PG#' );
526# }
527 push( @$wordlist, $word ) if $word;
528 push( @$uncorrlist, $acword ) if $acword;
529 }
530 }
531}
532
533# Given a word or segment node, make a Reading object for the word
534# therein. Make two Reading objects if there is an 'uncorrected' vs.
535# 'corrected' state.
536
537sub _get_word_strings {
538 my( $node ) = @_;
539 my( $text, $uncorrtext );
540 # We can have an lb or pb in the middle of a word; if we do, the
541 # whitespace (including \n) after the break becomes insignificant
542 # and we want to nuke it.
543 my $strip_leading_space = 0;
544 my $word_excluded = 0;
545 my $xpc = _xpc_for_el( $node );
546 # TODO This does not cope with nested add/dels.
b39fb0b3 547 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
548 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 549 foreach my $c ($node->childNodes() ) {
550 if( $c->nodeName eq 'num'
551 && defined $c->getAttribute( 'value' ) ) {
552 # Push the number.
553 $text .= $c->getAttribute( 'value' ) unless @deletion;
554 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
555 # If this is just after a line/page break, return to normal behavior.
556 $strip_leading_space = 0;
557 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
558 # Set a flag that strips leading whitespace until we
559 # get to the next bit of non-whitespace.
560 $strip_leading_space = 1;
561 } elsif ( $c->nodeName eq 'fw' # for catchwords
562 || $c->nodeName eq 'sic'
563 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
564 || $c->textContent eq ''
565 || ref( $c ) eq 'XML::LibXML::Comment' ) {
566 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
567 next;
568 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 569 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 570 $text .= $use;
571 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 572 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 573 $uncorrtext .= $use;
574 } else {
b39fb0b3 575 my ( $tagtxt, $taguncorr );
fae52efd 576 if( ref( $c ) eq 'XML::LibXML::Text' ) {
577 # A text node.
578 $tagtxt = $c->textContent;
b39fb0b3 579 $taguncorr = $c->textContent;
fae52efd 580 } else {
b39fb0b3 581 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 582 }
583 if( $strip_leading_space ) {
584 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 585 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 586 # Unset the flag as soon as we see non-whitespace.
587 $strip_leading_space = 0 if $tagtxt;
588 }
589 $text .= $tagtxt;
b39fb0b3 590 $uncorrtext .= $taguncorr;
fae52efd 591 }
592 }
593 throw( ident => "text not found",
594 tags => [ $node->nodeName ],
595 message => "No text found in node " . $node->toString(0) )
596 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
597 return( $text, $uncorrtext );
598}
599
600sub _split_words {
601 my( $self, $string, $c ) = @_;
602 my @raw_words = split( /\s+/, $string );
603 my @words;
604 foreach my $w ( @raw_words ) {
605 my $id = $self->sigil . 'r'. $c++;
e92d4229 606 my %opts = ( 'text' => $w, 'id' => $id );
fae52efd 607 my $w_obj = $self->tradition->collation->add_reading( \%opts );
608 # Skip any words that have been canonized out of existence.
609 next if( length( $w_obj->text ) == 0 );
610 push( @words, $w_obj );
611 }
612 return @words;
613}
614
615sub _init_from_json {
616 my( $self ) = shift;
617 my $wit;
618 if( $self->has_object ) {
619 $wit = $self->object;
65ed66b9 620 } elsif( $self->has_string ) {
621 $wit = from_json( $self->string );
622 } elsif( $self->has_file ) {
623 my $ok = open( INPUT, $self->file );
624 unless( $ok ) {
625 throw( ident => "bad source",
626 message => 'Could not open ' . $self->file . ' for reading' );
627 }
628 binmode( INPUT, ':encoding(UTF-8)' );
629 my @lines = <INPUT>;
630 close INPUT;
631 $wit = from_json( join( '', @lines ) );
fae52efd 632 }
633
65ed66b9 634 if( exists $wit->{'id'} ) {
635 $self->_set_sigil( $wit->{'id'} );
636 } elsif( !$self->has_sigil ) {
637 throw( ident => 'missing sigil',
638 message => 'Could not find witness sigil (id) in JSON spec' );
639 }
fae52efd 640 $self->identifier( $wit->{'name'} );
641 my @words;
642 my @layerwords;
038d6b50 643 my( @text, @layertext );
fae52efd 644 if( exists $wit->{'content'} ) {
645 # We need to tokenize the text ourselves.
646 @words = _split_words( $self, $wit->{'content'} );
647 } elsif( exists $wit->{'tokens'} ) {
648 # We have a bunch of pretokenized words.
649 my $ctr = 0;
650 foreach my $token ( @{$wit->{'tokens'}} ) {
651 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 652 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 653 push( @words, $w_obj );
038d6b50 654 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 655 }
656 ## TODO rethink this JSOn mechanism
657 if( exists $wit->{'layertokens'} ) {
658 foreach my $token ( @{$wit->{'layertokens'}} ) {
659 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 660 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 661 push( @layerwords, $w_obj );
038d6b50 662 push( @layertext, $token->{'t'} );
7158714d 663 }
fae52efd 664 }
7158714d 665 }
038d6b50 666 $self->text( \@text );
667 $self->layertext( \@layertext ) if @layertext;
fae52efd 668 $self->path( \@words );
669 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 670}
671
fae52efd 672sub _init_from_plaintext {
673 my( $self ) = @_;
bf7d52b5 674 unless( $self->has_sigil ) {
675 throw( "No sigil defined for the plaintext witness" );
676 }
fae52efd 677 my $str;
678 if( $self->has_file ) {
679 my $ok = open( INPUT, $self->file );
680 unless( $ok ) {
681 throw( ident => "bad source",
682 message => 'Could not open ' . $self->file . ' for reading' );
683 }
684 binmode( INPUT, ':encoding(UTF-8)' );
685 my @lines = <INPUT>;
686 close INPUT;
687 $str = join( '', @lines );
688 } elsif( $self->has_object ) { # ...seriously?
689 $str = ${$self->object};
690 } else {
691 $str = $self->string;
692 }
693
694 # TODO allow a different word separation expression
695 my @text = split( /\s+/, $str );
696 $self->text( \@text );
697 my @words = _split_words( $self, $str );
698 $self->path( \@words );
699}
700
701sub throw {
702 Text::Tradition::Error->throw(
703 'ident' => 'Witness parsing error',
704 'message' => $_[0],
705 );
706}
707
708sub _xpc_for_el {
709 my $el = shift;
710 my $xpc = XML::LibXML::XPathContext->new( $el );
711 if( $el->namespaceURI ) {
712 $xpc->registerNs( 'tei', $el->namespaceURI );
713 }
714 return $xpc;
715}
716
f025e303 717=head2 export_as_json
718
719Exports the witness as a JSON structure, with the following keys:
720
721=over 4
722
723=item * id - The witness sigil
724
725=item * name - The witness identifier
726
727=item * tokens - An array of hashes of the form { "t":"WORD" }
728
729=back
730
731=begin testing
732
733use Text::Tradition;
fae52efd 734my $trad = Text::Tradition->new();
f025e303 735
fae52efd 736my @text = qw/ Thhis is a line of text /;
737my $wit = $trad->add_witness(
f025e303 738 'sigil' => 'A',
fae52efd 739 'string' => join( ' ', @text ),
740 'sourcetype' => 'plaintext',
f025e303 741 'identifier' => 'test witness',
742 );
743my $jsonstruct = $wit->export_as_json;
744is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
745is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
746is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
747foreach my $idx ( 0 .. $#text ) {
748 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
749}
750
751my @ctext = qw( when april with his showers sweet with fruit the drought of march
752 has pierced unto the root );
fae52efd 753$trad = Text::Tradition->new(
f025e303 754 'input' => 'CollateX',
755 'file' => 't/data/Collatex-16.xml' );
756
757$jsonstruct = $trad->witness('A')->export_as_json;
758is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
759is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
760is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
761foreach my $idx ( 0 .. $#ctext ) {
762 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
763}
764
fae52efd 765## TODO test layertext export
766
f025e303 767=end testing
768
769=cut
770
771sub export_as_json {
772 my $self = shift;
773 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 774 my $obj = {
f025e303 775 'id' => $self->sigil,
776 'tokens' => \@wordlist,
777 'name' => $self->identifier,
778 };
fae52efd 779 if( $self->is_layered ) {
248276a2 780 my @lwlist = map { { 't' => $_ || '' } } @{$self->layertext};
fae52efd 781 $obj->{'layertokens'} = \@lwlist;
782 }
783 return $obj;
f025e303 784}
785
dd3b58b0 786no Moose;
787__PACKAGE__->meta->make_immutable;
7158714d 788
789=head1 BUGS / TODO
790
791=over
792
06e7cbc7 793=item * Figure out how to serialize a witness
794
fae52efd 795=item * Support encodings other than UTF-8
7158714d 796
797=back
798
799=head1 LICENSE
800
801This package is free software and is provided "as is" without express
802or implied warranty. You can redistribute it and/or modify it under
803the same terms as Perl itself.
804
805=head1 AUTHOR
806
807Tara L Andrews E<lt>aurum@cpan.orgE<gt>