load XML::LibXML only when required; handle global relationships more correctly;...
[scpubgit/stemmatology.git] / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
fae52efd 56=item * language - The name of the applicable L<Text::Tradition::Lang>
57module for language handling. Usually inherited from the language set in
58the L<Text::Tradition> object, and defaults to Default.
7158714d 59
60=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 61identifier. Taken from the msDesc element for a TEI file.
7158714d 62
63=item * other_info - A freeform string for any other description of the
fae52efd 64manuscript.
7158714d 65
66=back
67
68=head2 sigil
69
fae52efd 70The sigil by which to identify this manuscript, which must conform to the
71specification for XML attribute strings (broadly speaking, it must begin
72with a letter and can have only a few sorts of punctuation characters in
73it.)
74
75=head2 identifier
76
77A freeform name by which to identify the manuscript, which may be longer
78than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79TEI msName attribute, or constructed from the settlement and idno if
80supplied.
81
82=head2 settlement
83
84The city, town, etc. where the manuscript is held. Will be read from the
85TEI msDesc element if supplied.
86
87=head2 repository
88
89The institution that holds the manuscript. Will be read from the TEI msDesc
90element if supplied.
91
92=head2 idno
93
94The identification or call number of the manuscript. Will be read from the
95TEI msDesc element if supplied.
7158714d 96
97=head2 text
98
fae52efd 99An array of strings (words) that contains the text of the
100manuscript. This should not change after the witness has been
101instantiated, and the path through the collation should always match it.
102
103=head2 layertext
7158714d 104
fae52efd 105An array of strings (words) that contains the layered
106text, if any, of the manuscript. This should not change after the witness
107has been instantiated, and the path through the collation should always
108match it.
7158714d 109
fae52efd 110=head2 language
111
112Accessor method to get the witness language.
7158714d 113
114=head2 identifier
115
116Accessor method for the witness identifier.
117
118=head2 other_info
119
120Accessor method for the general witness description.
121
06e7cbc7 122=head2 has_source
123
124Boolean method that returns a true value if the witness was created with a
125data source (that is, a file, string, or object to be parsed).
126
1f7aa795 127=head2 is_layered
7158714d 128
1f7aa795 129Boolean method to note whether the witness has layers (e.g. pre-correction
130readings) in the collation.
7158714d 131
132=begin testing
133
fae52efd 134use Text::Tradition;
135my $trad = Text::Tradition->new( 'name' => 'test tradition' );
136my $c = $trad->collation;
7158714d 137
fae52efd 138# Test a plaintext witness via string
139my $str = 'This is a line of text';
140my $ptwit = $trad->add_witness(
7158714d 141 'sigil' => 'A',
fae52efd 142 'sourcetype' => 'plaintext',
143 'string' => $str
7158714d 144 );
fae52efd 145is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
146if( $ptwit ) {
147 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
148 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 149}
150
65ed66b9 151# Test some JSON witnesses via object
152open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
153binmode( JSIN, ':encoding(UTF-8)' );
154my @lines = <JSIN>;
155close JSIN;
156$trad->add_json_witnesses( join( '', @lines ) );
157is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
158 "Found first JSON witness" );
159is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
160 "Found second JSON witness" );
161
b39fb0b3 162# Test an XML witness via file
163my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
164 'file' => 't/data/witnesses/teiwit.xml' );
165is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
166if( $xmlwit ) {
167 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
168 ok( $xmlwit->is_layered, "Picked up correction layer" );
169 is( @{$xmlwit->text}, 182, "Got correct text length" );
170 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
171}
172my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
173is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 174
175## Test use_text
b39fb0b3 176my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
177 'file' => 't/data/witnesses/group.xml',
178 'use_text' => '//tei:group/tei:text[2]' );
179is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
180if( $xpwit ) {
181 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
182 ok( !$xpwit->is_layered, "Picked up no correction layer" );
183 is( @{$xpwit->text}, 157, "Got correct text length" );
184}
185
fae52efd 186
7158714d 187=end testing
188
189=cut
190
fae52efd 191subtype 'SourceType',
192 as 'Str',
193 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
194 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
195
196subtype 'Sigil',
197 as 'Str',
198 where { $_ =~ /\A$xml10_name_rx\z/ },
199 message { 'Sigil must be a valid XML attribute string' };
200
201no Moose::Util::TypeConstraints;
202
203has 'tradition' => (
204 'is' => 'ro',
205 'isa' => 'Text::Tradition',
206 'required' => 1,
207 );
208
209# Sigil. Required identifier for a witness, but may be found inside
210# the XML file.
dd3b58b0 211has 'sigil' => (
7158714d 212 is => 'ro',
fae52efd 213 isa => 'Sigil',
214 predicate => 'has_sigil',
215 writer => '_set_sigil',
216 );
217
b39fb0b3 218has 'language' => (
219 is => 'ro',
220 isa => 'Str',
221 default => 'Default',
222 );
223
fae52efd 224# Other identifying information
225has 'identifier' => (
226 is => 'rw',
227 isa => 'Str',
228 );
229
230has 'settlement' => (
231 is => 'rw',
232 isa => 'Str',
233 );
234
235has 'repository' => (
236 is => 'rw',
237 isa => 'Str',
238 );
239
240has 'idno' => (
241 is => 'rw',
242 isa => 'Str',
243 );
244
b39fb0b3 245# Source. Can be XML obj, JSON data struct, or string.
246# Not used if the witness is created by parsing a collation.
fae52efd 247has 'sourcetype' => (
248 is => 'ro',
249 isa => 'SourceType',
250 required => 1,
251);
252
fae52efd 253has 'file' => (
254 is => 'ro',
255 isa => 'Str',
256 predicate => 'has_file',
257);
258
259has 'string' => (
260 is => 'ro',
261 isa => 'Str',
262 predicate => 'has_string',
263);
264
265has 'object' => ( # could be anything.
266 is => 'ro',
267 predicate => 'has_object',
268 clearer => 'clear_object',
269);
270
271# In the case of a TEI document with multiple texts, specify
272# which text is the root. Should be an XPath expression.
273has 'use_text' => (
274 is => 'ro',
7158714d 275 isa => 'Str',
fae52efd 276 );
277
7158714d 278# Text. This is an array of strings (i.e. word tokens).
d047cd52 279# TODO Think about how to handle this for the case of pre-prepared
280# collations, where the tokens are in the graph already.
dd3b58b0 281has 'text' => (
7158714d 282 is => 'rw',
283 isa => 'ArrayRef[Str]',
284 predicate => 'has_text',
285 );
b0b4421a 286
287has 'layertext' => (
288 is => 'rw',
289 isa => 'ArrayRef[Str]',
290 predicate => 'has_layertext',
291 );
fae52efd 292
1f7aa795 293# Path. This is an array of Reading nodes that can be saved during
294# initialization, but should be cleared before saving in a DB.
4a8828f0 295has 'path' => (
7158714d 296 is => 'rw',
297 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
298 predicate => 'has_path',
1f7aa795 299 clearer => 'clear_path',
7158714d 300 );
4a8828f0 301
b39fb0b3 302## TODO change the name of this
b15511bf 303has 'uncorrected_path' => (
7158714d 304 is => 'rw',
305 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 306 clearer => 'clear_uncorrected_path',
307 );
308
309has 'is_layered' => (
310 is => 'rw',
311 isa => 'Bool',
7158714d 312 );
f6066bac 313
1f7aa795 314# If we set an uncorrected path, ever, remember that we did so.
315around 'uncorrected_path' => sub {
316 my $orig = shift;
317 my $self = shift;
318
319 $self->is_layered( 1 );
320 $self->$orig( @_ );
321};
e2902068 322
784877d9 323sub BUILD {
7158714d 324 my $self = shift;
325 if( $self->has_source ) {
fae52efd 326 my $init_sub = '_init_from_' . $self->sourcetype;
327 $self->$init_sub();
328 # Remove our XML / source objects; we no longer need them.
329 $self->clear_object if $self->has_object;
330 $self->tradition->collation->make_witness_path( $self );
331 }
332 return $self;
333}
334
335sub has_source {
336 my $self = shift;
337 return $self->has_file || $self->has_string || $self->has_object;
338}
339
340sub _init_from_xmldesc {
341 my $self = shift;
342 my $xmlobj;
343 if( $self->has_object ) {
344 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
345 throw( ident => "bad source",
346 message => "Source object must be an XML::LibXML::Element (this is "
347 . ref( $self->object ) . ");" );
348 }
349 $xmlobj = $self->object;
350 } else {
428bcf0b 351 require XML::LibXML;
fae52efd 352 my $parser = XML::LibXML->new();
353 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
354 try {
355 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
356 } catch( XML::LibXML::Error $e ) {
357 throw( ident => "bad source",
358 message => "XML parsing error: " . $e->as_string );
359 }
360 }
361
362 unless( $xmlobj->nodeName eq 'TEI' ) {
363 throw( ident => "bad source",
364 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
365 }
366
367 # Set up the tags we need, with or without namespaces.
368 map { $tags{$_} = "//$_" }
369 qw/ msDesc msName settlement repository idno p lg w seg add del /;
370 # Set up our XPath object
371 my $xpc = _xpc_for_el( $xmlobj );
372 # Use namespace-aware tags if we have to
373 if( $xmlobj->namespaceURI ) {
374 map { $tags{$_} = "//tei:$_" } keys %tags;
375 }
376
377 # Get the identifier
378 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
379 my $descnode = $desc->get_node(1);
fae52efd 380 # First try to use settlement/repository/idno.
381 my( $setNode, $reposNode, $idNode ) =
382 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
383 $xpc->find( $tags{repository}, $descnode )->get_node(1),
384 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
385 $self->settlement( $setNode ? $setNode->textContent : '' );
386 $self->repository( $reposNode ? $reposNode->textContent : '' );
387 $self->idno( $idNode ? $idNode->textContent : '' );
388 if( $self->settlement && $self->idno ) {
389 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
390 } else {
391 # Look for an msName.
392 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
393 if( $msNameNode ) {
394 $self->identifier( $msNameNode->textContent );
395 } else {
396 # We have an msDesc but who knows what is in it?
397 my $desc = $descnode->textContent;
398 $desc =~ s/\n/ /gs;
399 $desc =~ s/\s+/ /g;
400 $self->identifier( $desc );
401 }
402 }
403 if( $descnode->hasAttribute('xml:id') ) {
404 $self->_set_sigil( $descnode->getAttribute('xml:id') );
405 } elsif( !$self->has_sigil ) {
65ed66b9 406 throw( ident => 'missing sigil',
407 message => 'Could not find xml:id witness sigil' );
fae52efd 408 }
409 } else {
410 throw( ident => "bad source",
411 message => "Could not find manuscript description element in TEI header" );
412 }
413
414 # Now get the words out.
415 my @words;
416 my @layerwords; # if the witness has layers
417 # First, make sure all the words are wrapped in tags.
418 # TODO Make this not necessarily dependent upon whitespace...
419 word_tag_wrap( $xmlobj );
420 # Now go text hunting.
421 my @textnodes;
422 if( $self->use_text ) {
423 @textnodes = $xpc->findnodes( $self->use_text );
424 } else {
425 # Use the first 'text' node in the document.
426 @textnodes = $xmlobj->getElementsByTagName( 'text' );
427 }
428 my $teitext = $textnodes[0];
429 if( $teitext ) {
430 _tokenize_text( $self, $teitext, \@words, \@layerwords );
431 } else {
432 throw( ident => "bad source",
433 message => "No text element in document '" . $self->{'identifier'} . "!" );
434 }
435
b39fb0b3 436 my @text = map { $_->text } @words;
437 my @layertext = map { $_->text } @layerwords;
fae52efd 438 $self->path( \@words );
b39fb0b3 439 $self->text( \@text );
440 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 441 $self->uncorrected_path( \@layerwords );
b39fb0b3 442 $self->layertext( \@layertext );
fae52efd 443 }
fae52efd 444}
445
446sub _tokenize_text {
447 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
448 # Strip out the words.
449 my $xpc = _xpc_for_el( $teitext );
450 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
451 foreach( @divs ) {
452 my $place_str;
453 if( my $n = $_->getAttribute( 'n' ) ) {
454 $place_str = '#DIV_' . $n . '#';
455 } else {
456 $place_str = '#DIV#';
457 }
458 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
459 } # foreach <div/>
460
461 # But maybe we don't have any divs. Just paragraphs.
462 unless( @divs ) {
463 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
464 }
465}
466
467sub _objectify_words {
468 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
469
470 my $xpc = _xpc_for_el( $element );
471 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
472 my @pgraphs = $xpc->findnodes( $xpexpr );
473 return () unless @pgraphs;
474 # Set up an expression to look for words and segs
475 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
476 foreach my $pg ( @pgraphs ) {
477 # If this paragraph is the descendant of a note element,
478 # skip it.
479 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
480 next if scalar @noop_container;
481 # Get the text of each node
482 my $first_word = 1;
483 # Hunt down each wrapped word/seg, and make an object (or two objects)
484 # of it, if necessary.
485 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 486 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 487# try {
488# ( $text, $uncorr ) = _get_word_object( $c );
489# } catch( Text::Tradition::Error $e
490# where { $_->has_tag( 'lb' ) } ) {
491# next;
492# }
493 unless( defined $text || defined $uncorr ) {
494 print STDERR "WARNING: no text in node " . $c->nodeName
495 . "\n" unless $c->nodeName eq 'lb';
496 next;
497 }
498 print STDERR "DEBUG: space found in element node "
499 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
500
501 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
502 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
503 $ctr++;
504 }
505 my $id = $self->sigil . 'r' . $ctr;
506 my( $word, $acword );
507 if( $text ) {
508 $word = $self->tradition->collation->add_reading(
509 { 'id' => $id, 'text' => $text });
510 }
511 if( $uncorr && $uncorr ne $text ) {
512 $id .= '_ac';
513 $acword = $self->tradition->collation->add_reading(
514 { 'id' => $id, 'text' => $uncorr });
515 } elsif( $uncorr ) {
516 $acword = $word;
517 }
518
519# if( $first_word ) {
520# $first_word = 0;
521# # Set the relevant sectioning markers
522# if( $divmarker ) {
523# $w->add_placeholder( $divmarker );
524# $divmarker = undef;
525# }
526# $w->add_placeholder( '#PG#' );
527# }
528 push( @$wordlist, $word ) if $word;
529 push( @$uncorrlist, $acword ) if $acword;
530 }
531 }
532}
533
534# Given a word or segment node, make a Reading object for the word
535# therein. Make two Reading objects if there is an 'uncorrected' vs.
536# 'corrected' state.
537
538sub _get_word_strings {
539 my( $node ) = @_;
540 my( $text, $uncorrtext );
541 # We can have an lb or pb in the middle of a word; if we do, the
542 # whitespace (including \n) after the break becomes insignificant
543 # and we want to nuke it.
544 my $strip_leading_space = 0;
545 my $word_excluded = 0;
546 my $xpc = _xpc_for_el( $node );
547 # TODO This does not cope with nested add/dels.
b39fb0b3 548 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
549 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 550 foreach my $c ($node->childNodes() ) {
551 if( $c->nodeName eq 'num'
552 && defined $c->getAttribute( 'value' ) ) {
553 # Push the number.
554 $text .= $c->getAttribute( 'value' ) unless @deletion;
555 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
556 # If this is just after a line/page break, return to normal behavior.
557 $strip_leading_space = 0;
558 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
559 # Set a flag that strips leading whitespace until we
560 # get to the next bit of non-whitespace.
561 $strip_leading_space = 1;
562 } elsif ( $c->nodeName eq 'fw' # for catchwords
563 || $c->nodeName eq 'sic'
564 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
565 || $c->textContent eq ''
566 || ref( $c ) eq 'XML::LibXML::Comment' ) {
567 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
568 next;
569 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 570 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 571 $text .= $use;
572 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 573 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 574 $uncorrtext .= $use;
575 } else {
b39fb0b3 576 my ( $tagtxt, $taguncorr );
fae52efd 577 if( ref( $c ) eq 'XML::LibXML::Text' ) {
578 # A text node.
579 $tagtxt = $c->textContent;
b39fb0b3 580 $taguncorr = $c->textContent;
fae52efd 581 } else {
b39fb0b3 582 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 583 }
584 if( $strip_leading_space ) {
585 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 586 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 587 # Unset the flag as soon as we see non-whitespace.
588 $strip_leading_space = 0 if $tagtxt;
589 }
590 $text .= $tagtxt;
b39fb0b3 591 $uncorrtext .= $taguncorr;
fae52efd 592 }
593 }
594 throw( ident => "text not found",
595 tags => [ $node->nodeName ],
596 message => "No text found in node " . $node->toString(0) )
597 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
598 return( $text, $uncorrtext );
599}
600
601sub _split_words {
602 my( $self, $string, $c ) = @_;
603 my @raw_words = split( /\s+/, $string );
604 my @words;
605 foreach my $w ( @raw_words ) {
606 my $id = $self->sigil . 'r'. $c++;
607 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
608 my $w_obj = $self->tradition->collation->add_reading( \%opts );
609 # Skip any words that have been canonized out of existence.
610 next if( length( $w_obj->text ) == 0 );
611 push( @words, $w_obj );
612 }
613 return @words;
614}
615
616sub _init_from_json {
617 my( $self ) = shift;
618 my $wit;
619 if( $self->has_object ) {
620 $wit = $self->object;
65ed66b9 621 } elsif( $self->has_string ) {
622 $wit = from_json( $self->string );
623 } elsif( $self->has_file ) {
624 my $ok = open( INPUT, $self->file );
625 unless( $ok ) {
626 throw( ident => "bad source",
627 message => 'Could not open ' . $self->file . ' for reading' );
628 }
629 binmode( INPUT, ':encoding(UTF-8)' );
630 my @lines = <INPUT>;
631 close INPUT;
632 $wit = from_json( join( '', @lines ) );
fae52efd 633 }
634
65ed66b9 635 if( exists $wit->{'id'} ) {
636 $self->_set_sigil( $wit->{'id'} );
637 } elsif( !$self->has_sigil ) {
638 throw( ident => 'missing sigil',
639 message => 'Could not find witness sigil (id) in JSON spec' );
640 }
fae52efd 641 $self->identifier( $wit->{'name'} );
642 my @words;
643 my @layerwords;
038d6b50 644 my( @text, @layertext );
fae52efd 645 if( exists $wit->{'content'} ) {
646 # We need to tokenize the text ourselves.
647 @words = _split_words( $self, $wit->{'content'} );
648 } elsif( exists $wit->{'tokens'} ) {
649 # We have a bunch of pretokenized words.
650 my $ctr = 0;
651 foreach my $token ( @{$wit->{'tokens'}} ) {
652 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 653 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 654 push( @words, $w_obj );
038d6b50 655 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 656 }
657 ## TODO rethink this JSOn mechanism
658 if( exists $wit->{'layertokens'} ) {
659 foreach my $token ( @{$wit->{'layertokens'}} ) {
660 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 661 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 662 push( @layerwords, $w_obj );
038d6b50 663 push( @layertext, $token->{'t'} );
7158714d 664 }
fae52efd 665 }
7158714d 666 }
038d6b50 667 $self->text( \@text );
668 $self->layertext( \@layertext ) if @layertext;
fae52efd 669 $self->path( \@words );
670 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 671}
672
fae52efd 673sub _init_from_plaintext {
674 my( $self ) = @_;
675 my $str;
676 if( $self->has_file ) {
677 my $ok = open( INPUT, $self->file );
678 unless( $ok ) {
679 throw( ident => "bad source",
680 message => 'Could not open ' . $self->file . ' for reading' );
681 }
682 binmode( INPUT, ':encoding(UTF-8)' );
683 my @lines = <INPUT>;
684 close INPUT;
685 $str = join( '', @lines );
686 } elsif( $self->has_object ) { # ...seriously?
687 $str = ${$self->object};
688 } else {
689 $str = $self->string;
690 }
691
692 # TODO allow a different word separation expression
693 my @text = split( /\s+/, $str );
694 $self->text( \@text );
695 my @words = _split_words( $self, $str );
696 $self->path( \@words );
697}
698
699sub throw {
700 Text::Tradition::Error->throw(
701 'ident' => 'Witness parsing error',
702 'message' => $_[0],
703 );
704}
705
706sub _xpc_for_el {
707 my $el = shift;
708 my $xpc = XML::LibXML::XPathContext->new( $el );
709 if( $el->namespaceURI ) {
710 $xpc->registerNs( 'tei', $el->namespaceURI );
711 }
712 return $xpc;
713}
714
f025e303 715=head2 export_as_json
716
717Exports the witness as a JSON structure, with the following keys:
718
719=over 4
720
721=item * id - The witness sigil
722
723=item * name - The witness identifier
724
725=item * tokens - An array of hashes of the form { "t":"WORD" }
726
727=back
728
729=begin testing
730
731use Text::Tradition;
fae52efd 732my $trad = Text::Tradition->new();
f025e303 733
fae52efd 734my @text = qw/ Thhis is a line of text /;
735my $wit = $trad->add_witness(
f025e303 736 'sigil' => 'A',
fae52efd 737 'string' => join( ' ', @text ),
738 'sourcetype' => 'plaintext',
f025e303 739 'identifier' => 'test witness',
740 );
741my $jsonstruct = $wit->export_as_json;
742is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
743is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
744is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
745foreach my $idx ( 0 .. $#text ) {
746 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
747}
748
749my @ctext = qw( when april with his showers sweet with fruit the drought of march
750 has pierced unto the root );
fae52efd 751$trad = Text::Tradition->new(
f025e303 752 'input' => 'CollateX',
753 'file' => 't/data/Collatex-16.xml' );
754
755$jsonstruct = $trad->witness('A')->export_as_json;
756is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
757is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
758is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
759foreach my $idx ( 0 .. $#ctext ) {
760 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
761}
762
fae52efd 763## TODO test layertext export
764
f025e303 765=end testing
766
767=cut
768
769sub export_as_json {
770 my $self = shift;
771 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 772 my $obj = {
f025e303 773 'id' => $self->sigil,
774 'tokens' => \@wordlist,
775 'name' => $self->identifier,
776 };
fae52efd 777 if( $self->is_layered ) {
778 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
779 $obj->{'layertokens'} = \@lwlist;
780 }
781 return $obj;
f025e303 782}
783
dd3b58b0 784no Moose;
785__PACKAGE__->meta->make_immutable;
7158714d 786
787=head1 BUGS / TODO
788
789=over
790
06e7cbc7 791=item * Figure out how to serialize a witness
792
fae52efd 793=item * Support encodings other than UTF-8
7158714d 794
795=back
796
797=head1 LICENSE
798
799This package is free software and is provided "as is" without express
800or implied warranty. You can redistribute it and/or modify it under
801the same terms as Perl itself.
802
803=head1 AUTHOR
804
805Tara L Andrews E<lt>aurum@cpan.orgE<gt>