load extensions statically to avoid bad object wrapping interactions
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
fae52efd 56=item * language - The name of the applicable L<Text::Tradition::Lang>
57module for language handling. Usually inherited from the language set in
58the L<Text::Tradition> object, and defaults to Default.
7158714d 59
60=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 61identifier. Taken from the msDesc element for a TEI file.
7158714d 62
63=item * other_info - A freeform string for any other description of the
fae52efd 64manuscript.
7158714d 65
66=back
67
68=head2 sigil
69
fae52efd 70The sigil by which to identify this manuscript, which must conform to the
71specification for XML attribute strings (broadly speaking, it must begin
72with a letter and can have only a few sorts of punctuation characters in
73it.)
74
75=head2 identifier
76
77A freeform name by which to identify the manuscript, which may be longer
78than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79TEI msName attribute, or constructed from the settlement and idno if
80supplied.
81
82=head2 settlement
83
84The city, town, etc. where the manuscript is held. Will be read from the
85TEI msDesc element if supplied.
86
87=head2 repository
88
89The institution that holds the manuscript. Will be read from the TEI msDesc
90element if supplied.
91
92=head2 idno
93
94The identification or call number of the manuscript. Will be read from the
95TEI msDesc element if supplied.
7158714d 96
97=head2 text
98
fae52efd 99An array of strings (words) that contains the text of the
100manuscript. This should not change after the witness has been
101instantiated, and the path through the collation should always match it.
102
103=head2 layertext
7158714d 104
fae52efd 105An array of strings (words) that contains the layered
106text, if any, of the manuscript. This should not change after the witness
107has been instantiated, and the path through the collation should always
108match it.
7158714d 109
fae52efd 110=head2 language
111
112Accessor method to get the witness language.
7158714d 113
114=head2 identifier
115
116Accessor method for the witness identifier.
117
118=head2 other_info
119
120Accessor method for the general witness description.
121
06e7cbc7 122=head2 has_source
123
124Boolean method that returns a true value if the witness was created with a
125data source (that is, a file, string, or object to be parsed).
126
1f7aa795 127=head2 is_layered
7158714d 128
1f7aa795 129Boolean method to note whether the witness has layers (e.g. pre-correction
130readings) in the collation.
7158714d 131
132=begin testing
133
fae52efd 134use Text::Tradition;
135my $trad = Text::Tradition->new( 'name' => 'test tradition' );
136my $c = $trad->collation;
7158714d 137
fae52efd 138# Test a plaintext witness via string
139my $str = 'This is a line of text';
140my $ptwit = $trad->add_witness(
7158714d 141 'sigil' => 'A',
fae52efd 142 'sourcetype' => 'plaintext',
143 'string' => $str
7158714d 144 );
fae52efd 145is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
146if( $ptwit ) {
147 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
148 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 149}
150
65ed66b9 151# Test some JSON witnesses via object
152open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
153binmode( JSIN, ':encoding(UTF-8)' );
154my @lines = <JSIN>;
155close JSIN;
156$trad->add_json_witnesses( join( '', @lines ) );
157is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
158 "Found first JSON witness" );
159is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
160 "Found second JSON witness" );
161
b39fb0b3 162# Test an XML witness via file
163my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
164 'file' => 't/data/witnesses/teiwit.xml' );
165is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
166if( $xmlwit ) {
167 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
168 ok( $xmlwit->is_layered, "Picked up correction layer" );
169 is( @{$xmlwit->text}, 182, "Got correct text length" );
170 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
171}
172my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
173is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 174
175## Test use_text
b39fb0b3 176my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
177 'file' => 't/data/witnesses/group.xml',
178 'use_text' => '//tei:group/tei:text[2]' );
179is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
180if( $xpwit ) {
181 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
182 ok( !$xpwit->is_layered, "Picked up no correction layer" );
183 is( @{$xpwit->text}, 157, "Got correct text length" );
184}
185
fae52efd 186
7158714d 187=end testing
188
189=cut
190
fae52efd 191subtype 'SourceType',
192 as 'Str',
193 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
194 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
195
196subtype 'Sigil',
197 as 'Str',
198 where { $_ =~ /\A$xml10_name_rx\z/ },
199 message { 'Sigil must be a valid XML attribute string' };
200
201no Moose::Util::TypeConstraints;
202
203has 'tradition' => (
204 'is' => 'ro',
205 'isa' => 'Text::Tradition',
206 'required' => 1,
5c0072ef 207 weak_ref => 1
fae52efd 208 );
209
210# Sigil. Required identifier for a witness, but may be found inside
211# the XML file.
dd3b58b0 212has 'sigil' => (
7158714d 213 is => 'ro',
fae52efd 214 isa => 'Sigil',
215 predicate => 'has_sigil',
216 writer => '_set_sigil',
217 );
218
b39fb0b3 219has 'language' => (
220 is => 'ro',
221 isa => 'Str',
222 default => 'Default',
223 );
224
fae52efd 225# Other identifying information
226has 'identifier' => (
227 is => 'rw',
228 isa => 'Str',
229 );
230
231has 'settlement' => (
232 is => 'rw',
233 isa => 'Str',
234 );
235
236has 'repository' => (
237 is => 'rw',
238 isa => 'Str',
239 );
240
241has 'idno' => (
242 is => 'rw',
243 isa => 'Str',
244 );
245
b39fb0b3 246# Source. Can be XML obj, JSON data struct, or string.
247# Not used if the witness is created by parsing a collation.
fae52efd 248has 'sourcetype' => (
249 is => 'ro',
250 isa => 'SourceType',
251 required => 1,
252);
253
fae52efd 254has 'file' => (
255 is => 'ro',
256 isa => 'Str',
257 predicate => 'has_file',
258);
259
260has 'string' => (
261 is => 'ro',
262 isa => 'Str',
263 predicate => 'has_string',
264);
265
266has 'object' => ( # could be anything.
267 is => 'ro',
268 predicate => 'has_object',
269 clearer => 'clear_object',
270);
271
272# In the case of a TEI document with multiple texts, specify
273# which text is the root. Should be an XPath expression.
274has 'use_text' => (
275 is => 'ro',
7158714d 276 isa => 'Str',
fae52efd 277 );
278
7158714d 279# Text. This is an array of strings (i.e. word tokens).
d047cd52 280# TODO Think about how to handle this for the case of pre-prepared
281# collations, where the tokens are in the graph already.
dd3b58b0 282has 'text' => (
7158714d 283 is => 'rw',
284 isa => 'ArrayRef[Str]',
285 predicate => 'has_text',
286 );
b0b4421a 287
288has 'layertext' => (
289 is => 'rw',
290 isa => 'ArrayRef[Str]',
291 predicate => 'has_layertext',
292 );
fae52efd 293
1f7aa795 294# Path. This is an array of Reading nodes that can be saved during
295# initialization, but should be cleared before saving in a DB.
4a8828f0 296has 'path' => (
7158714d 297 is => 'rw',
298 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
299 predicate => 'has_path',
1f7aa795 300 clearer => 'clear_path',
7158714d 301 );
4a8828f0 302
b39fb0b3 303## TODO change the name of this
b15511bf 304has 'uncorrected_path' => (
7158714d 305 is => 'rw',
306 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 307 clearer => 'clear_uncorrected_path',
308 );
309
310has 'is_layered' => (
311 is => 'rw',
312 isa => 'Bool',
7158714d 313 );
f6066bac 314
1f7aa795 315# If we set an uncorrected path, ever, remember that we did so.
316around 'uncorrected_path' => sub {
317 my $orig = shift;
318 my $self = shift;
319
320 $self->is_layered( 1 );
321 $self->$orig( @_ );
322};
e2902068 323
784877d9 324sub BUILD {
7158714d 325 my $self = shift;
326 if( $self->has_source ) {
fae52efd 327 my $init_sub = '_init_from_' . $self->sourcetype;
328 $self->$init_sub();
329 # Remove our XML / source objects; we no longer need them.
330 $self->clear_object if $self->has_object;
331 $self->tradition->collation->make_witness_path( $self );
332 }
333 return $self;
334}
335
336sub has_source {
337 my $self = shift;
338 return $self->has_file || $self->has_string || $self->has_object;
339}
340
341sub _init_from_xmldesc {
342 my $self = shift;
343 my $xmlobj;
344 if( $self->has_object ) {
345 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
346 throw( ident => "bad source",
347 message => "Source object must be an XML::LibXML::Element (this is "
348 . ref( $self->object ) . ");" );
349 }
350 $xmlobj = $self->object;
351 } else {
428bcf0b 352 require XML::LibXML;
fae52efd 353 my $parser = XML::LibXML->new();
354 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
355 try {
356 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
357 } catch( XML::LibXML::Error $e ) {
358 throw( ident => "bad source",
359 message => "XML parsing error: " . $e->as_string );
360 }
361 }
362
363 unless( $xmlobj->nodeName eq 'TEI' ) {
364 throw( ident => "bad source",
365 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
366 }
367
368 # Set up the tags we need, with or without namespaces.
369 map { $tags{$_} = "//$_" }
370 qw/ msDesc msName settlement repository idno p lg w seg add del /;
371 # Set up our XPath object
372 my $xpc = _xpc_for_el( $xmlobj );
373 # Use namespace-aware tags if we have to
374 if( $xmlobj->namespaceURI ) {
375 map { $tags{$_} = "//tei:$_" } keys %tags;
376 }
377
378 # Get the identifier
379 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
380 my $descnode = $desc->get_node(1);
fae52efd 381 # First try to use settlement/repository/idno.
382 my( $setNode, $reposNode, $idNode ) =
383 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
384 $xpc->find( $tags{repository}, $descnode )->get_node(1),
385 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
386 $self->settlement( $setNode ? $setNode->textContent : '' );
387 $self->repository( $reposNode ? $reposNode->textContent : '' );
388 $self->idno( $idNode ? $idNode->textContent : '' );
389 if( $self->settlement && $self->idno ) {
390 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
391 } else {
392 # Look for an msName.
393 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
394 if( $msNameNode ) {
395 $self->identifier( $msNameNode->textContent );
396 } else {
397 # We have an msDesc but who knows what is in it?
398 my $desc = $descnode->textContent;
399 $desc =~ s/\n/ /gs;
400 $desc =~ s/\s+/ /g;
401 $self->identifier( $desc );
402 }
403 }
404 if( $descnode->hasAttribute('xml:id') ) {
405 $self->_set_sigil( $descnode->getAttribute('xml:id') );
406 } elsif( !$self->has_sigil ) {
65ed66b9 407 throw( ident => 'missing sigil',
408 message => 'Could not find xml:id witness sigil' );
fae52efd 409 }
410 } else {
411 throw( ident => "bad source",
412 message => "Could not find manuscript description element in TEI header" );
413 }
414
415 # Now get the words out.
416 my @words;
417 my @layerwords; # if the witness has layers
418 # First, make sure all the words are wrapped in tags.
419 # TODO Make this not necessarily dependent upon whitespace...
420 word_tag_wrap( $xmlobj );
421 # Now go text hunting.
422 my @textnodes;
423 if( $self->use_text ) {
424 @textnodes = $xpc->findnodes( $self->use_text );
425 } else {
426 # Use the first 'text' node in the document.
427 @textnodes = $xmlobj->getElementsByTagName( 'text' );
428 }
429 my $teitext = $textnodes[0];
430 if( $teitext ) {
431 _tokenize_text( $self, $teitext, \@words, \@layerwords );
432 } else {
433 throw( ident => "bad source",
434 message => "No text element in document '" . $self->{'identifier'} . "!" );
435 }
436
b39fb0b3 437 my @text = map { $_->text } @words;
438 my @layertext = map { $_->text } @layerwords;
fae52efd 439 $self->path( \@words );
b39fb0b3 440 $self->text( \@text );
441 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 442 $self->uncorrected_path( \@layerwords );
b39fb0b3 443 $self->layertext( \@layertext );
fae52efd 444 }
fae52efd 445}
446
447sub _tokenize_text {
448 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
449 # Strip out the words.
450 my $xpc = _xpc_for_el( $teitext );
451 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
452 foreach( @divs ) {
453 my $place_str;
454 if( my $n = $_->getAttribute( 'n' ) ) {
455 $place_str = '#DIV_' . $n . '#';
456 } else {
457 $place_str = '#DIV#';
458 }
459 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
460 } # foreach <div/>
461
462 # But maybe we don't have any divs. Just paragraphs.
463 unless( @divs ) {
464 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
465 }
466}
467
468sub _objectify_words {
469 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
470
471 my $xpc = _xpc_for_el( $element );
472 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
473 my @pgraphs = $xpc->findnodes( $xpexpr );
474 return () unless @pgraphs;
475 # Set up an expression to look for words and segs
476 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
477 foreach my $pg ( @pgraphs ) {
478 # If this paragraph is the descendant of a note element,
479 # skip it.
480 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
481 next if scalar @noop_container;
482 # Get the text of each node
483 my $first_word = 1;
484 # Hunt down each wrapped word/seg, and make an object (or two objects)
485 # of it, if necessary.
486 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 487 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 488# try {
489# ( $text, $uncorr ) = _get_word_object( $c );
490# } catch( Text::Tradition::Error $e
491# where { $_->has_tag( 'lb' ) } ) {
492# next;
493# }
494 unless( defined $text || defined $uncorr ) {
495 print STDERR "WARNING: no text in node " . $c->nodeName
496 . "\n" unless $c->nodeName eq 'lb';
497 next;
498 }
499 print STDERR "DEBUG: space found in element node "
500 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
501
502 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
503 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
504 $ctr++;
505 }
506 my $id = $self->sigil . 'r' . $ctr;
507 my( $word, $acword );
508 if( $text ) {
509 $word = $self->tradition->collation->add_reading(
510 { 'id' => $id, 'text' => $text });
511 }
512 if( $uncorr && $uncorr ne $text ) {
513 $id .= '_ac';
514 $acword = $self->tradition->collation->add_reading(
515 { 'id' => $id, 'text' => $uncorr });
516 } elsif( $uncorr ) {
517 $acword = $word;
518 }
519
520# if( $first_word ) {
521# $first_word = 0;
522# # Set the relevant sectioning markers
523# if( $divmarker ) {
524# $w->add_placeholder( $divmarker );
525# $divmarker = undef;
526# }
527# $w->add_placeholder( '#PG#' );
528# }
529 push( @$wordlist, $word ) if $word;
530 push( @$uncorrlist, $acword ) if $acword;
531 }
532 }
533}
534
535# Given a word or segment node, make a Reading object for the word
536# therein. Make two Reading objects if there is an 'uncorrected' vs.
537# 'corrected' state.
538
539sub _get_word_strings {
540 my( $node ) = @_;
541 my( $text, $uncorrtext );
542 # We can have an lb or pb in the middle of a word; if we do, the
543 # whitespace (including \n) after the break becomes insignificant
544 # and we want to nuke it.
545 my $strip_leading_space = 0;
546 my $word_excluded = 0;
547 my $xpc = _xpc_for_el( $node );
548 # TODO This does not cope with nested add/dels.
b39fb0b3 549 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
550 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 551 foreach my $c ($node->childNodes() ) {
552 if( $c->nodeName eq 'num'
553 && defined $c->getAttribute( 'value' ) ) {
554 # Push the number.
555 $text .= $c->getAttribute( 'value' ) unless @deletion;
556 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
557 # If this is just after a line/page break, return to normal behavior.
558 $strip_leading_space = 0;
559 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
560 # Set a flag that strips leading whitespace until we
561 # get to the next bit of non-whitespace.
562 $strip_leading_space = 1;
563 } elsif ( $c->nodeName eq 'fw' # for catchwords
564 || $c->nodeName eq 'sic'
565 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
566 || $c->textContent eq ''
567 || ref( $c ) eq 'XML::LibXML::Comment' ) {
568 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
569 next;
570 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 571 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 572 $text .= $use;
573 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 574 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 575 $uncorrtext .= $use;
576 } else {
b39fb0b3 577 my ( $tagtxt, $taguncorr );
fae52efd 578 if( ref( $c ) eq 'XML::LibXML::Text' ) {
579 # A text node.
580 $tagtxt = $c->textContent;
b39fb0b3 581 $taguncorr = $c->textContent;
fae52efd 582 } else {
b39fb0b3 583 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 584 }
585 if( $strip_leading_space ) {
586 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 587 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 588 # Unset the flag as soon as we see non-whitespace.
589 $strip_leading_space = 0 if $tagtxt;
590 }
591 $text .= $tagtxt;
b39fb0b3 592 $uncorrtext .= $taguncorr;
fae52efd 593 }
594 }
595 throw( ident => "text not found",
596 tags => [ $node->nodeName ],
597 message => "No text found in node " . $node->toString(0) )
598 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
599 return( $text, $uncorrtext );
600}
601
602sub _split_words {
603 my( $self, $string, $c ) = @_;
604 my @raw_words = split( /\s+/, $string );
605 my @words;
606 foreach my $w ( @raw_words ) {
607 my $id = $self->sigil . 'r'. $c++;
608 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
609 my $w_obj = $self->tradition->collation->add_reading( \%opts );
610 # Skip any words that have been canonized out of existence.
611 next if( length( $w_obj->text ) == 0 );
612 push( @words, $w_obj );
613 }
614 return @words;
615}
616
617sub _init_from_json {
618 my( $self ) = shift;
619 my $wit;
620 if( $self->has_object ) {
621 $wit = $self->object;
65ed66b9 622 } elsif( $self->has_string ) {
623 $wit = from_json( $self->string );
624 } elsif( $self->has_file ) {
625 my $ok = open( INPUT, $self->file );
626 unless( $ok ) {
627 throw( ident => "bad source",
628 message => 'Could not open ' . $self->file . ' for reading' );
629 }
630 binmode( INPUT, ':encoding(UTF-8)' );
631 my @lines = <INPUT>;
632 close INPUT;
633 $wit = from_json( join( '', @lines ) );
fae52efd 634 }
635
65ed66b9 636 if( exists $wit->{'id'} ) {
637 $self->_set_sigil( $wit->{'id'} );
638 } elsif( !$self->has_sigil ) {
639 throw( ident => 'missing sigil',
640 message => 'Could not find witness sigil (id) in JSON spec' );
641 }
fae52efd 642 $self->identifier( $wit->{'name'} );
643 my @words;
644 my @layerwords;
038d6b50 645 my( @text, @layertext );
fae52efd 646 if( exists $wit->{'content'} ) {
647 # We need to tokenize the text ourselves.
648 @words = _split_words( $self, $wit->{'content'} );
649 } elsif( exists $wit->{'tokens'} ) {
650 # We have a bunch of pretokenized words.
651 my $ctr = 0;
652 foreach my $token ( @{$wit->{'tokens'}} ) {
653 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 654 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 655 push( @words, $w_obj );
038d6b50 656 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 657 }
658 ## TODO rethink this JSOn mechanism
659 if( exists $wit->{'layertokens'} ) {
660 foreach my $token ( @{$wit->{'layertokens'}} ) {
661 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 662 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 663 push( @layerwords, $w_obj );
038d6b50 664 push( @layertext, $token->{'t'} );
7158714d 665 }
fae52efd 666 }
7158714d 667 }
038d6b50 668 $self->text( \@text );
669 $self->layertext( \@layertext ) if @layertext;
fae52efd 670 $self->path( \@words );
671 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 672}
673
fae52efd 674sub _init_from_plaintext {
675 my( $self ) = @_;
676 my $str;
677 if( $self->has_file ) {
678 my $ok = open( INPUT, $self->file );
679 unless( $ok ) {
680 throw( ident => "bad source",
681 message => 'Could not open ' . $self->file . ' for reading' );
682 }
683 binmode( INPUT, ':encoding(UTF-8)' );
684 my @lines = <INPUT>;
685 close INPUT;
686 $str = join( '', @lines );
687 } elsif( $self->has_object ) { # ...seriously?
688 $str = ${$self->object};
689 } else {
690 $str = $self->string;
691 }
692
693 # TODO allow a different word separation expression
694 my @text = split( /\s+/, $str );
695 $self->text( \@text );
696 my @words = _split_words( $self, $str );
697 $self->path( \@words );
698}
699
700sub throw {
701 Text::Tradition::Error->throw(
702 'ident' => 'Witness parsing error',
703 'message' => $_[0],
704 );
705}
706
707sub _xpc_for_el {
708 my $el = shift;
709 my $xpc = XML::LibXML::XPathContext->new( $el );
710 if( $el->namespaceURI ) {
711 $xpc->registerNs( 'tei', $el->namespaceURI );
712 }
713 return $xpc;
714}
715
f025e303 716=head2 export_as_json
717
718Exports the witness as a JSON structure, with the following keys:
719
720=over 4
721
722=item * id - The witness sigil
723
724=item * name - The witness identifier
725
726=item * tokens - An array of hashes of the form { "t":"WORD" }
727
728=back
729
730=begin testing
731
732use Text::Tradition;
fae52efd 733my $trad = Text::Tradition->new();
f025e303 734
fae52efd 735my @text = qw/ Thhis is a line of text /;
736my $wit = $trad->add_witness(
f025e303 737 'sigil' => 'A',
fae52efd 738 'string' => join( ' ', @text ),
739 'sourcetype' => 'plaintext',
f025e303 740 'identifier' => 'test witness',
741 );
742my $jsonstruct = $wit->export_as_json;
743is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
744is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
745is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
746foreach my $idx ( 0 .. $#text ) {
747 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
748}
749
750my @ctext = qw( when april with his showers sweet with fruit the drought of march
751 has pierced unto the root );
fae52efd 752$trad = Text::Tradition->new(
f025e303 753 'input' => 'CollateX',
754 'file' => 't/data/Collatex-16.xml' );
755
756$jsonstruct = $trad->witness('A')->export_as_json;
757is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
758is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
759is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
760foreach my $idx ( 0 .. $#ctext ) {
761 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
762}
763
fae52efd 764## TODO test layertext export
765
f025e303 766=end testing
767
768=cut
769
770sub export_as_json {
771 my $self = shift;
772 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 773 my $obj = {
f025e303 774 'id' => $self->sigil,
775 'tokens' => \@wordlist,
776 'name' => $self->identifier,
777 };
fae52efd 778 if( $self->is_layered ) {
779 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
780 $obj->{'layertokens'} = \@lwlist;
781 }
782 return $obj;
f025e303 783}
784
dd3b58b0 785no Moose;
786__PACKAGE__->meta->make_immutable;
7158714d 787
788=head1 BUGS / TODO
789
790=over
791
06e7cbc7 792=item * Figure out how to serialize a witness
793
fae52efd 794=item * Support encodings other than UTF-8
7158714d 795
796=back
797
798=head1 LICENSE
799
800This package is free software and is provided "as is" without express
801or implied warranty. You can redistribute it and/or modify it under
802the same terms as Perl itself.
803
804=head1 AUTHOR
805
806Tara L Andrews E<lt>aurum@cpan.orgE<gt>