restore legacy attribute until we can remove it from db records
[scpubgit/stemmatology.git] / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
fae52efd 56=item * language - The name of the applicable L<Text::Tradition::Lang>
57module for language handling. Usually inherited from the language set in
58the L<Text::Tradition> object, and defaults to Default.
7158714d 59
60=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 61identifier. Taken from the msDesc element for a TEI file.
7158714d 62
63=item * other_info - A freeform string for any other description of the
fae52efd 64manuscript.
7158714d 65
66=back
67
68=head2 sigil
69
fae52efd 70The sigil by which to identify this manuscript, which must conform to the
71specification for XML attribute strings (broadly speaking, it must begin
72with a letter and can have only a few sorts of punctuation characters in
73it.)
74
75=head2 identifier
76
77A freeform name by which to identify the manuscript, which may be longer
78than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79TEI msName attribute, or constructed from the settlement and idno if
80supplied.
81
82=head2 settlement
83
84The city, town, etc. where the manuscript is held. Will be read from the
85TEI msDesc element if supplied.
86
87=head2 repository
88
89The institution that holds the manuscript. Will be read from the TEI msDesc
90element if supplied.
91
92=head2 idno
93
94The identification or call number of the manuscript. Will be read from the
95TEI msDesc element if supplied.
7158714d 96
97=head2 text
98
fae52efd 99An array of strings (words) that contains the text of the
100manuscript. This should not change after the witness has been
101instantiated, and the path through the collation should always match it.
102
103=head2 layertext
7158714d 104
fae52efd 105An array of strings (words) that contains the layered
106text, if any, of the manuscript. This should not change after the witness
107has been instantiated, and the path through the collation should always
108match it.
7158714d 109
fae52efd 110=head2 language
111
112Accessor method to get the witness language.
7158714d 113
114=head2 identifier
115
116Accessor method for the witness identifier.
117
118=head2 other_info
119
120Accessor method for the general witness description.
121
06e7cbc7 122=head2 has_source
123
124Boolean method that returns a true value if the witness was created with a
125data source (that is, a file, string, or object to be parsed).
126
1f7aa795 127=head2 is_layered
7158714d 128
1f7aa795 129Boolean method to note whether the witness has layers (e.g. pre-correction
130readings) in the collation.
7158714d 131
132=begin testing
133
fae52efd 134use Text::Tradition;
135my $trad = Text::Tradition->new( 'name' => 'test tradition' );
136my $c = $trad->collation;
7158714d 137
fae52efd 138# Test a plaintext witness via string
139my $str = 'This is a line of text';
140my $ptwit = $trad->add_witness(
7158714d 141 'sigil' => 'A',
fae52efd 142 'sourcetype' => 'plaintext',
143 'string' => $str
7158714d 144 );
fae52efd 145is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
146if( $ptwit ) {
147 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
148 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 149}
150
65ed66b9 151# Test some JSON witnesses via object
152open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
153binmode( JSIN, ':encoding(UTF-8)' );
154my @lines = <JSIN>;
155close JSIN;
156$trad->add_json_witnesses( join( '', @lines ) );
157is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
158 "Found first JSON witness" );
159is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
160 "Found second JSON witness" );
161
b39fb0b3 162# Test an XML witness via file
163my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
164 'file' => 't/data/witnesses/teiwit.xml' );
165is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
166if( $xmlwit ) {
167 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
168 ok( $xmlwit->is_layered, "Picked up correction layer" );
169 is( @{$xmlwit->text}, 182, "Got correct text length" );
170 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
171}
172my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
173is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 174
175## Test use_text
b39fb0b3 176my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
177 'file' => 't/data/witnesses/group.xml',
178 'use_text' => '//tei:group/tei:text[2]' );
179is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
180if( $xpwit ) {
181 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
182 ok( !$xpwit->is_layered, "Picked up no correction layer" );
183 is( @{$xpwit->text}, 157, "Got correct text length" );
184}
185
fae52efd 186
7158714d 187=end testing
188
189=cut
190
fae52efd 191subtype 'SourceType',
192 as 'Str',
193 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
194 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
195
196subtype 'Sigil',
197 as 'Str',
198 where { $_ =~ /\A$xml10_name_rx\z/ },
199 message { 'Sigil must be a valid XML attribute string' };
200
201no Moose::Util::TypeConstraints;
202
203has 'tradition' => (
204 'is' => 'ro',
205 'isa' => 'Text::Tradition',
206 'required' => 1,
207 );
208
209# Sigil. Required identifier for a witness, but may be found inside
210# the XML file.
dd3b58b0 211has 'sigil' => (
7158714d 212 is => 'ro',
fae52efd 213 isa => 'Sigil',
214 predicate => 'has_sigil',
215 writer => '_set_sigil',
216 );
217
b39fb0b3 218has 'language' => (
219 is => 'ro',
220 isa => 'Str',
221 default => 'Default',
222 );
223
fae52efd 224# Other identifying information
225has 'identifier' => (
226 is => 'rw',
227 isa => 'Str',
228 );
229
230has 'settlement' => (
231 is => 'rw',
232 isa => 'Str',
233 );
234
235has 'repository' => (
236 is => 'rw',
237 isa => 'Str',
238 );
239
240has 'idno' => (
241 is => 'rw',
242 isa => 'Str',
243 );
244
b39fb0b3 245# Source. Can be XML obj, JSON data struct, or string.
246# Not used if the witness is created by parsing a collation.
fae52efd 247has 'sourcetype' => (
248 is => 'ro',
249 isa => 'SourceType',
250 required => 1,
251);
63528856 252
253has 'source' => (
254 is => 'ro',
255 isa => 'Str',
256 predicate => 'has_old_source',
257 clearer => 'clear_old_source',
258);
fae52efd 259
fae52efd 260has 'file' => (
261 is => 'ro',
262 isa => 'Str',
263 predicate => 'has_file',
264);
265
266has 'string' => (
267 is => 'ro',
268 isa => 'Str',
269 predicate => 'has_string',
270);
271
272has 'object' => ( # could be anything.
273 is => 'ro',
274 predicate => 'has_object',
275 clearer => 'clear_object',
276);
277
278# In the case of a TEI document with multiple texts, specify
279# which text is the root. Should be an XPath expression.
280has 'use_text' => (
281 is => 'ro',
7158714d 282 isa => 'Str',
fae52efd 283 );
284
7158714d 285# Text. This is an array of strings (i.e. word tokens).
d047cd52 286# TODO Think about how to handle this for the case of pre-prepared
287# collations, where the tokens are in the graph already.
dd3b58b0 288has 'text' => (
7158714d 289 is => 'rw',
290 isa => 'ArrayRef[Str]',
291 predicate => 'has_text',
292 );
b0b4421a 293
294has 'layertext' => (
295 is => 'rw',
296 isa => 'ArrayRef[Str]',
297 predicate => 'has_layertext',
298 );
fae52efd 299
1f7aa795 300# Path. This is an array of Reading nodes that can be saved during
301# initialization, but should be cleared before saving in a DB.
4a8828f0 302has 'path' => (
7158714d 303 is => 'rw',
304 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
305 predicate => 'has_path',
1f7aa795 306 clearer => 'clear_path',
7158714d 307 );
4a8828f0 308
b39fb0b3 309## TODO change the name of this
b15511bf 310has 'uncorrected_path' => (
7158714d 311 is => 'rw',
312 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 313 clearer => 'clear_uncorrected_path',
314 );
315
316has 'is_layered' => (
317 is => 'rw',
318 isa => 'Bool',
7158714d 319 );
f6066bac 320
1f7aa795 321# If we set an uncorrected path, ever, remember that we did so.
322around 'uncorrected_path' => sub {
323 my $orig = shift;
324 my $self = shift;
325
326 $self->is_layered( 1 );
327 $self->$orig( @_ );
328};
e2902068 329
784877d9 330sub BUILD {
7158714d 331 my $self = shift;
332 if( $self->has_source ) {
fae52efd 333 my $init_sub = '_init_from_' . $self->sourcetype;
334 $self->$init_sub();
335 # Remove our XML / source objects; we no longer need them.
336 $self->clear_object if $self->has_object;
337 $self->tradition->collation->make_witness_path( $self );
338 }
339 return $self;
340}
341
342sub has_source {
343 my $self = shift;
344 return $self->has_file || $self->has_string || $self->has_object;
345}
346
347sub _init_from_xmldesc {
348 my $self = shift;
349 my $xmlobj;
350 if( $self->has_object ) {
351 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
352 throw( ident => "bad source",
353 message => "Source object must be an XML::LibXML::Element (this is "
354 . ref( $self->object ) . ");" );
355 }
356 $xmlobj = $self->object;
357 } else {
358 my $parser = XML::LibXML->new();
359 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
360 try {
361 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
362 } catch( XML::LibXML::Error $e ) {
363 throw( ident => "bad source",
364 message => "XML parsing error: " . $e->as_string );
365 }
366 }
367
368 unless( $xmlobj->nodeName eq 'TEI' ) {
369 throw( ident => "bad source",
370 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
371 }
372
373 # Set up the tags we need, with or without namespaces.
374 map { $tags{$_} = "//$_" }
375 qw/ msDesc msName settlement repository idno p lg w seg add del /;
376 # Set up our XPath object
377 my $xpc = _xpc_for_el( $xmlobj );
378 # Use namespace-aware tags if we have to
379 if( $xmlobj->namespaceURI ) {
380 map { $tags{$_} = "//tei:$_" } keys %tags;
381 }
382
383 # Get the identifier
384 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
385 my $descnode = $desc->get_node(1);
fae52efd 386 # First try to use settlement/repository/idno.
387 my( $setNode, $reposNode, $idNode ) =
388 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
389 $xpc->find( $tags{repository}, $descnode )->get_node(1),
390 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
391 $self->settlement( $setNode ? $setNode->textContent : '' );
392 $self->repository( $reposNode ? $reposNode->textContent : '' );
393 $self->idno( $idNode ? $idNode->textContent : '' );
394 if( $self->settlement && $self->idno ) {
395 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
396 } else {
397 # Look for an msName.
398 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
399 if( $msNameNode ) {
400 $self->identifier( $msNameNode->textContent );
401 } else {
402 # We have an msDesc but who knows what is in it?
403 my $desc = $descnode->textContent;
404 $desc =~ s/\n/ /gs;
405 $desc =~ s/\s+/ /g;
406 $self->identifier( $desc );
407 }
408 }
409 if( $descnode->hasAttribute('xml:id') ) {
410 $self->_set_sigil( $descnode->getAttribute('xml:id') );
411 } elsif( !$self->has_sigil ) {
65ed66b9 412 throw( ident => 'missing sigil',
413 message => 'Could not find xml:id witness sigil' );
fae52efd 414 }
415 } else {
416 throw( ident => "bad source",
417 message => "Could not find manuscript description element in TEI header" );
418 }
419
420 # Now get the words out.
421 my @words;
422 my @layerwords; # if the witness has layers
423 # First, make sure all the words are wrapped in tags.
424 # TODO Make this not necessarily dependent upon whitespace...
425 word_tag_wrap( $xmlobj );
426 # Now go text hunting.
427 my @textnodes;
428 if( $self->use_text ) {
429 @textnodes = $xpc->findnodes( $self->use_text );
430 } else {
431 # Use the first 'text' node in the document.
432 @textnodes = $xmlobj->getElementsByTagName( 'text' );
433 }
434 my $teitext = $textnodes[0];
435 if( $teitext ) {
436 _tokenize_text( $self, $teitext, \@words, \@layerwords );
437 } else {
438 throw( ident => "bad source",
439 message => "No text element in document '" . $self->{'identifier'} . "!" );
440 }
441
b39fb0b3 442 my @text = map { $_->text } @words;
443 my @layertext = map { $_->text } @layerwords;
fae52efd 444 $self->path( \@words );
b39fb0b3 445 $self->text( \@text );
446 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 447 $self->uncorrected_path( \@layerwords );
b39fb0b3 448 $self->layertext( \@layertext );
fae52efd 449 }
fae52efd 450}
451
452sub _tokenize_text {
453 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
454 # Strip out the words.
455 my $xpc = _xpc_for_el( $teitext );
456 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
457 foreach( @divs ) {
458 my $place_str;
459 if( my $n = $_->getAttribute( 'n' ) ) {
460 $place_str = '#DIV_' . $n . '#';
461 } else {
462 $place_str = '#DIV#';
463 }
464 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
465 } # foreach <div/>
466
467 # But maybe we don't have any divs. Just paragraphs.
468 unless( @divs ) {
469 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
470 }
471}
472
473sub _objectify_words {
474 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
475
476 my $xpc = _xpc_for_el( $element );
477 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
478 my @pgraphs = $xpc->findnodes( $xpexpr );
479 return () unless @pgraphs;
480 # Set up an expression to look for words and segs
481 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
482 foreach my $pg ( @pgraphs ) {
483 # If this paragraph is the descendant of a note element,
484 # skip it.
485 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
486 next if scalar @noop_container;
487 # Get the text of each node
488 my $first_word = 1;
489 # Hunt down each wrapped word/seg, and make an object (or two objects)
490 # of it, if necessary.
491 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 492 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 493# try {
494# ( $text, $uncorr ) = _get_word_object( $c );
495# } catch( Text::Tradition::Error $e
496# where { $_->has_tag( 'lb' ) } ) {
497# next;
498# }
499 unless( defined $text || defined $uncorr ) {
500 print STDERR "WARNING: no text in node " . $c->nodeName
501 . "\n" unless $c->nodeName eq 'lb';
502 next;
503 }
504 print STDERR "DEBUG: space found in element node "
505 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
506
507 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
508 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
509 $ctr++;
510 }
511 my $id = $self->sigil . 'r' . $ctr;
512 my( $word, $acword );
513 if( $text ) {
514 $word = $self->tradition->collation->add_reading(
515 { 'id' => $id, 'text' => $text });
516 }
517 if( $uncorr && $uncorr ne $text ) {
518 $id .= '_ac';
519 $acword = $self->tradition->collation->add_reading(
520 { 'id' => $id, 'text' => $uncorr });
521 } elsif( $uncorr ) {
522 $acword = $word;
523 }
524
525# if( $first_word ) {
526# $first_word = 0;
527# # Set the relevant sectioning markers
528# if( $divmarker ) {
529# $w->add_placeholder( $divmarker );
530# $divmarker = undef;
531# }
532# $w->add_placeholder( '#PG#' );
533# }
534 push( @$wordlist, $word ) if $word;
535 push( @$uncorrlist, $acword ) if $acword;
536 }
537 }
538}
539
540# Given a word or segment node, make a Reading object for the word
541# therein. Make two Reading objects if there is an 'uncorrected' vs.
542# 'corrected' state.
543
544sub _get_word_strings {
545 my( $node ) = @_;
546 my( $text, $uncorrtext );
547 # We can have an lb or pb in the middle of a word; if we do, the
548 # whitespace (including \n) after the break becomes insignificant
549 # and we want to nuke it.
550 my $strip_leading_space = 0;
551 my $word_excluded = 0;
552 my $xpc = _xpc_for_el( $node );
553 # TODO This does not cope with nested add/dels.
b39fb0b3 554 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
555 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 556 foreach my $c ($node->childNodes() ) {
557 if( $c->nodeName eq 'num'
558 && defined $c->getAttribute( 'value' ) ) {
559 # Push the number.
560 $text .= $c->getAttribute( 'value' ) unless @deletion;
561 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
562 # If this is just after a line/page break, return to normal behavior.
563 $strip_leading_space = 0;
564 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
565 # Set a flag that strips leading whitespace until we
566 # get to the next bit of non-whitespace.
567 $strip_leading_space = 1;
568 } elsif ( $c->nodeName eq 'fw' # for catchwords
569 || $c->nodeName eq 'sic'
570 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
571 || $c->textContent eq ''
572 || ref( $c ) eq 'XML::LibXML::Comment' ) {
573 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
574 next;
575 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 576 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 577 $text .= $use;
578 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 579 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 580 $uncorrtext .= $use;
581 } else {
b39fb0b3 582 my ( $tagtxt, $taguncorr );
fae52efd 583 if( ref( $c ) eq 'XML::LibXML::Text' ) {
584 # A text node.
585 $tagtxt = $c->textContent;
b39fb0b3 586 $taguncorr = $c->textContent;
fae52efd 587 } else {
b39fb0b3 588 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 589 }
590 if( $strip_leading_space ) {
591 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 592 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 593 # Unset the flag as soon as we see non-whitespace.
594 $strip_leading_space = 0 if $tagtxt;
595 }
596 $text .= $tagtxt;
b39fb0b3 597 $uncorrtext .= $taguncorr;
fae52efd 598 }
599 }
600 throw( ident => "text not found",
601 tags => [ $node->nodeName ],
602 message => "No text found in node " . $node->toString(0) )
603 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
604 return( $text, $uncorrtext );
605}
606
607sub _split_words {
608 my( $self, $string, $c ) = @_;
609 my @raw_words = split( /\s+/, $string );
610 my @words;
611 foreach my $w ( @raw_words ) {
612 my $id = $self->sigil . 'r'. $c++;
613 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
614 my $w_obj = $self->tradition->collation->add_reading( \%opts );
615 # Skip any words that have been canonized out of existence.
616 next if( length( $w_obj->text ) == 0 );
617 push( @words, $w_obj );
618 }
619 return @words;
620}
621
622sub _init_from_json {
623 my( $self ) = shift;
624 my $wit;
625 if( $self->has_object ) {
626 $wit = $self->object;
65ed66b9 627 } elsif( $self->has_string ) {
628 $wit = from_json( $self->string );
629 } elsif( $self->has_file ) {
630 my $ok = open( INPUT, $self->file );
631 unless( $ok ) {
632 throw( ident => "bad source",
633 message => 'Could not open ' . $self->file . ' for reading' );
634 }
635 binmode( INPUT, ':encoding(UTF-8)' );
636 my @lines = <INPUT>;
637 close INPUT;
638 $wit = from_json( join( '', @lines ) );
fae52efd 639 }
640
65ed66b9 641 if( exists $wit->{'id'} ) {
642 $self->_set_sigil( $wit->{'id'} );
643 } elsif( !$self->has_sigil ) {
644 throw( ident => 'missing sigil',
645 message => 'Could not find witness sigil (id) in JSON spec' );
646 }
fae52efd 647 $self->identifier( $wit->{'name'} );
648 my @words;
649 my @layerwords;
038d6b50 650 my( @text, @layertext );
fae52efd 651 if( exists $wit->{'content'} ) {
652 # We need to tokenize the text ourselves.
653 @words = _split_words( $self, $wit->{'content'} );
654 } elsif( exists $wit->{'tokens'} ) {
655 # We have a bunch of pretokenized words.
656 my $ctr = 0;
657 foreach my $token ( @{$wit->{'tokens'}} ) {
658 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 659 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 660 push( @words, $w_obj );
038d6b50 661 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 662 }
663 ## TODO rethink this JSOn mechanism
664 if( exists $wit->{'layertokens'} ) {
665 foreach my $token ( @{$wit->{'layertokens'}} ) {
666 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 667 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 668 push( @layerwords, $w_obj );
038d6b50 669 push( @layertext, $token->{'t'} );
7158714d 670 }
fae52efd 671 }
7158714d 672 }
038d6b50 673 $self->text( \@text );
674 $self->layertext( \@layertext ) if @layertext;
fae52efd 675 $self->path( \@words );
676 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 677}
678
fae52efd 679sub _init_from_plaintext {
680 my( $self ) = @_;
681 my $str;
682 if( $self->has_file ) {
683 my $ok = open( INPUT, $self->file );
684 unless( $ok ) {
685 throw( ident => "bad source",
686 message => 'Could not open ' . $self->file . ' for reading' );
687 }
688 binmode( INPUT, ':encoding(UTF-8)' );
689 my @lines = <INPUT>;
690 close INPUT;
691 $str = join( '', @lines );
692 } elsif( $self->has_object ) { # ...seriously?
693 $str = ${$self->object};
694 } else {
695 $str = $self->string;
696 }
697
698 # TODO allow a different word separation expression
699 my @text = split( /\s+/, $str );
700 $self->text( \@text );
701 my @words = _split_words( $self, $str );
702 $self->path( \@words );
703}
704
705sub throw {
706 Text::Tradition::Error->throw(
707 'ident' => 'Witness parsing error',
708 'message' => $_[0],
709 );
710}
711
712sub _xpc_for_el {
713 my $el = shift;
714 my $xpc = XML::LibXML::XPathContext->new( $el );
715 if( $el->namespaceURI ) {
716 $xpc->registerNs( 'tei', $el->namespaceURI );
717 }
718 return $xpc;
719}
720
f025e303 721=head2 export_as_json
722
723Exports the witness as a JSON structure, with the following keys:
724
725=over 4
726
727=item * id - The witness sigil
728
729=item * name - The witness identifier
730
731=item * tokens - An array of hashes of the form { "t":"WORD" }
732
733=back
734
735=begin testing
736
737use Text::Tradition;
fae52efd 738my $trad = Text::Tradition->new();
f025e303 739
fae52efd 740my @text = qw/ Thhis is a line of text /;
741my $wit = $trad->add_witness(
f025e303 742 'sigil' => 'A',
fae52efd 743 'string' => join( ' ', @text ),
744 'sourcetype' => 'plaintext',
f025e303 745 'identifier' => 'test witness',
746 );
747my $jsonstruct = $wit->export_as_json;
748is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
749is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
750is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
751foreach my $idx ( 0 .. $#text ) {
752 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
753}
754
755my @ctext = qw( when april with his showers sweet with fruit the drought of march
756 has pierced unto the root );
fae52efd 757$trad = Text::Tradition->new(
f025e303 758 'input' => 'CollateX',
759 'file' => 't/data/Collatex-16.xml' );
760
761$jsonstruct = $trad->witness('A')->export_as_json;
762is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
763is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
764is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
765foreach my $idx ( 0 .. $#ctext ) {
766 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
767}
768
fae52efd 769## TODO test layertext export
770
f025e303 771=end testing
772
773=cut
774
775sub export_as_json {
776 my $self = shift;
777 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 778 my $obj = {
f025e303 779 'id' => $self->sigil,
780 'tokens' => \@wordlist,
781 'name' => $self->identifier,
782 };
fae52efd 783 if( $self->is_layered ) {
784 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
785 $obj->{'layertokens'} = \@lwlist;
786 }
787 return $obj;
f025e303 788}
789
dd3b58b0 790no Moose;
791__PACKAGE__->meta->make_immutable;
7158714d 792
793=head1 BUGS / TODO
794
795=over
796
06e7cbc7 797=item * Figure out how to serialize a witness
798
fae52efd 799=item * Support encodings other than UTF-8
7158714d 800
801=back
802
803=head1 LICENSE
804
805This package is free software and is provided "as is" without express
806or implied warranty. You can redistribute it and/or modify it under
807the same terms as Perl itself.
808
809=head1 AUTHOR
810
811Tara L Andrews E<lt>aurum@cpan.orgE<gt>