remove some debugging statements
[scpubgit/stemmatology.git] / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
fae52efd 56=item * language - The name of the applicable L<Text::Tradition::Lang>
57module for language handling. Usually inherited from the language set in
58the L<Text::Tradition> object, and defaults to Default.
7158714d 59
60=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 61identifier. Taken from the msDesc element for a TEI file.
7158714d 62
63=item * other_info - A freeform string for any other description of the
fae52efd 64manuscript.
7158714d 65
66=back
67
68=head2 sigil
69
fae52efd 70The sigil by which to identify this manuscript, which must conform to the
71specification for XML attribute strings (broadly speaking, it must begin
72with a letter and can have only a few sorts of punctuation characters in
73it.)
74
75=head2 identifier
76
77A freeform name by which to identify the manuscript, which may be longer
78than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79TEI msName attribute, or constructed from the settlement and idno if
80supplied.
81
82=head2 settlement
83
84The city, town, etc. where the manuscript is held. Will be read from the
85TEI msDesc element if supplied.
86
87=head2 repository
88
89The institution that holds the manuscript. Will be read from the TEI msDesc
90element if supplied.
91
92=head2 idno
93
94The identification or call number of the manuscript. Will be read from the
95TEI msDesc element if supplied.
7158714d 96
97=head2 text
98
fae52efd 99An array of strings (words) that contains the text of the
100manuscript. This should not change after the witness has been
101instantiated, and the path through the collation should always match it.
102
103=head2 layertext
7158714d 104
fae52efd 105An array of strings (words) that contains the layered
106text, if any, of the manuscript. This should not change after the witness
107has been instantiated, and the path through the collation should always
108match it.
7158714d 109
fae52efd 110=head2 language
111
112Accessor method to get the witness language.
7158714d 113
114=head2 identifier
115
116Accessor method for the witness identifier.
117
118=head2 other_info
119
120Accessor method for the general witness description.
121
06e7cbc7 122=head2 has_source
123
124Boolean method that returns a true value if the witness was created with a
125data source (that is, a file, string, or object to be parsed).
126
1f7aa795 127=head2 is_layered
7158714d 128
1f7aa795 129Boolean method to note whether the witness has layers (e.g. pre-correction
130readings) in the collation.
7158714d 131
132=begin testing
133
fae52efd 134use Text::Tradition;
135my $trad = Text::Tradition->new( 'name' => 'test tradition' );
136my $c = $trad->collation;
7158714d 137
fae52efd 138# Test a plaintext witness via string
139my $str = 'This is a line of text';
140my $ptwit = $trad->add_witness(
7158714d 141 'sigil' => 'A',
fae52efd 142 'sourcetype' => 'plaintext',
143 'string' => $str
7158714d 144 );
fae52efd 145is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
146if( $ptwit ) {
147 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
148 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 149}
150
65ed66b9 151# Test some JSON witnesses via object
152open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
153binmode( JSIN, ':encoding(UTF-8)' );
154my @lines = <JSIN>;
155close JSIN;
156$trad->add_json_witnesses( join( '', @lines ) );
157is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
158 "Found first JSON witness" );
159is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
160 "Found second JSON witness" );
161
b39fb0b3 162# Test an XML witness via file
163my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
164 'file' => 't/data/witnesses/teiwit.xml' );
165is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
166if( $xmlwit ) {
167 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
168 ok( $xmlwit->is_layered, "Picked up correction layer" );
169 is( @{$xmlwit->text}, 182, "Got correct text length" );
170 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
171}
172my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
173is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 174
175## Test use_text
b39fb0b3 176my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
177 'file' => 't/data/witnesses/group.xml',
178 'use_text' => '//tei:group/tei:text[2]' );
179is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
180if( $xpwit ) {
181 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
182 ok( !$xpwit->is_layered, "Picked up no correction layer" );
183 is( @{$xpwit->text}, 157, "Got correct text length" );
184}
185
fae52efd 186
7158714d 187=end testing
188
189=cut
190
fae52efd 191subtype 'SourceType',
192 as 'Str',
193 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
194 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
195
196subtype 'Sigil',
197 as 'Str',
198 where { $_ =~ /\A$xml10_name_rx\z/ },
199 message { 'Sigil must be a valid XML attribute string' };
200
201no Moose::Util::TypeConstraints;
202
203has 'tradition' => (
204 'is' => 'ro',
205 'isa' => 'Text::Tradition',
206 'required' => 1,
207 );
208
209# Sigil. Required identifier for a witness, but may be found inside
210# the XML file.
dd3b58b0 211has 'sigil' => (
7158714d 212 is => 'ro',
fae52efd 213 isa => 'Sigil',
214 predicate => 'has_sigil',
215 writer => '_set_sigil',
216 );
217
b39fb0b3 218has 'language' => (
219 is => 'ro',
220 isa => 'Str',
221 default => 'Default',
222 );
223
fae52efd 224# Other identifying information
225has 'identifier' => (
226 is => 'rw',
227 isa => 'Str',
228 );
229
230has 'settlement' => (
231 is => 'rw',
232 isa => 'Str',
233 );
234
235has 'repository' => (
236 is => 'rw',
237 isa => 'Str',
238 );
239
240has 'idno' => (
241 is => 'rw',
242 isa => 'Str',
243 );
244
b39fb0b3 245# Source. Can be XML obj, JSON data struct, or string.
246# Not used if the witness is created by parsing a collation.
fae52efd 247has 'sourcetype' => (
248 is => 'ro',
249 isa => 'SourceType',
250 required => 1,
251);
252
fae52efd 253has 'file' => (
254 is => 'ro',
255 isa => 'Str',
256 predicate => 'has_file',
257);
258
259has 'string' => (
260 is => 'ro',
261 isa => 'Str',
262 predicate => 'has_string',
263);
264
265has 'object' => ( # could be anything.
266 is => 'ro',
267 predicate => 'has_object',
268 clearer => 'clear_object',
269);
270
271# In the case of a TEI document with multiple texts, specify
272# which text is the root. Should be an XPath expression.
273has 'use_text' => (
274 is => 'ro',
7158714d 275 isa => 'Str',
fae52efd 276 );
277
7158714d 278# Text. This is an array of strings (i.e. word tokens).
d047cd52 279# TODO Think about how to handle this for the case of pre-prepared
280# collations, where the tokens are in the graph already.
dd3b58b0 281has 'text' => (
7158714d 282 is => 'rw',
283 isa => 'ArrayRef[Str]',
284 predicate => 'has_text',
285 );
b0b4421a 286
287has 'layertext' => (
288 is => 'rw',
289 isa => 'ArrayRef[Str]',
290 predicate => 'has_layertext',
291 );
fae52efd 292
1f7aa795 293# Path. This is an array of Reading nodes that can be saved during
294# initialization, but should be cleared before saving in a DB.
4a8828f0 295has 'path' => (
7158714d 296 is => 'rw',
297 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
298 predicate => 'has_path',
1f7aa795 299 clearer => 'clear_path',
7158714d 300 );
4a8828f0 301
b39fb0b3 302## TODO change the name of this
b15511bf 303has 'uncorrected_path' => (
7158714d 304 is => 'rw',
305 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 306 clearer => 'clear_uncorrected_path',
307 );
308
309has 'is_layered' => (
310 is => 'rw',
311 isa => 'Bool',
7158714d 312 );
f6066bac 313
1f7aa795 314# If we set an uncorrected path, ever, remember that we did so.
315around 'uncorrected_path' => sub {
316 my $orig = shift;
317 my $self = shift;
318
319 $self->is_layered( 1 );
320 $self->$orig( @_ );
321};
e2902068 322
784877d9 323sub BUILD {
7158714d 324 my $self = shift;
325 if( $self->has_source ) {
fae52efd 326 my $init_sub = '_init_from_' . $self->sourcetype;
327 $self->$init_sub();
328 # Remove our XML / source objects; we no longer need them.
329 $self->clear_object if $self->has_object;
330 $self->tradition->collation->make_witness_path( $self );
331 }
332 return $self;
333}
334
335sub has_source {
336 my $self = shift;
337 return $self->has_file || $self->has_string || $self->has_object;
338}
339
340sub _init_from_xmldesc {
341 my $self = shift;
342 my $xmlobj;
343 if( $self->has_object ) {
344 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
345 throw( ident => "bad source",
346 message => "Source object must be an XML::LibXML::Element (this is "
347 . ref( $self->object ) . ");" );
348 }
349 $xmlobj = $self->object;
350 } else {
351 my $parser = XML::LibXML->new();
352 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
353 try {
354 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
355 } catch( XML::LibXML::Error $e ) {
356 throw( ident => "bad source",
357 message => "XML parsing error: " . $e->as_string );
358 }
359 }
360
361 unless( $xmlobj->nodeName eq 'TEI' ) {
362 throw( ident => "bad source",
363 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
364 }
365
366 # Set up the tags we need, with or without namespaces.
367 map { $tags{$_} = "//$_" }
368 qw/ msDesc msName settlement repository idno p lg w seg add del /;
369 # Set up our XPath object
370 my $xpc = _xpc_for_el( $xmlobj );
371 # Use namespace-aware tags if we have to
372 if( $xmlobj->namespaceURI ) {
373 map { $tags{$_} = "//tei:$_" } keys %tags;
374 }
375
376 # Get the identifier
377 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
378 my $descnode = $desc->get_node(1);
fae52efd 379 # First try to use settlement/repository/idno.
380 my( $setNode, $reposNode, $idNode ) =
381 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
382 $xpc->find( $tags{repository}, $descnode )->get_node(1),
383 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
384 $self->settlement( $setNode ? $setNode->textContent : '' );
385 $self->repository( $reposNode ? $reposNode->textContent : '' );
386 $self->idno( $idNode ? $idNode->textContent : '' );
387 if( $self->settlement && $self->idno ) {
388 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
389 } else {
390 # Look for an msName.
391 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
392 if( $msNameNode ) {
393 $self->identifier( $msNameNode->textContent );
394 } else {
395 # We have an msDesc but who knows what is in it?
396 my $desc = $descnode->textContent;
397 $desc =~ s/\n/ /gs;
398 $desc =~ s/\s+/ /g;
399 $self->identifier( $desc );
400 }
401 }
402 if( $descnode->hasAttribute('xml:id') ) {
403 $self->_set_sigil( $descnode->getAttribute('xml:id') );
404 } elsif( !$self->has_sigil ) {
65ed66b9 405 throw( ident => 'missing sigil',
406 message => 'Could not find xml:id witness sigil' );
fae52efd 407 }
408 } else {
409 throw( ident => "bad source",
410 message => "Could not find manuscript description element in TEI header" );
411 }
412
413 # Now get the words out.
414 my @words;
415 my @layerwords; # if the witness has layers
416 # First, make sure all the words are wrapped in tags.
417 # TODO Make this not necessarily dependent upon whitespace...
418 word_tag_wrap( $xmlobj );
419 # Now go text hunting.
420 my @textnodes;
421 if( $self->use_text ) {
422 @textnodes = $xpc->findnodes( $self->use_text );
423 } else {
424 # Use the first 'text' node in the document.
425 @textnodes = $xmlobj->getElementsByTagName( 'text' );
426 }
427 my $teitext = $textnodes[0];
428 if( $teitext ) {
429 _tokenize_text( $self, $teitext, \@words, \@layerwords );
430 } else {
431 throw( ident => "bad source",
432 message => "No text element in document '" . $self->{'identifier'} . "!" );
433 }
434
b39fb0b3 435 my @text = map { $_->text } @words;
436 my @layertext = map { $_->text } @layerwords;
fae52efd 437 $self->path( \@words );
b39fb0b3 438 $self->text( \@text );
439 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 440 $self->uncorrected_path( \@layerwords );
b39fb0b3 441 $self->layertext( \@layertext );
fae52efd 442 }
fae52efd 443}
444
445sub _tokenize_text {
446 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
447 # Strip out the words.
448 my $xpc = _xpc_for_el( $teitext );
449 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
450 foreach( @divs ) {
451 my $place_str;
452 if( my $n = $_->getAttribute( 'n' ) ) {
453 $place_str = '#DIV_' . $n . '#';
454 } else {
455 $place_str = '#DIV#';
456 }
457 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
458 } # foreach <div/>
459
460 # But maybe we don't have any divs. Just paragraphs.
461 unless( @divs ) {
462 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
463 }
464}
465
466sub _objectify_words {
467 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
468
469 my $xpc = _xpc_for_el( $element );
470 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
471 my @pgraphs = $xpc->findnodes( $xpexpr );
472 return () unless @pgraphs;
473 # Set up an expression to look for words and segs
474 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
475 foreach my $pg ( @pgraphs ) {
476 # If this paragraph is the descendant of a note element,
477 # skip it.
478 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
479 next if scalar @noop_container;
480 # Get the text of each node
481 my $first_word = 1;
482 # Hunt down each wrapped word/seg, and make an object (or two objects)
483 # of it, if necessary.
484 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 485 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 486# try {
487# ( $text, $uncorr ) = _get_word_object( $c );
488# } catch( Text::Tradition::Error $e
489# where { $_->has_tag( 'lb' ) } ) {
490# next;
491# }
492 unless( defined $text || defined $uncorr ) {
493 print STDERR "WARNING: no text in node " . $c->nodeName
494 . "\n" unless $c->nodeName eq 'lb';
495 next;
496 }
497 print STDERR "DEBUG: space found in element node "
498 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
499
500 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
501 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
502 $ctr++;
503 }
504 my $id = $self->sigil . 'r' . $ctr;
505 my( $word, $acword );
506 if( $text ) {
507 $word = $self->tradition->collation->add_reading(
508 { 'id' => $id, 'text' => $text });
509 }
510 if( $uncorr && $uncorr ne $text ) {
511 $id .= '_ac';
512 $acword = $self->tradition->collation->add_reading(
513 { 'id' => $id, 'text' => $uncorr });
514 } elsif( $uncorr ) {
515 $acword = $word;
516 }
517
518# if( $first_word ) {
519# $first_word = 0;
520# # Set the relevant sectioning markers
521# if( $divmarker ) {
522# $w->add_placeholder( $divmarker );
523# $divmarker = undef;
524# }
525# $w->add_placeholder( '#PG#' );
526# }
527 push( @$wordlist, $word ) if $word;
528 push( @$uncorrlist, $acword ) if $acword;
529 }
530 }
531}
532
533# Given a word or segment node, make a Reading object for the word
534# therein. Make two Reading objects if there is an 'uncorrected' vs.
535# 'corrected' state.
536
537sub _get_word_strings {
538 my( $node ) = @_;
539 my( $text, $uncorrtext );
540 # We can have an lb or pb in the middle of a word; if we do, the
541 # whitespace (including \n) after the break becomes insignificant
542 # and we want to nuke it.
543 my $strip_leading_space = 0;
544 my $word_excluded = 0;
545 my $xpc = _xpc_for_el( $node );
546 # TODO This does not cope with nested add/dels.
b39fb0b3 547 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
548 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 549 foreach my $c ($node->childNodes() ) {
550 if( $c->nodeName eq 'num'
551 && defined $c->getAttribute( 'value' ) ) {
552 # Push the number.
553 $text .= $c->getAttribute( 'value' ) unless @deletion;
554 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
555 # If this is just after a line/page break, return to normal behavior.
556 $strip_leading_space = 0;
557 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
558 # Set a flag that strips leading whitespace until we
559 # get to the next bit of non-whitespace.
560 $strip_leading_space = 1;
561 } elsif ( $c->nodeName eq 'fw' # for catchwords
562 || $c->nodeName eq 'sic'
563 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
564 || $c->textContent eq ''
565 || ref( $c ) eq 'XML::LibXML::Comment' ) {
566 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
567 next;
568 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 569 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 570 $text .= $use;
571 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 572 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 573 $uncorrtext .= $use;
574 } else {
b39fb0b3 575 my ( $tagtxt, $taguncorr );
fae52efd 576 if( ref( $c ) eq 'XML::LibXML::Text' ) {
577 # A text node.
578 $tagtxt = $c->textContent;
b39fb0b3 579 $taguncorr = $c->textContent;
fae52efd 580 } else {
b39fb0b3 581 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 582 }
583 if( $strip_leading_space ) {
584 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 585 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 586 # Unset the flag as soon as we see non-whitespace.
587 $strip_leading_space = 0 if $tagtxt;
588 }
589 $text .= $tagtxt;
b39fb0b3 590 $uncorrtext .= $taguncorr;
fae52efd 591 }
592 }
593 throw( ident => "text not found",
594 tags => [ $node->nodeName ],
595 message => "No text found in node " . $node->toString(0) )
596 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
597 return( $text, $uncorrtext );
598}
599
600sub _split_words {
601 my( $self, $string, $c ) = @_;
602 my @raw_words = split( /\s+/, $string );
603 my @words;
604 foreach my $w ( @raw_words ) {
605 my $id = $self->sigil . 'r'. $c++;
606 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
607 my $w_obj = $self->tradition->collation->add_reading( \%opts );
608 # Skip any words that have been canonized out of existence.
609 next if( length( $w_obj->text ) == 0 );
610 push( @words, $w_obj );
611 }
612 return @words;
613}
614
615sub _init_from_json {
616 my( $self ) = shift;
617 my $wit;
618 if( $self->has_object ) {
619 $wit = $self->object;
65ed66b9 620 } elsif( $self->has_string ) {
621 $wit = from_json( $self->string );
622 } elsif( $self->has_file ) {
623 my $ok = open( INPUT, $self->file );
624 unless( $ok ) {
625 throw( ident => "bad source",
626 message => 'Could not open ' . $self->file . ' for reading' );
627 }
628 binmode( INPUT, ':encoding(UTF-8)' );
629 my @lines = <INPUT>;
630 close INPUT;
631 $wit = from_json( join( '', @lines ) );
fae52efd 632 }
633
65ed66b9 634 if( exists $wit->{'id'} ) {
635 $self->_set_sigil( $wit->{'id'} );
636 } elsif( !$self->has_sigil ) {
637 throw( ident => 'missing sigil',
638 message => 'Could not find witness sigil (id) in JSON spec' );
639 }
fae52efd 640 $self->identifier( $wit->{'name'} );
641 my @words;
642 my @layerwords;
038d6b50 643 my( @text, @layertext );
fae52efd 644 if( exists $wit->{'content'} ) {
645 # We need to tokenize the text ourselves.
646 @words = _split_words( $self, $wit->{'content'} );
647 } elsif( exists $wit->{'tokens'} ) {
648 # We have a bunch of pretokenized words.
649 my $ctr = 0;
650 foreach my $token ( @{$wit->{'tokens'}} ) {
651 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 652 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 653 push( @words, $w_obj );
038d6b50 654 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 655 }
656 ## TODO rethink this JSOn mechanism
657 if( exists $wit->{'layertokens'} ) {
658 foreach my $token ( @{$wit->{'layertokens'}} ) {
659 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 660 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 661 push( @layerwords, $w_obj );
038d6b50 662 push( @layertext, $token->{'t'} );
7158714d 663 }
fae52efd 664 }
7158714d 665 }
038d6b50 666 $self->text( \@text );
667 $self->layertext( \@layertext ) if @layertext;
fae52efd 668 $self->path( \@words );
669 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 670}
671
fae52efd 672sub _init_from_plaintext {
673 my( $self ) = @_;
674 my $str;
675 if( $self->has_file ) {
676 my $ok = open( INPUT, $self->file );
677 unless( $ok ) {
678 throw( ident => "bad source",
679 message => 'Could not open ' . $self->file . ' for reading' );
680 }
681 binmode( INPUT, ':encoding(UTF-8)' );
682 my @lines = <INPUT>;
683 close INPUT;
684 $str = join( '', @lines );
685 } elsif( $self->has_object ) { # ...seriously?
686 $str = ${$self->object};
687 } else {
688 $str = $self->string;
689 }
690
691 # TODO allow a different word separation expression
692 my @text = split( /\s+/, $str );
693 $self->text( \@text );
694 my @words = _split_words( $self, $str );
695 $self->path( \@words );
696}
697
698sub throw {
699 Text::Tradition::Error->throw(
700 'ident' => 'Witness parsing error',
701 'message' => $_[0],
702 );
703}
704
705sub _xpc_for_el {
706 my $el = shift;
707 my $xpc = XML::LibXML::XPathContext->new( $el );
708 if( $el->namespaceURI ) {
709 $xpc->registerNs( 'tei', $el->namespaceURI );
710 }
711 return $xpc;
712}
713
f025e303 714=head2 export_as_json
715
716Exports the witness as a JSON structure, with the following keys:
717
718=over 4
719
720=item * id - The witness sigil
721
722=item * name - The witness identifier
723
724=item * tokens - An array of hashes of the form { "t":"WORD" }
725
726=back
727
728=begin testing
729
730use Text::Tradition;
fae52efd 731my $trad = Text::Tradition->new();
f025e303 732
fae52efd 733my @text = qw/ Thhis is a line of text /;
734my $wit = $trad->add_witness(
f025e303 735 'sigil' => 'A',
fae52efd 736 'string' => join( ' ', @text ),
737 'sourcetype' => 'plaintext',
f025e303 738 'identifier' => 'test witness',
739 );
740my $jsonstruct = $wit->export_as_json;
741is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
742is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
743is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
744foreach my $idx ( 0 .. $#text ) {
745 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
746}
747
748my @ctext = qw( when april with his showers sweet with fruit the drought of march
749 has pierced unto the root );
fae52efd 750$trad = Text::Tradition->new(
f025e303 751 'input' => 'CollateX',
752 'file' => 't/data/Collatex-16.xml' );
753
754$jsonstruct = $trad->witness('A')->export_as_json;
755is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
756is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
757is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
758foreach my $idx ( 0 .. $#ctext ) {
759 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
760}
761
fae52efd 762## TODO test layertext export
763
f025e303 764=end testing
765
766=cut
767
768sub export_as_json {
769 my $self = shift;
770 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 771 my $obj = {
f025e303 772 'id' => $self->sigil,
773 'tokens' => \@wordlist,
774 'name' => $self->identifier,
775 };
fae52efd 776 if( $self->is_layered ) {
777 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
778 $obj->{'layertokens'} = \@lwlist;
779 }
780 return $obj;
f025e303 781}
782
dd3b58b0 783no Moose;
784__PACKAGE__->meta->make_immutable;
7158714d 785
786=head1 BUGS / TODO
787
788=over
789
06e7cbc7 790=item * Figure out how to serialize a witness
791
fae52efd 792=item * Support encodings other than UTF-8
7158714d 793
794=back
795
796=head1 LICENSE
797
798This package is free software and is provided "as is" without express
799or implied warranty. You can redistribute it and/or modify it under
800the same terms as Perl itself.
801
802=head1 AUTHOR
803
804Tara L Andrews E<lt>aurum@cpan.orgE<gt>