start to add a proper and extensible relationship typology
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
7158714d 56=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 57identifier. Taken from the msDesc element for a TEI file.
7158714d 58
59=item * other_info - A freeform string for any other description of the
fae52efd 60manuscript.
7158714d 61
62=back
63
64=head2 sigil
65
fae52efd 66The sigil by which to identify this manuscript, which must conform to the
67specification for XML attribute strings (broadly speaking, it must begin
68with a letter and can have only a few sorts of punctuation characters in
69it.)
70
71=head2 identifier
72
73A freeform name by which to identify the manuscript, which may be longer
74than the sigil. Defaults to 'Unidentified ms', but will be taken from the
75TEI msName attribute, or constructed from the settlement and idno if
76supplied.
77
78=head2 settlement
79
80The city, town, etc. where the manuscript is held. Will be read from the
81TEI msDesc element if supplied.
82
83=head2 repository
84
85The institution that holds the manuscript. Will be read from the TEI msDesc
86element if supplied.
87
88=head2 idno
89
90The identification or call number of the manuscript. Will be read from the
91TEI msDesc element if supplied.
7158714d 92
93=head2 text
94
fae52efd 95An array of strings (words) that contains the text of the
96manuscript. This should not change after the witness has been
97instantiated, and the path through the collation should always match it.
98
99=head2 layertext
7158714d 100
fae52efd 101An array of strings (words) that contains the layered
102text, if any, of the manuscript. This should not change after the witness
103has been instantiated, and the path through the collation should always
104match it.
7158714d 105
7158714d 106=head2 identifier
107
108Accessor method for the witness identifier.
109
110=head2 other_info
111
112Accessor method for the general witness description.
113
06e7cbc7 114=head2 has_source
115
116Boolean method that returns a true value if the witness was created with a
117data source (that is, a file, string, or object to be parsed).
118
1f7aa795 119=head2 is_layered
7158714d 120
1f7aa795 121Boolean method to note whether the witness has layers (e.g. pre-correction
122readings) in the collation.
7158714d 123
124=begin testing
125
fae52efd 126use Text::Tradition;
127my $trad = Text::Tradition->new( 'name' => 'test tradition' );
128my $c = $trad->collation;
7158714d 129
fae52efd 130# Test a plaintext witness via string
131my $str = 'This is a line of text';
132my $ptwit = $trad->add_witness(
7158714d 133 'sigil' => 'A',
fae52efd 134 'sourcetype' => 'plaintext',
135 'string' => $str
7158714d 136 );
fae52efd 137is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
138if( $ptwit ) {
139 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
140 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 141}
142
65ed66b9 143# Test some JSON witnesses via object
144open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
145binmode( JSIN, ':encoding(UTF-8)' );
146my @lines = <JSIN>;
147close JSIN;
148$trad->add_json_witnesses( join( '', @lines ) );
149is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
150 "Found first JSON witness" );
151is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
152 "Found second JSON witness" );
153
b39fb0b3 154# Test an XML witness via file
155my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
156 'file' => 't/data/witnesses/teiwit.xml' );
157is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
158if( $xmlwit ) {
159 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
160 ok( $xmlwit->is_layered, "Picked up correction layer" );
161 is( @{$xmlwit->text}, 182, "Got correct text length" );
162 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
163}
164my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
165is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 166
167## Test use_text
b39fb0b3 168my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
169 'file' => 't/data/witnesses/group.xml',
170 'use_text' => '//tei:group/tei:text[2]' );
171is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
172if( $xpwit ) {
173 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
174 ok( !$xpwit->is_layered, "Picked up no correction layer" );
175 is( @{$xpwit->text}, 157, "Got correct text length" );
176}
177
fae52efd 178
7158714d 179=end testing
180
181=cut
182
bf7d52b5 183# Enable plugin(s) if available
184eval { with 'Text::Tradition::WitLanguage'; };
185
fae52efd 186subtype 'SourceType',
187 as 'Str',
188 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
189 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
190
191subtype 'Sigil',
192 as 'Str',
193 where { $_ =~ /\A$xml10_name_rx\z/ },
194 message { 'Sigil must be a valid XML attribute string' };
195
196no Moose::Util::TypeConstraints;
197
198has 'tradition' => (
199 'is' => 'ro',
200 'isa' => 'Text::Tradition',
201 'required' => 1,
5c0072ef 202 weak_ref => 1
fae52efd 203 );
204
205# Sigil. Required identifier for a witness, but may be found inside
206# the XML file.
dd3b58b0 207has 'sigil' => (
7158714d 208 is => 'ro',
fae52efd 209 isa => 'Sigil',
210 predicate => 'has_sigil',
211 writer => '_set_sigil',
212 );
213
214# Other identifying information
215has 'identifier' => (
216 is => 'rw',
217 isa => 'Str',
218 );
219
220has 'settlement' => (
221 is => 'rw',
222 isa => 'Str',
223 );
224
225has 'repository' => (
226 is => 'rw',
227 isa => 'Str',
228 );
229
230has 'idno' => (
231 is => 'rw',
232 isa => 'Str',
233 );
234
b39fb0b3 235# Source. Can be XML obj, JSON data struct, or string.
236# Not used if the witness is created by parsing a collation.
fae52efd 237has 'sourcetype' => (
238 is => 'ro',
239 isa => 'SourceType',
240 required => 1,
241);
242
fae52efd 243has 'file' => (
244 is => 'ro',
245 isa => 'Str',
246 predicate => 'has_file',
247);
248
249has 'string' => (
250 is => 'ro',
251 isa => 'Str',
252 predicate => 'has_string',
253);
254
255has 'object' => ( # could be anything.
256 is => 'ro',
257 predicate => 'has_object',
258 clearer => 'clear_object',
259);
260
261# In the case of a TEI document with multiple texts, specify
262# which text is the root. Should be an XPath expression.
263has 'use_text' => (
264 is => 'ro',
7158714d 265 isa => 'Str',
fae52efd 266 );
267
7158714d 268# Text. This is an array of strings (i.e. word tokens).
d047cd52 269# TODO Think about how to handle this for the case of pre-prepared
270# collations, where the tokens are in the graph already.
dd3b58b0 271has 'text' => (
7158714d 272 is => 'rw',
273 isa => 'ArrayRef[Str]',
274 predicate => 'has_text',
275 );
b0b4421a 276
277has 'layertext' => (
278 is => 'rw',
279 isa => 'ArrayRef[Str]',
280 predicate => 'has_layertext',
281 );
fae52efd 282
4889be4f 283has 'is_collated' => (
284 is => 'rw',
285 isa => 'Bool'
286 );
287
1f7aa795 288# Path. This is an array of Reading nodes that can be saved during
289# initialization, but should be cleared before saving in a DB.
4a8828f0 290has 'path' => (
7158714d 291 is => 'rw',
292 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
293 predicate => 'has_path',
1f7aa795 294 clearer => 'clear_path',
7158714d 295 );
4a8828f0 296
b39fb0b3 297## TODO change the name of this
b15511bf 298has 'uncorrected_path' => (
7158714d 299 is => 'rw',
300 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 301 clearer => 'clear_uncorrected_path',
302 );
303
304has 'is_layered' => (
305 is => 'rw',
306 isa => 'Bool',
7158714d 307 );
f6066bac 308
1f7aa795 309# If we set an uncorrected path, ever, remember that we did so.
310around 'uncorrected_path' => sub {
311 my $orig = shift;
312 my $self = shift;
313
314 $self->is_layered( 1 );
315 $self->$orig( @_ );
316};
e2902068 317
784877d9 318sub BUILD {
7158714d 319 my $self = shift;
320 if( $self->has_source ) {
fae52efd 321 my $init_sub = '_init_from_' . $self->sourcetype;
322 $self->$init_sub();
323 # Remove our XML / source objects; we no longer need them.
324 $self->clear_object if $self->has_object;
4889be4f 325 # $self->tradition->collation->make_witness_path( $self );
326 }
327 if( $self->sourcetype eq 'collation' ) {
328 $self->is_collated( 1 );
fae52efd 329 }
330 return $self;
331}
332
333sub has_source {
334 my $self = shift;
335 return $self->has_file || $self->has_string || $self->has_object;
336}
337
338sub _init_from_xmldesc {
339 my $self = shift;
340 my $xmlobj;
341 if( $self->has_object ) {
342 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
343 throw( ident => "bad source",
344 message => "Source object must be an XML::LibXML::Element (this is "
345 . ref( $self->object ) . ");" );
346 }
347 $xmlobj = $self->object;
348 } else {
428bcf0b 349 require XML::LibXML;
fae52efd 350 my $parser = XML::LibXML->new();
351 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
352 try {
353 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
354 } catch( XML::LibXML::Error $e ) {
355 throw( ident => "bad source",
356 message => "XML parsing error: " . $e->as_string );
357 }
358 }
359
360 unless( $xmlobj->nodeName eq 'TEI' ) {
361 throw( ident => "bad source",
362 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
363 }
364
365 # Set up the tags we need, with or without namespaces.
366 map { $tags{$_} = "//$_" }
367 qw/ msDesc msName settlement repository idno p lg w seg add del /;
368 # Set up our XPath object
369 my $xpc = _xpc_for_el( $xmlobj );
370 # Use namespace-aware tags if we have to
371 if( $xmlobj->namespaceURI ) {
372 map { $tags{$_} = "//tei:$_" } keys %tags;
373 }
374
375 # Get the identifier
376 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
377 my $descnode = $desc->get_node(1);
fae52efd 378 # First try to use settlement/repository/idno.
379 my( $setNode, $reposNode, $idNode ) =
380 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
381 $xpc->find( $tags{repository}, $descnode )->get_node(1),
382 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
383 $self->settlement( $setNode ? $setNode->textContent : '' );
384 $self->repository( $reposNode ? $reposNode->textContent : '' );
385 $self->idno( $idNode ? $idNode->textContent : '' );
386 if( $self->settlement && $self->idno ) {
387 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
388 } else {
389 # Look for an msName.
390 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
391 if( $msNameNode ) {
392 $self->identifier( $msNameNode->textContent );
393 } else {
394 # We have an msDesc but who knows what is in it?
395 my $desc = $descnode->textContent;
396 $desc =~ s/\n/ /gs;
397 $desc =~ s/\s+/ /g;
398 $self->identifier( $desc );
399 }
400 }
401 if( $descnode->hasAttribute('xml:id') ) {
402 $self->_set_sigil( $descnode->getAttribute('xml:id') );
403 } elsif( !$self->has_sigil ) {
65ed66b9 404 throw( ident => 'missing sigil',
405 message => 'Could not find xml:id witness sigil' );
fae52efd 406 }
407 } else {
408 throw( ident => "bad source",
409 message => "Could not find manuscript description element in TEI header" );
410 }
411
412 # Now get the words out.
413 my @words;
414 my @layerwords; # if the witness has layers
415 # First, make sure all the words are wrapped in tags.
416 # TODO Make this not necessarily dependent upon whitespace...
417 word_tag_wrap( $xmlobj );
418 # Now go text hunting.
419 my @textnodes;
420 if( $self->use_text ) {
421 @textnodes = $xpc->findnodes( $self->use_text );
422 } else {
423 # Use the first 'text' node in the document.
424 @textnodes = $xmlobj->getElementsByTagName( 'text' );
425 }
426 my $teitext = $textnodes[0];
427 if( $teitext ) {
428 _tokenize_text( $self, $teitext, \@words, \@layerwords );
429 } else {
430 throw( ident => "bad source",
431 message => "No text element in document '" . $self->{'identifier'} . "!" );
432 }
433
b39fb0b3 434 my @text = map { $_->text } @words;
435 my @layertext = map { $_->text } @layerwords;
fae52efd 436 $self->path( \@words );
b39fb0b3 437 $self->text( \@text );
438 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 439 $self->uncorrected_path( \@layerwords );
b39fb0b3 440 $self->layertext( \@layertext );
fae52efd 441 }
fae52efd 442}
443
444sub _tokenize_text {
445 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
446 # Strip out the words.
447 my $xpc = _xpc_for_el( $teitext );
448 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
449 foreach( @divs ) {
450 my $place_str;
451 if( my $n = $_->getAttribute( 'n' ) ) {
452 $place_str = '#DIV_' . $n . '#';
453 } else {
454 $place_str = '#DIV#';
455 }
456 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
457 } # foreach <div/>
458
459 # But maybe we don't have any divs. Just paragraphs.
460 unless( @divs ) {
461 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
462 }
463}
464
465sub _objectify_words {
466 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
467
468 my $xpc = _xpc_for_el( $element );
469 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
470 my @pgraphs = $xpc->findnodes( $xpexpr );
471 return () unless @pgraphs;
472 # Set up an expression to look for words and segs
473 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
474 foreach my $pg ( @pgraphs ) {
475 # If this paragraph is the descendant of a note element,
476 # skip it.
477 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
478 next if scalar @noop_container;
479 # Get the text of each node
480 my $first_word = 1;
481 # Hunt down each wrapped word/seg, and make an object (or two objects)
482 # of it, if necessary.
483 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 484 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 485# try {
486# ( $text, $uncorr ) = _get_word_object( $c );
487# } catch( Text::Tradition::Error $e
488# where { $_->has_tag( 'lb' ) } ) {
489# next;
490# }
491 unless( defined $text || defined $uncorr ) {
492 print STDERR "WARNING: no text in node " . $c->nodeName
493 . "\n" unless $c->nodeName eq 'lb';
494 next;
495 }
496 print STDERR "DEBUG: space found in element node "
497 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
498
499 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
500 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
501 $ctr++;
502 }
503 my $id = $self->sigil . 'r' . $ctr;
504 my( $word, $acword );
505 if( $text ) {
506 $word = $self->tradition->collation->add_reading(
507 { 'id' => $id, 'text' => $text });
508 }
509 if( $uncorr && $uncorr ne $text ) {
510 $id .= '_ac';
511 $acword = $self->tradition->collation->add_reading(
512 { 'id' => $id, 'text' => $uncorr });
513 } elsif( $uncorr ) {
514 $acword = $word;
515 }
516
517# if( $first_word ) {
518# $first_word = 0;
519# # Set the relevant sectioning markers
520# if( $divmarker ) {
521# $w->add_placeholder( $divmarker );
522# $divmarker = undef;
523# }
524# $w->add_placeholder( '#PG#' );
525# }
526 push( @$wordlist, $word ) if $word;
527 push( @$uncorrlist, $acword ) if $acword;
528 }
529 }
530}
531
532# Given a word or segment node, make a Reading object for the word
533# therein. Make two Reading objects if there is an 'uncorrected' vs.
534# 'corrected' state.
535
536sub _get_word_strings {
537 my( $node ) = @_;
538 my( $text, $uncorrtext );
539 # We can have an lb or pb in the middle of a word; if we do, the
540 # whitespace (including \n) after the break becomes insignificant
541 # and we want to nuke it.
542 my $strip_leading_space = 0;
543 my $word_excluded = 0;
544 my $xpc = _xpc_for_el( $node );
545 # TODO This does not cope with nested add/dels.
b39fb0b3 546 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
547 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 548 foreach my $c ($node->childNodes() ) {
549 if( $c->nodeName eq 'num'
550 && defined $c->getAttribute( 'value' ) ) {
551 # Push the number.
552 $text .= $c->getAttribute( 'value' ) unless @deletion;
553 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
554 # If this is just after a line/page break, return to normal behavior.
555 $strip_leading_space = 0;
556 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
557 # Set a flag that strips leading whitespace until we
558 # get to the next bit of non-whitespace.
559 $strip_leading_space = 1;
560 } elsif ( $c->nodeName eq 'fw' # for catchwords
561 || $c->nodeName eq 'sic'
562 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
563 || $c->textContent eq ''
564 || ref( $c ) eq 'XML::LibXML::Comment' ) {
565 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
566 next;
567 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 568 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 569 $text .= $use;
570 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 571 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 572 $uncorrtext .= $use;
573 } else {
b39fb0b3 574 my ( $tagtxt, $taguncorr );
fae52efd 575 if( ref( $c ) eq 'XML::LibXML::Text' ) {
576 # A text node.
577 $tagtxt = $c->textContent;
b39fb0b3 578 $taguncorr = $c->textContent;
fae52efd 579 } else {
b39fb0b3 580 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 581 }
582 if( $strip_leading_space ) {
583 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 584 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 585 # Unset the flag as soon as we see non-whitespace.
586 $strip_leading_space = 0 if $tagtxt;
587 }
588 $text .= $tagtxt;
b39fb0b3 589 $uncorrtext .= $taguncorr;
fae52efd 590 }
591 }
592 throw( ident => "text not found",
593 tags => [ $node->nodeName ],
594 message => "No text found in node " . $node->toString(0) )
595 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
596 return( $text, $uncorrtext );
597}
598
599sub _split_words {
600 my( $self, $string, $c ) = @_;
601 my @raw_words = split( /\s+/, $string );
602 my @words;
603 foreach my $w ( @raw_words ) {
604 my $id = $self->sigil . 'r'. $c++;
e92d4229 605 my %opts = ( 'text' => $w, 'id' => $id );
fae52efd 606 my $w_obj = $self->tradition->collation->add_reading( \%opts );
607 # Skip any words that have been canonized out of existence.
608 next if( length( $w_obj->text ) == 0 );
609 push( @words, $w_obj );
610 }
611 return @words;
612}
613
614sub _init_from_json {
615 my( $self ) = shift;
616 my $wit;
617 if( $self->has_object ) {
618 $wit = $self->object;
65ed66b9 619 } elsif( $self->has_string ) {
620 $wit = from_json( $self->string );
621 } elsif( $self->has_file ) {
622 my $ok = open( INPUT, $self->file );
623 unless( $ok ) {
624 throw( ident => "bad source",
625 message => 'Could not open ' . $self->file . ' for reading' );
626 }
627 binmode( INPUT, ':encoding(UTF-8)' );
628 my @lines = <INPUT>;
629 close INPUT;
630 $wit = from_json( join( '', @lines ) );
fae52efd 631 }
632
65ed66b9 633 if( exists $wit->{'id'} ) {
634 $self->_set_sigil( $wit->{'id'} );
635 } elsif( !$self->has_sigil ) {
636 throw( ident => 'missing sigil',
637 message => 'Could not find witness sigil (id) in JSON spec' );
638 }
fae52efd 639 $self->identifier( $wit->{'name'} );
640 my @words;
641 my @layerwords;
038d6b50 642 my( @text, @layertext );
fae52efd 643 if( exists $wit->{'content'} ) {
644 # We need to tokenize the text ourselves.
645 @words = _split_words( $self, $wit->{'content'} );
646 } elsif( exists $wit->{'tokens'} ) {
647 # We have a bunch of pretokenized words.
648 my $ctr = 0;
649 foreach my $token ( @{$wit->{'tokens'}} ) {
650 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 651 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 652 push( @words, $w_obj );
038d6b50 653 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 654 }
655 ## TODO rethink this JSOn mechanism
656 if( exists $wit->{'layertokens'} ) {
657 foreach my $token ( @{$wit->{'layertokens'}} ) {
658 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 659 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 660 push( @layerwords, $w_obj );
038d6b50 661 push( @layertext, $token->{'t'} );
7158714d 662 }
fae52efd 663 }
7158714d 664 }
038d6b50 665 $self->text( \@text );
666 $self->layertext( \@layertext ) if @layertext;
fae52efd 667 $self->path( \@words );
668 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 669}
670
fae52efd 671sub _init_from_plaintext {
672 my( $self ) = @_;
bf7d52b5 673 unless( $self->has_sigil ) {
674 throw( "No sigil defined for the plaintext witness" );
675 }
fae52efd 676 my $str;
677 if( $self->has_file ) {
678 my $ok = open( INPUT, $self->file );
679 unless( $ok ) {
680 throw( ident => "bad source",
681 message => 'Could not open ' . $self->file . ' for reading' );
682 }
683 binmode( INPUT, ':encoding(UTF-8)' );
684 my @lines = <INPUT>;
685 close INPUT;
686 $str = join( '', @lines );
687 } elsif( $self->has_object ) { # ...seriously?
688 $str = ${$self->object};
689 } else {
690 $str = $self->string;
691 }
692
693 # TODO allow a different word separation expression
694 my @text = split( /\s+/, $str );
695 $self->text( \@text );
696 my @words = _split_words( $self, $str );
697 $self->path( \@words );
698}
699
700sub throw {
701 Text::Tradition::Error->throw(
702 'ident' => 'Witness parsing error',
703 'message' => $_[0],
704 );
705}
706
707sub _xpc_for_el {
708 my $el = shift;
709 my $xpc = XML::LibXML::XPathContext->new( $el );
710 if( $el->namespaceURI ) {
711 $xpc->registerNs( 'tei', $el->namespaceURI );
712 }
713 return $xpc;
714}
715
f025e303 716=head2 export_as_json
717
718Exports the witness as a JSON structure, with the following keys:
719
720=over 4
721
722=item * id - The witness sigil
723
724=item * name - The witness identifier
725
726=item * tokens - An array of hashes of the form { "t":"WORD" }
727
728=back
729
730=begin testing
731
732use Text::Tradition;
fae52efd 733my $trad = Text::Tradition->new();
f025e303 734
fae52efd 735my @text = qw/ Thhis is a line of text /;
736my $wit = $trad->add_witness(
f025e303 737 'sigil' => 'A',
fae52efd 738 'string' => join( ' ', @text ),
739 'sourcetype' => 'plaintext',
f025e303 740 'identifier' => 'test witness',
741 );
742my $jsonstruct = $wit->export_as_json;
743is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
744is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
745is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
746foreach my $idx ( 0 .. $#text ) {
747 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
748}
749
750my @ctext = qw( when april with his showers sweet with fruit the drought of march
751 has pierced unto the root );
fae52efd 752$trad = Text::Tradition->new(
f025e303 753 'input' => 'CollateX',
754 'file' => 't/data/Collatex-16.xml' );
755
756$jsonstruct = $trad->witness('A')->export_as_json;
757is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
758is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
759is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
760foreach my $idx ( 0 .. $#ctext ) {
761 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
762}
763
fae52efd 764## TODO test layertext export
765
f025e303 766=end testing
767
768=cut
769
770sub export_as_json {
771 my $self = shift;
772 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 773 my $obj = {
f025e303 774 'id' => $self->sigil,
775 'tokens' => \@wordlist,
776 'name' => $self->identifier,
777 };
fae52efd 778 if( $self->is_layered ) {
779 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
780 $obj->{'layertokens'} = \@lwlist;
781 }
782 return $obj;
f025e303 783}
784
dd3b58b0 785no Moose;
786__PACKAGE__->meta->make_immutable;
7158714d 787
788=head1 BUGS / TODO
789
790=over
791
06e7cbc7 792=item * Figure out how to serialize a witness
793
fae52efd 794=item * Support encodings other than UTF-8
7158714d 795
796=back
797
798=head1 LICENSE
799
800This package is free software and is provided "as is" without express
801or implied warranty. You can redistribute it and/or modify it under
802the same terms as Perl itself.
803
804=head1 AUTHOR
805
806Tara L Andrews E<lt>aurum@cpan.orgE<gt>