make XML witness parsing work
[scpubgit/stemmatology.git] / lib / Text / Tradition / Witness.pm
CommitLineData
dd3b58b0 1package Text::Tradition::Witness;
fae52efd 2
3use vars qw( %tags );
4use JSON;
dd3b58b0 5use Moose;
7854e12e 6use Moose::Util::TypeConstraints;
fae52efd 7use Text::TEI::Markup qw( word_tag_wrap );
8use TryCatch;
9use XML::Easy::Syntax qw( $xml10_name_rx );
dd3b58b0 10
7158714d 11=head1 NAME
12
13Text::Tradition::Witness - a manuscript witness to a text tradition
14
15=head1 SYNOPSIS
16
17 use Text::Tradition::Witness;
18 my $w = Text::Tradition::Witness->new(
19 'sigil' => 'A',
20 'identifier' => 'Oxford MS Ex.1932',
21 );
22
23=head1 DESCRIPTION
24
25Text::Tradition::Witness is an object representation of a manuscript
26witness to a text tradition. A manuscript has a sigil (a short code that
27represents it in the wider tradition), an identifier (e.g. the library ID),
28and probably a text.
29
30=head1 METHODS
31
32=head2 new
33
34Create a new witness. Options include:
35
36=over
37
38=item * sigil - A short code to represent the manuscript. Required.
39
fae52efd 40=item * sourcetype - What sort of witness data this is. Options are
41'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be
42used by Collation parsers.)
861c3e27 43
fae52efd 44=item * file
45=item * string
46=item * object
47
48The data source for the witness. Use the appropriate option.
49
50=item * use_text - An initialization option. If the witness is read from a
51TEI document and more than one <text/> tag exists therein, the default
52behavior is to use the first defined text. If this is not desired,
53use_text should be set to an XPath expression that will select the correct
54text.
7158714d 55
fae52efd 56=item * language - The name of the applicable L<Text::Tradition::Lang>
57module for language handling. Usually inherited from the language set in
58the L<Text::Tradition> object, and defaults to Default.
7158714d 59
60=item * identifier - The recognized name of the manuscript, e.g. a library
fae52efd 61identifier. Taken from the msDesc element for a TEI file.
7158714d 62
63=item * other_info - A freeform string for any other description of the
fae52efd 64manuscript.
7158714d 65
66=back
67
68=head2 sigil
69
fae52efd 70The sigil by which to identify this manuscript, which must conform to the
71specification for XML attribute strings (broadly speaking, it must begin
72with a letter and can have only a few sorts of punctuation characters in
73it.)
74
75=head2 identifier
76
77A freeform name by which to identify the manuscript, which may be longer
78than the sigil. Defaults to 'Unidentified ms', but will be taken from the
79TEI msName attribute, or constructed from the settlement and idno if
80supplied.
81
82=head2 settlement
83
84The city, town, etc. where the manuscript is held. Will be read from the
85TEI msDesc element if supplied.
86
87=head2 repository
88
89The institution that holds the manuscript. Will be read from the TEI msDesc
90element if supplied.
91
92=head2 idno
93
94The identification or call number of the manuscript. Will be read from the
95TEI msDesc element if supplied.
7158714d 96
97=head2 text
98
fae52efd 99An array of strings (words) that contains the text of the
100manuscript. This should not change after the witness has been
101instantiated, and the path through the collation should always match it.
102
103=head2 layertext
7158714d 104
fae52efd 105An array of strings (words) that contains the layered
106text, if any, of the manuscript. This should not change after the witness
107has been instantiated, and the path through the collation should always
108match it.
7158714d 109
fae52efd 110=head2 language
111
112Accessor method to get the witness language.
7158714d 113
114=head2 identifier
115
116Accessor method for the witness identifier.
117
118=head2 other_info
119
120Accessor method for the general witness description.
121
1f7aa795 122=head2 is_layered
7158714d 123
1f7aa795 124Boolean method to note whether the witness has layers (e.g. pre-correction
125readings) in the collation.
7158714d 126
127=begin testing
128
fae52efd 129use Text::Tradition;
130my $trad = Text::Tradition->new( 'name' => 'test tradition' );
131my $c = $trad->collation;
7158714d 132
fae52efd 133# Test a plaintext witness via string
134my $str = 'This is a line of text';
135my $ptwit = $trad->add_witness(
7158714d 136 'sigil' => 'A',
fae52efd 137 'sourcetype' => 'plaintext',
138 'string' => $str
7158714d 139 );
fae52efd 140is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' );
141if( $ptwit ) {
142 is( $ptwit->sigil, 'A', "Witness has correct sigil" );
143 is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" );
7158714d 144}
145
65ed66b9 146# Test some JSON witnesses via object
147open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input";
148binmode( JSIN, ':encoding(UTF-8)' );
149my @lines = <JSIN>;
150close JSIN;
151$trad->add_json_witnesses( join( '', @lines ) );
152is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness',
153 "Found first JSON witness" );
154is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness',
155 "Found second JSON witness" );
156
b39fb0b3 157# Test an XML witness via file
158my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
159 'file' => 't/data/witnesses/teiwit.xml' );
160is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" );
161if( $xmlwit ) {
162 is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" );
163 ok( $xmlwit->is_layered, "Picked up correction layer" );
164 is( @{$xmlwit->text}, 182, "Got correct text length" );
165 is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" );
166}
167my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings;
168is( @allwitwords, 184, "Reused appropriate readings" );
fae52efd 169
170## Test use_text
b39fb0b3 171my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc',
172 'file' => 't/data/witnesses/group.xml',
173 'use_text' => '//tei:group/tei:text[2]' );
174is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" );
175if( $xpwit ) {
176 is( $xpwit->sigil, 'G', "XML part witness has correct sigil" );
177 ok( !$xpwit->is_layered, "Picked up no correction layer" );
178 is( @{$xpwit->text}, 157, "Got correct text length" );
179}
180
fae52efd 181
7158714d 182=end testing
183
184=cut
185
fae52efd 186subtype 'SourceType',
187 as 'Str',
188 where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ },
189 message { 'Source type must be one of xmldesc, plaintext, json, collation' };
190
191subtype 'Sigil',
192 as 'Str',
193 where { $_ =~ /\A$xml10_name_rx\z/ },
194 message { 'Sigil must be a valid XML attribute string' };
195
196no Moose::Util::TypeConstraints;
197
198has 'tradition' => (
199 'is' => 'ro',
200 'isa' => 'Text::Tradition',
201 'required' => 1,
202 );
203
204# Sigil. Required identifier for a witness, but may be found inside
205# the XML file.
dd3b58b0 206has 'sigil' => (
7158714d 207 is => 'ro',
fae52efd 208 isa => 'Sigil',
209 predicate => 'has_sigil',
210 writer => '_set_sigil',
211 );
212
b39fb0b3 213has 'language' => (
214 is => 'ro',
215 isa => 'Str',
216 default => 'Default',
217 );
218
fae52efd 219# Other identifying information
220has 'identifier' => (
221 is => 'rw',
222 isa => 'Str',
223 );
224
225has 'settlement' => (
226 is => 'rw',
227 isa => 'Str',
228 );
229
230has 'repository' => (
231 is => 'rw',
232 isa => 'Str',
233 );
234
235has 'idno' => (
236 is => 'rw',
237 isa => 'Str',
238 );
239
b39fb0b3 240# Source. Can be XML obj, JSON data struct, or string.
241# Not used if the witness is created by parsing a collation.
fae52efd 242has 'sourcetype' => (
243 is => 'ro',
244 isa => 'SourceType',
245 required => 1,
246);
247
fae52efd 248has 'file' => (
249 is => 'ro',
250 isa => 'Str',
251 predicate => 'has_file',
252);
253
254has 'string' => (
255 is => 'ro',
256 isa => 'Str',
257 predicate => 'has_string',
258);
259
260has 'object' => ( # could be anything.
261 is => 'ro',
262 predicate => 'has_object',
263 clearer => 'clear_object',
264);
265
266# In the case of a TEI document with multiple texts, specify
267# which text is the root. Should be an XPath expression.
268has 'use_text' => (
269 is => 'ro',
7158714d 270 isa => 'Str',
fae52efd 271 );
272
7158714d 273# Text. This is an array of strings (i.e. word tokens).
d047cd52 274# TODO Think about how to handle this for the case of pre-prepared
275# collations, where the tokens are in the graph already.
dd3b58b0 276has 'text' => (
7158714d 277 is => 'rw',
278 isa => 'ArrayRef[Str]',
279 predicate => 'has_text',
280 );
b0b4421a 281
282has 'layertext' => (
283 is => 'rw',
284 isa => 'ArrayRef[Str]',
285 predicate => 'has_layertext',
286 );
fae52efd 287
1f7aa795 288# Path. This is an array of Reading nodes that can be saved during
289# initialization, but should be cleared before saving in a DB.
4a8828f0 290has 'path' => (
7158714d 291 is => 'rw',
292 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
293 predicate => 'has_path',
1f7aa795 294 clearer => 'clear_path',
7158714d 295 );
4a8828f0 296
b39fb0b3 297## TODO change the name of this
b15511bf 298has 'uncorrected_path' => (
7158714d 299 is => 'rw',
300 isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
1f7aa795 301 clearer => 'clear_uncorrected_path',
302 );
303
304has 'is_layered' => (
305 is => 'rw',
306 isa => 'Bool',
7158714d 307 );
f6066bac 308
1f7aa795 309# If we set an uncorrected path, ever, remember that we did so.
310around 'uncorrected_path' => sub {
311 my $orig = shift;
312 my $self = shift;
313
314 $self->is_layered( 1 );
315 $self->$orig( @_ );
316};
e2902068 317
784877d9 318sub BUILD {
7158714d 319 my $self = shift;
320 if( $self->has_source ) {
fae52efd 321 my $init_sub = '_init_from_' . $self->sourcetype;
322 $self->$init_sub();
323 # Remove our XML / source objects; we no longer need them.
324 $self->clear_object if $self->has_object;
325 $self->tradition->collation->make_witness_path( $self );
326 }
327 return $self;
328}
329
330sub has_source {
331 my $self = shift;
332 return $self->has_file || $self->has_string || $self->has_object;
333}
334
335sub _init_from_xmldesc {
336 my $self = shift;
337 my $xmlobj;
338 if( $self->has_object ) {
339 unless( ref( $self->object ) eq 'XML::LibXML::Element' ) {
340 throw( ident => "bad source",
341 message => "Source object must be an XML::LibXML::Element (this is "
342 . ref( $self->object ) . ");" );
343 }
344 $xmlobj = $self->object;
345 } else {
346 my $parser = XML::LibXML->new();
347 my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
348 try {
349 $xmlobj = $parser->$parsersub( $self->file )->documentElement;
350 } catch( XML::LibXML::Error $e ) {
351 throw( ident => "bad source",
352 message => "XML parsing error: " . $e->as_string );
353 }
354 }
355
356 unless( $xmlobj->nodeName eq 'TEI' ) {
357 throw( ident => "bad source",
358 message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" );
359 }
360
361 # Set up the tags we need, with or without namespaces.
362 map { $tags{$_} = "//$_" }
363 qw/ msDesc msName settlement repository idno p lg w seg add del /;
364 # Set up our XPath object
365 my $xpc = _xpc_for_el( $xmlobj );
366 # Use namespace-aware tags if we have to
367 if( $xmlobj->namespaceURI ) {
368 map { $tags{$_} = "//tei:$_" } keys %tags;
369 }
370
371 # Get the identifier
372 if( my $desc = $xpc->find( $tags{msDesc} ) ) {
373 my $descnode = $desc->get_node(1);
fae52efd 374 # First try to use settlement/repository/idno.
375 my( $setNode, $reposNode, $idNode ) =
376 ( $xpc->find( $tags{settlement}, $descnode )->get_node(1),
377 $xpc->find( $tags{repository}, $descnode )->get_node(1),
378 $xpc->find( $tags{idno}, $descnode )->get_node(1) );
379 $self->settlement( $setNode ? $setNode->textContent : '' );
380 $self->repository( $reposNode ? $reposNode->textContent : '' );
381 $self->idno( $idNode ? $idNode->textContent : '' );
382 if( $self->settlement && $self->idno ) {
383 $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) );
384 } else {
385 # Look for an msName.
386 my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1);
387 if( $msNameNode ) {
388 $self->identifier( $msNameNode->textContent );
389 } else {
390 # We have an msDesc but who knows what is in it?
391 my $desc = $descnode->textContent;
392 $desc =~ s/\n/ /gs;
393 $desc =~ s/\s+/ /g;
394 $self->identifier( $desc );
395 }
396 }
397 if( $descnode->hasAttribute('xml:id') ) {
398 $self->_set_sigil( $descnode->getAttribute('xml:id') );
399 } elsif( !$self->has_sigil ) {
65ed66b9 400 throw( ident => 'missing sigil',
401 message => 'Could not find xml:id witness sigil' );
fae52efd 402 }
403 } else {
404 throw( ident => "bad source",
405 message => "Could not find manuscript description element in TEI header" );
406 }
407
408 # Now get the words out.
409 my @words;
410 my @layerwords; # if the witness has layers
411 # First, make sure all the words are wrapped in tags.
412 # TODO Make this not necessarily dependent upon whitespace...
413 word_tag_wrap( $xmlobj );
414 # Now go text hunting.
415 my @textnodes;
416 if( $self->use_text ) {
417 @textnodes = $xpc->findnodes( $self->use_text );
418 } else {
419 # Use the first 'text' node in the document.
420 @textnodes = $xmlobj->getElementsByTagName( 'text' );
421 }
422 my $teitext = $textnodes[0];
423 if( $teitext ) {
424 _tokenize_text( $self, $teitext, \@words, \@layerwords );
425 } else {
426 throw( ident => "bad source",
427 message => "No text element in document '" . $self->{'identifier'} . "!" );
428 }
429
b39fb0b3 430 my @text = map { $_->text } @words;
431 my @layertext = map { $_->text } @layerwords;
fae52efd 432 $self->path( \@words );
b39fb0b3 433 $self->text( \@text );
434 if( join( ' ', @text ) ne join( ' ', @layertext ) ) {
fae52efd 435 $self->uncorrected_path( \@layerwords );
b39fb0b3 436 $self->layertext( \@layertext );
fae52efd 437 }
fae52efd 438}
439
440sub _tokenize_text {
441 my( $self, $teitext, $wordlist, $uncorrlist ) = @_;
442 # Strip out the words.
443 my $xpc = _xpc_for_el( $teitext );
444 my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' );
445 foreach( @divs ) {
446 my $place_str;
447 if( my $n = $_->getAttribute( 'n' ) ) {
448 $place_str = '#DIV_' . $n . '#';
449 } else {
450 $place_str = '#DIV#';
451 }
452 $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str );
453 } # foreach <div/>
454
455 # But maybe we don't have any divs. Just paragraphs.
456 unless( @divs ) {
457 $self->_objectify_words( $teitext, $wordlist, $uncorrlist );
458 }
459}
460
461sub _objectify_words {
462 my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_;
463
464 my $xpc = _xpc_for_el( $element );
465 my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg};
466 my @pgraphs = $xpc->findnodes( $xpexpr );
467 return () unless @pgraphs;
468 # Set up an expression to look for words and segs
469 $xpexpr = '.' . $tags{w} . '|.' . $tags{seg};
470 foreach my $pg ( @pgraphs ) {
471 # If this paragraph is the descendant of a note element,
472 # skip it.
473 my @noop_container = $xpc->findnodes( 'ancestor::note', $pg );
474 next if scalar @noop_container;
475 # Get the text of each node
476 my $first_word = 1;
477 # Hunt down each wrapped word/seg, and make an object (or two objects)
478 # of it, if necessary.
479 foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) {
b39fb0b3 480 my( $text, $uncorr ) = _get_word_strings( $c );
fae52efd 481# try {
482# ( $text, $uncorr ) = _get_word_object( $c );
483# } catch( Text::Tradition::Error $e
484# where { $_->has_tag( 'lb' ) } ) {
485# next;
486# }
487 unless( defined $text || defined $uncorr ) {
488 print STDERR "WARNING: no text in node " . $c->nodeName
489 . "\n" unless $c->nodeName eq 'lb';
490 next;
491 }
492 print STDERR "DEBUG: space found in element node "
493 . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/;
494
495 my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist;
496 while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) {
497 $ctr++;
498 }
499 my $id = $self->sigil . 'r' . $ctr;
500 my( $word, $acword );
501 if( $text ) {
502 $word = $self->tradition->collation->add_reading(
503 { 'id' => $id, 'text' => $text });
504 }
505 if( $uncorr && $uncorr ne $text ) {
506 $id .= '_ac';
507 $acword = $self->tradition->collation->add_reading(
508 { 'id' => $id, 'text' => $uncorr });
509 } elsif( $uncorr ) {
510 $acword = $word;
511 }
512
513# if( $first_word ) {
514# $first_word = 0;
515# # Set the relevant sectioning markers
516# if( $divmarker ) {
517# $w->add_placeholder( $divmarker );
518# $divmarker = undef;
519# }
520# $w->add_placeholder( '#PG#' );
521# }
522 push( @$wordlist, $word ) if $word;
523 push( @$uncorrlist, $acword ) if $acword;
524 }
525 }
526}
527
528# Given a word or segment node, make a Reading object for the word
529# therein. Make two Reading objects if there is an 'uncorrected' vs.
530# 'corrected' state.
531
532sub _get_word_strings {
533 my( $node ) = @_;
534 my( $text, $uncorrtext );
535 # We can have an lb or pb in the middle of a word; if we do, the
536 # whitespace (including \n) after the break becomes insignificant
537 # and we want to nuke it.
538 my $strip_leading_space = 0;
539 my $word_excluded = 0;
540 my $xpc = _xpc_for_el( $node );
541 # TODO This does not cope with nested add/dels.
b39fb0b3 542 my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) );
543 my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) );
fae52efd 544 foreach my $c ($node->childNodes() ) {
545 if( $c->nodeName eq 'num'
546 && defined $c->getAttribute( 'value' ) ) {
547 # Push the number.
548 $text .= $c->getAttribute( 'value' ) unless @deletion;
549 $uncorrtext .= $c->getAttribute( 'value' ) unless @addition;
550 # If this is just after a line/page break, return to normal behavior.
551 $strip_leading_space = 0;
552 } elsif ( $c->nodeName =~ /^[lp]b$/ ) {
553 # Set a flag that strips leading whitespace until we
554 # get to the next bit of non-whitespace.
555 $strip_leading_space = 1;
556 } elsif ( $c->nodeName eq 'fw' # for catchwords
557 || $c->nodeName eq 'sic'
558 || $c->nodeName eq 'note' #TODO: decide how to deal with notes
559 || $c->textContent eq ''
560 || ref( $c ) eq 'XML::LibXML::Comment' ) {
561 $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/;
562 next;
563 } elsif( $c->nodeName eq 'add' ) {
b39fb0b3 564 my( $use, $discard ) = _get_word_strings( $c );
fae52efd 565 $text .= $use;
566 } elsif( $c->nodeName eq 'del' ) {
b39fb0b3 567 my( $discard, $use ) = _get_word_strings( $c );
fae52efd 568 $uncorrtext .= $use;
569 } else {
b39fb0b3 570 my ( $tagtxt, $taguncorr );
fae52efd 571 if( ref( $c ) eq 'XML::LibXML::Text' ) {
572 # A text node.
573 $tagtxt = $c->textContent;
b39fb0b3 574 $taguncorr = $c->textContent;
fae52efd 575 } else {
b39fb0b3 576 ( $tagtxt, $taguncorr ) = _get_word_strings( $c );
fae52efd 577 }
578 if( $strip_leading_space ) {
579 $tagtxt =~ s/^[\s\n]+//s;
b39fb0b3 580 $taguncorr =~ s/^[\s\n]+//s;
fae52efd 581 # Unset the flag as soon as we see non-whitespace.
582 $strip_leading_space = 0 if $tagtxt;
583 }
584 $text .= $tagtxt;
b39fb0b3 585 $uncorrtext .= $taguncorr;
fae52efd 586 }
587 }
588 throw( ident => "text not found",
589 tags => [ $node->nodeName ],
590 message => "No text found in node " . $node->toString(0) )
591 unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/;
592 return( $text, $uncorrtext );
593}
594
595sub _split_words {
596 my( $self, $string, $c ) = @_;
597 my @raw_words = split( /\s+/, $string );
598 my @words;
599 foreach my $w ( @raw_words ) {
600 my $id = $self->sigil . 'r'. $c++;
601 my %opts = ( 'text' => $w, 'id' => $id, 'language' => $self->language );
602 my $w_obj = $self->tradition->collation->add_reading( \%opts );
603 # Skip any words that have been canonized out of existence.
604 next if( length( $w_obj->text ) == 0 );
605 push( @words, $w_obj );
606 }
607 return @words;
608}
609
610sub _init_from_json {
611 my( $self ) = shift;
612 my $wit;
613 if( $self->has_object ) {
614 $wit = $self->object;
65ed66b9 615 } elsif( $self->has_string ) {
616 $wit = from_json( $self->string );
617 } elsif( $self->has_file ) {
618 my $ok = open( INPUT, $self->file );
619 unless( $ok ) {
620 throw( ident => "bad source",
621 message => 'Could not open ' . $self->file . ' for reading' );
622 }
623 binmode( INPUT, ':encoding(UTF-8)' );
624 my @lines = <INPUT>;
625 close INPUT;
626 $wit = from_json( join( '', @lines ) );
fae52efd 627 }
628
65ed66b9 629 if( exists $wit->{'id'} ) {
630 $self->_set_sigil( $wit->{'id'} );
631 } elsif( !$self->has_sigil ) {
632 throw( ident => 'missing sigil',
633 message => 'Could not find witness sigil (id) in JSON spec' );
634 }
fae52efd 635 $self->identifier( $wit->{'name'} );
636 my @words;
637 my @layerwords;
038d6b50 638 my( @text, @layertext );
fae52efd 639 if( exists $wit->{'content'} ) {
640 # We need to tokenize the text ourselves.
641 @words = _split_words( $self, $wit->{'content'} );
642 } elsif( exists $wit->{'tokens'} ) {
643 # We have a bunch of pretokenized words.
644 my $ctr = 0;
645 foreach my $token ( @{$wit->{'tokens'}} ) {
646 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 647 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 648 push( @words, $w_obj );
038d6b50 649 push( @text, $token->{'t'} ); # TODO unless...?
fae52efd 650 }
651 ## TODO rethink this JSOn mechanism
652 if( exists $wit->{'layertokens'} ) {
653 foreach my $token ( @{$wit->{'layertokens'}} ) {
654 my $w_obj = $self->tradition->collation->add_reading({
65ed66b9 655 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ });
fae52efd 656 push( @layerwords, $w_obj );
038d6b50 657 push( @layertext, $token->{'t'} );
7158714d 658 }
fae52efd 659 }
7158714d 660 }
038d6b50 661 $self->text( \@text );
662 $self->layertext( \@layertext ) if @layertext;
fae52efd 663 $self->path( \@words );
664 $self->uncorrected_path( \@layerwords ) if @layerwords;
7158714d 665}
666
fae52efd 667sub _init_from_plaintext {
668 my( $self ) = @_;
669 my $str;
670 if( $self->has_file ) {
671 my $ok = open( INPUT, $self->file );
672 unless( $ok ) {
673 throw( ident => "bad source",
674 message => 'Could not open ' . $self->file . ' for reading' );
675 }
676 binmode( INPUT, ':encoding(UTF-8)' );
677 my @lines = <INPUT>;
678 close INPUT;
679 $str = join( '', @lines );
680 } elsif( $self->has_object ) { # ...seriously?
681 $str = ${$self->object};
682 } else {
683 $str = $self->string;
684 }
685
686 # TODO allow a different word separation expression
687 my @text = split( /\s+/, $str );
688 $self->text( \@text );
689 my @words = _split_words( $self, $str );
690 $self->path( \@words );
691}
692
693sub throw {
694 Text::Tradition::Error->throw(
695 'ident' => 'Witness parsing error',
696 'message' => $_[0],
697 );
698}
699
700sub _xpc_for_el {
701 my $el = shift;
702 my $xpc = XML::LibXML::XPathContext->new( $el );
703 if( $el->namespaceURI ) {
704 $xpc->registerNs( 'tei', $el->namespaceURI );
705 }
706 return $xpc;
707}
708
f025e303 709=head2 export_as_json
710
711Exports the witness as a JSON structure, with the following keys:
712
713=over 4
714
715=item * id - The witness sigil
716
717=item * name - The witness identifier
718
719=item * tokens - An array of hashes of the form { "t":"WORD" }
720
721=back
722
723=begin testing
724
725use Text::Tradition;
fae52efd 726my $trad = Text::Tradition->new();
f025e303 727
fae52efd 728my @text = qw/ Thhis is a line of text /;
729my $wit = $trad->add_witness(
f025e303 730 'sigil' => 'A',
fae52efd 731 'string' => join( ' ', @text ),
732 'sourcetype' => 'plaintext',
f025e303 733 'identifier' => 'test witness',
734 );
735my $jsonstruct = $wit->export_as_json;
736is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
737is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" );
738is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" );
739foreach my $idx ( 0 .. $#text ) {
740 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" );
741}
742
743my @ctext = qw( when april with his showers sweet with fruit the drought of march
744 has pierced unto the root );
fae52efd 745$trad = Text::Tradition->new(
f025e303 746 'input' => 'CollateX',
747 'file' => 't/data/Collatex-16.xml' );
748
749$jsonstruct = $trad->witness('A')->export_as_json;
750is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" );
751is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" );
752is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" );
753foreach my $idx ( 0 .. $#ctext ) {
754 is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" );
755}
756
fae52efd 757## TODO test layertext export
758
f025e303 759=end testing
760
761=cut
762
763sub export_as_json {
764 my $self = shift;
765 my @wordlist = map { { 't' => $_ || '' } } @{$self->text};
fae52efd 766 my $obj = {
f025e303 767 'id' => $self->sigil,
768 'tokens' => \@wordlist,
769 'name' => $self->identifier,
770 };
fae52efd 771 if( $self->is_layered ) {
772 my @lwlist = map { { 't' => $_ || '' } } @{$self->uncorrected};
773 $obj->{'layertokens'} = \@lwlist;
774 }
775 return $obj;
f025e303 776}
777
dd3b58b0 778no Moose;
779__PACKAGE__->meta->make_immutable;
7158714d 780
781=head1 BUGS / TODO
782
783=over
784
fae52efd 785=item * Support encodings other than UTF-8
7158714d 786
787=back
788
789=head1 LICENSE
790
791This package is free software and is provided "as is" without express
792or implied warranty. You can redistribute it and/or modify it under
793the same terms as Perl itself.
794
795=head1 AUTHOR
796
797Tara L Andrews E<lt>aurum@cpan.orgE<gt>