Commit | Line | Data |
dd3b58b0 |
1 | package Text::Tradition::Witness; |
fae52efd |
2 | |
3 | use vars qw( %tags ); |
4 | use JSON; |
dd3b58b0 |
5 | use Moose; |
7854e12e |
6 | use Moose::Util::TypeConstraints; |
fae52efd |
7 | use Text::TEI::Markup qw( word_tag_wrap ); |
8 | use TryCatch; |
9 | use XML::Easy::Syntax qw( $xml10_name_rx ); |
dd3b58b0 |
10 | |
7158714d |
11 | =head1 NAME |
12 | |
13 | Text::Tradition::Witness - a manuscript witness to a text tradition |
14 | |
15 | =head1 SYNOPSIS |
16 | |
17 | use Text::Tradition::Witness; |
18 | my $w = Text::Tradition::Witness->new( |
19 | 'sigil' => 'A', |
20 | 'identifier' => 'Oxford MS Ex.1932', |
21 | ); |
22 | |
23 | =head1 DESCRIPTION |
24 | |
25 | Text::Tradition::Witness is an object representation of a manuscript |
26 | witness to a text tradition. A manuscript has a sigil (a short code that |
27 | represents it in the wider tradition), an identifier (e.g. the library ID), |
28 | and probably a text. |
29 | |
30 | =head1 METHODS |
31 | |
32 | =head2 new |
33 | |
34 | Create a new witness. Options include: |
35 | |
36 | =over |
37 | |
38 | =item * sigil - A short code to represent the manuscript. Required. |
39 | |
fae52efd |
40 | =item * sourcetype - What sort of witness data this is. Options are |
41 | 'xmldesc', 'plaintext', 'json', or 'collation' (the last should only be |
42 | used by Collation parsers.) |
861c3e27 |
43 | |
fae52efd |
44 | =item * file |
45 | =item * string |
46 | =item * object |
47 | |
48 | The data source for the witness. Use the appropriate option. |
49 | |
50 | =item * use_text - An initialization option. If the witness is read from a |
51 | TEI document and more than one <text/> tag exists therein, the default |
52 | behavior is to use the first defined text. If this is not desired, |
53 | use_text should be set to an XPath expression that will select the correct |
54 | text. |
7158714d |
55 | |
7158714d |
56 | =item * identifier - The recognized name of the manuscript, e.g. a library |
fae52efd |
57 | identifier. Taken from the msDesc element for a TEI file. |
7158714d |
58 | |
59 | =item * other_info - A freeform string for any other description of the |
fae52efd |
60 | manuscript. |
7158714d |
61 | |
62 | =back |
63 | |
64 | =head2 sigil |
65 | |
fae52efd |
66 | The sigil by which to identify this manuscript, which must conform to the |
67 | specification for XML attribute strings (broadly speaking, it must begin |
68 | with a letter and can have only a few sorts of punctuation characters in |
69 | it.) |
70 | |
71 | =head2 identifier |
72 | |
73 | A freeform name by which to identify the manuscript, which may be longer |
74 | than the sigil. Defaults to 'Unidentified ms', but will be taken from the |
75 | TEI msName attribute, or constructed from the settlement and idno if |
76 | supplied. |
77 | |
78 | =head2 settlement |
79 | |
80 | The city, town, etc. where the manuscript is held. Will be read from the |
81 | TEI msDesc element if supplied. |
82 | |
83 | =head2 repository |
84 | |
85 | The institution that holds the manuscript. Will be read from the TEI msDesc |
86 | element if supplied. |
87 | |
88 | =head2 idno |
89 | |
90 | The identification or call number of the manuscript. Will be read from the |
91 | TEI msDesc element if supplied. |
7158714d |
92 | |
93 | =head2 text |
94 | |
fae52efd |
95 | An array of strings (words) that contains the text of the |
96 | manuscript. This should not change after the witness has been |
97 | instantiated, and the path through the collation should always match it. |
98 | |
99 | =head2 layertext |
7158714d |
100 | |
fae52efd |
101 | An array of strings (words) that contains the layered |
102 | text, if any, of the manuscript. This should not change after the witness |
103 | has been instantiated, and the path through the collation should always |
104 | match it. |
7158714d |
105 | |
7158714d |
106 | =head2 identifier |
107 | |
108 | Accessor method for the witness identifier. |
109 | |
110 | =head2 other_info |
111 | |
112 | Accessor method for the general witness description. |
113 | |
06e7cbc7 |
114 | =head2 has_source |
115 | |
116 | Boolean method that returns a true value if the witness was created with a |
117 | data source (that is, a file, string, or object to be parsed). |
118 | |
1f7aa795 |
119 | =head2 is_layered |
7158714d |
120 | |
1f7aa795 |
121 | Boolean method to note whether the witness has layers (e.g. pre-correction |
122 | readings) in the collation. |
7158714d |
123 | |
124 | =begin testing |
125 | |
fae52efd |
126 | use Text::Tradition; |
127 | my $trad = Text::Tradition->new( 'name' => 'test tradition' ); |
128 | my $c = $trad->collation; |
7158714d |
129 | |
fae52efd |
130 | # Test a plaintext witness via string |
131 | my $str = 'This is a line of text'; |
132 | my $ptwit = $trad->add_witness( |
7158714d |
133 | 'sigil' => 'A', |
fae52efd |
134 | 'sourcetype' => 'plaintext', |
135 | 'string' => $str |
7158714d |
136 | ); |
fae52efd |
137 | is( ref( $ptwit ), 'Text::Tradition::Witness', 'Created a witness' ); |
138 | if( $ptwit ) { |
139 | is( $ptwit->sigil, 'A', "Witness has correct sigil" ); |
248276a2 |
140 | $c->make_witness_path( $ptwit ); |
fae52efd |
141 | is( $c->path_text( $ptwit->sigil ), $str, "Witness has correct text" ); |
7158714d |
142 | } |
143 | |
65ed66b9 |
144 | # Test some JSON witnesses via object |
145 | open( JSIN, 't/data/witnesses/testwit.json' ) or die "Could not open JSON test input"; |
146 | binmode( JSIN, ':encoding(UTF-8)' ); |
147 | my @lines = <JSIN>; |
148 | close JSIN; |
149 | $trad->add_json_witnesses( join( '', @lines ) ); |
150 | is( ref( $trad->witness( 'MsAJ' ) ), 'Text::Tradition::Witness', |
151 | "Found first JSON witness" ); |
152 | is( ref( $trad->witness( 'MsBJ' ) ), 'Text::Tradition::Witness', |
153 | "Found second JSON witness" ); |
154 | |
b39fb0b3 |
155 | # Test an XML witness via file |
156 | my $xmlwit = $trad->add_witness( 'sourcetype' => 'xmldesc', |
157 | 'file' => 't/data/witnesses/teiwit.xml' ); |
158 | is( ref( $xmlwit ), 'Text::Tradition::Witness', "Created witness from XML file" ); |
159 | if( $xmlwit ) { |
160 | is( $xmlwit->sigil, 'V887', "XML witness has correct sigil" ); |
161 | ok( $xmlwit->is_layered, "Picked up correction layer" ); |
162 | is( @{$xmlwit->text}, 182, "Got correct text length" ); |
163 | is( @{$xmlwit->layertext}, 182, "Got correct a.c. text length" ); |
164 | } |
165 | my @allwitwords = grep { $_->id =~ /^V887/ } $c->readings; |
166 | is( @allwitwords, 184, "Reused appropriate readings" ); |
fae52efd |
167 | |
168 | ## Test use_text |
b39fb0b3 |
169 | my $xpwit = $trad->add_witness( 'sourcetype' => 'xmldesc', |
170 | 'file' => 't/data/witnesses/group.xml', |
171 | 'use_text' => '//tei:group/tei:text[2]' ); |
172 | is( ref( $xpwit ), 'Text::Tradition::Witness', "Created witness from XML group" ); |
173 | if( $xpwit ) { |
174 | is( $xpwit->sigil, 'G', "XML part witness has correct sigil" ); |
175 | ok( !$xpwit->is_layered, "Picked up no correction layer" ); |
176 | is( @{$xpwit->text}, 157, "Got correct text length" ); |
177 | } |
178 | |
fae52efd |
179 | |
7158714d |
180 | =end testing |
181 | |
182 | =cut |
183 | |
bf7d52b5 |
184 | # Enable plugin(s) if available |
185 | eval { with 'Text::Tradition::WitLanguage'; }; |
186 | |
fae52efd |
187 | subtype 'SourceType', |
188 | as 'Str', |
189 | where { $_ =~ /^(xmldesc|plaintext|json|collation)$/ }, |
190 | message { 'Source type must be one of xmldesc, plaintext, json, collation' }; |
191 | |
192 | subtype 'Sigil', |
193 | as 'Str', |
194 | where { $_ =~ /\A$xml10_name_rx\z/ }, |
195 | message { 'Sigil must be a valid XML attribute string' }; |
196 | |
197 | no Moose::Util::TypeConstraints; |
198 | |
199 | has 'tradition' => ( |
248276a2 |
200 | is => 'ro', |
201 | isa => 'Text::Tradition', |
202 | required => 1, |
203 | weak_ref => 1 |
fae52efd |
204 | ); |
205 | |
206 | # Sigil. Required identifier for a witness, but may be found inside |
207 | # the XML file. |
dd3b58b0 |
208 | has 'sigil' => ( |
7158714d |
209 | is => 'ro', |
fae52efd |
210 | isa => 'Sigil', |
211 | predicate => 'has_sigil', |
212 | writer => '_set_sigil', |
213 | ); |
214 | |
215 | # Other identifying information |
216 | has 'identifier' => ( |
217 | is => 'rw', |
218 | isa => 'Str', |
219 | ); |
220 | |
221 | has 'settlement' => ( |
222 | is => 'rw', |
223 | isa => 'Str', |
224 | ); |
225 | |
226 | has 'repository' => ( |
227 | is => 'rw', |
228 | isa => 'Str', |
229 | ); |
230 | |
231 | has 'idno' => ( |
232 | is => 'rw', |
233 | isa => 'Str', |
234 | ); |
235 | |
b39fb0b3 |
236 | # Source. Can be XML obj, JSON data struct, or string. |
237 | # Not used if the witness is created by parsing a collation. |
fae52efd |
238 | has 'sourcetype' => ( |
239 | is => 'ro', |
240 | isa => 'SourceType', |
241 | required => 1, |
242 | ); |
243 | |
fae52efd |
244 | has 'file' => ( |
245 | is => 'ro', |
246 | isa => 'Str', |
247 | predicate => 'has_file', |
248 | ); |
249 | |
250 | has 'string' => ( |
251 | is => 'ro', |
252 | isa => 'Str', |
253 | predicate => 'has_string', |
254 | ); |
255 | |
256 | has 'object' => ( # could be anything. |
257 | is => 'ro', |
258 | predicate => 'has_object', |
259 | clearer => 'clear_object', |
260 | ); |
261 | |
262 | # In the case of a TEI document with multiple texts, specify |
263 | # which text is the root. Should be an XPath expression. |
264 | has 'use_text' => ( |
265 | is => 'ro', |
7158714d |
266 | isa => 'Str', |
fae52efd |
267 | ); |
268 | |
7158714d |
269 | # Text. This is an array of strings (i.e. word tokens). |
d047cd52 |
270 | # TODO Think about how to handle this for the case of pre-prepared |
271 | # collations, where the tokens are in the graph already. |
dd3b58b0 |
272 | has 'text' => ( |
7158714d |
273 | is => 'rw', |
274 | isa => 'ArrayRef[Str]', |
275 | predicate => 'has_text', |
276 | ); |
b0b4421a |
277 | |
278 | has 'layertext' => ( |
279 | is => 'rw', |
280 | isa => 'ArrayRef[Str]', |
281 | predicate => 'has_layertext', |
282 | ); |
fae52efd |
283 | |
4889be4f |
284 | has 'is_collated' => ( |
285 | is => 'rw', |
286 | isa => 'Bool' |
287 | ); |
288 | |
1f7aa795 |
289 | # Path. This is an array of Reading nodes that can be saved during |
290 | # initialization, but should be cleared before saving in a DB. |
4a8828f0 |
291 | has 'path' => ( |
7158714d |
292 | is => 'rw', |
293 | isa => 'ArrayRef[Text::Tradition::Collation::Reading]', |
294 | predicate => 'has_path', |
1f7aa795 |
295 | clearer => 'clear_path', |
7158714d |
296 | ); |
4a8828f0 |
297 | |
b39fb0b3 |
298 | ## TODO change the name of this |
b15511bf |
299 | has 'uncorrected_path' => ( |
7158714d |
300 | is => 'rw', |
301 | isa => 'ArrayRef[Text::Tradition::Collation::Reading]', |
1f7aa795 |
302 | clearer => 'clear_uncorrected_path', |
303 | ); |
304 | |
305 | has 'is_layered' => ( |
306 | is => 'rw', |
307 | isa => 'Bool', |
7158714d |
308 | ); |
f6066bac |
309 | |
1f7aa795 |
310 | # If we set an uncorrected path, ever, remember that we did so. |
311 | around 'uncorrected_path' => sub { |
312 | my $orig = shift; |
313 | my $self = shift; |
314 | |
315 | $self->is_layered( 1 ); |
316 | $self->$orig( @_ ); |
317 | }; |
e2902068 |
318 | |
784877d9 |
319 | sub BUILD { |
7158714d |
320 | my $self = shift; |
321 | if( $self->has_source ) { |
fae52efd |
322 | my $init_sub = '_init_from_' . $self->sourcetype; |
323 | $self->$init_sub(); |
324 | # Remove our XML / source objects; we no longer need them. |
325 | $self->clear_object if $self->has_object; |
4889be4f |
326 | # $self->tradition->collation->make_witness_path( $self ); |
327 | } |
328 | if( $self->sourcetype eq 'collation' ) { |
329 | $self->is_collated( 1 ); |
fae52efd |
330 | } |
331 | return $self; |
332 | } |
333 | |
334 | sub has_source { |
335 | my $self = shift; |
336 | return $self->has_file || $self->has_string || $self->has_object; |
337 | } |
338 | |
339 | sub _init_from_xmldesc { |
340 | my $self = shift; |
341 | my $xmlobj; |
342 | if( $self->has_object ) { |
343 | unless( ref( $self->object ) eq 'XML::LibXML::Element' ) { |
344 | throw( ident => "bad source", |
345 | message => "Source object must be an XML::LibXML::Element (this is " |
346 | . ref( $self->object ) . ");" ); |
347 | } |
348 | $xmlobj = $self->object; |
349 | } else { |
428bcf0b |
350 | require XML::LibXML; |
fae52efd |
351 | my $parser = XML::LibXML->new(); |
352 | my $parsersub = $self->has_file ? 'parse_file' : 'parse_string'; |
353 | try { |
354 | $xmlobj = $parser->$parsersub( $self->file )->documentElement; |
355 | } catch( XML::LibXML::Error $e ) { |
356 | throw( ident => "bad source", |
357 | message => "XML parsing error: " . $e->as_string ); |
358 | } |
359 | } |
360 | |
361 | unless( $xmlobj->nodeName eq 'TEI' ) { |
362 | throw( ident => "bad source", |
363 | message => "Source XML must be TEI (this is " . $xmlobj->nodeName . ")" ); |
364 | } |
365 | |
366 | # Set up the tags we need, with or without namespaces. |
367 | map { $tags{$_} = "//$_" } |
368 | qw/ msDesc msName settlement repository idno p lg w seg add del /; |
369 | # Set up our XPath object |
370 | my $xpc = _xpc_for_el( $xmlobj ); |
371 | # Use namespace-aware tags if we have to |
372 | if( $xmlobj->namespaceURI ) { |
373 | map { $tags{$_} = "//tei:$_" } keys %tags; |
374 | } |
375 | |
376 | # Get the identifier |
377 | if( my $desc = $xpc->find( $tags{msDesc} ) ) { |
378 | my $descnode = $desc->get_node(1); |
fae52efd |
379 | # First try to use settlement/repository/idno. |
380 | my( $setNode, $reposNode, $idNode ) = |
381 | ( $xpc->find( $tags{settlement}, $descnode )->get_node(1), |
382 | $xpc->find( $tags{repository}, $descnode )->get_node(1), |
383 | $xpc->find( $tags{idno}, $descnode )->get_node(1) ); |
384 | $self->settlement( $setNode ? $setNode->textContent : '' ); |
385 | $self->repository( $reposNode ? $reposNode->textContent : '' ); |
386 | $self->idno( $idNode ? $idNode->textContent : '' ); |
387 | if( $self->settlement && $self->idno ) { |
388 | $self->identifier( join( ' ', $self->{'settlement'}, $self->{'idno'} ) ); |
389 | } else { |
390 | # Look for an msName. |
391 | my $msNameNode = $xpc->find( $tags{msName}, $descnode )->get_node(1); |
392 | if( $msNameNode ) { |
393 | $self->identifier( $msNameNode->textContent ); |
394 | } else { |
395 | # We have an msDesc but who knows what is in it? |
396 | my $desc = $descnode->textContent; |
397 | $desc =~ s/\n/ /gs; |
398 | $desc =~ s/\s+/ /g; |
399 | $self->identifier( $desc ); |
400 | } |
401 | } |
402 | if( $descnode->hasAttribute('xml:id') ) { |
403 | $self->_set_sigil( $descnode->getAttribute('xml:id') ); |
404 | } elsif( !$self->has_sigil ) { |
65ed66b9 |
405 | throw( ident => 'missing sigil', |
406 | message => 'Could not find xml:id witness sigil' ); |
fae52efd |
407 | } |
408 | } else { |
409 | throw( ident => "bad source", |
410 | message => "Could not find manuscript description element in TEI header" ); |
411 | } |
412 | |
413 | # Now get the words out. |
414 | my @words; |
415 | my @layerwords; # if the witness has layers |
416 | # First, make sure all the words are wrapped in tags. |
417 | # TODO Make this not necessarily dependent upon whitespace... |
418 | word_tag_wrap( $xmlobj ); |
419 | # Now go text hunting. |
420 | my @textnodes; |
421 | if( $self->use_text ) { |
422 | @textnodes = $xpc->findnodes( $self->use_text ); |
423 | } else { |
424 | # Use the first 'text' node in the document. |
425 | @textnodes = $xmlobj->getElementsByTagName( 'text' ); |
426 | } |
427 | my $teitext = $textnodes[0]; |
428 | if( $teitext ) { |
429 | _tokenize_text( $self, $teitext, \@words, \@layerwords ); |
430 | } else { |
431 | throw( ident => "bad source", |
432 | message => "No text element in document '" . $self->{'identifier'} . "!" ); |
433 | } |
434 | |
b39fb0b3 |
435 | my @text = map { $_->text } @words; |
436 | my @layertext = map { $_->text } @layerwords; |
fae52efd |
437 | $self->path( \@words ); |
b39fb0b3 |
438 | $self->text( \@text ); |
439 | if( join( ' ', @text ) ne join( ' ', @layertext ) ) { |
fae52efd |
440 | $self->uncorrected_path( \@layerwords ); |
b39fb0b3 |
441 | $self->layertext( \@layertext ); |
fae52efd |
442 | } |
fae52efd |
443 | } |
444 | |
445 | sub _tokenize_text { |
446 | my( $self, $teitext, $wordlist, $uncorrlist ) = @_; |
447 | # Strip out the words. |
448 | my $xpc = _xpc_for_el( $teitext ); |
449 | my @divs = $xpc->findnodes( '//*[starts-with(name(.), "div")]' ); |
450 | foreach( @divs ) { |
451 | my $place_str; |
452 | if( my $n = $_->getAttribute( 'n' ) ) { |
453 | $place_str = '#DIV_' . $n . '#'; |
454 | } else { |
455 | $place_str = '#DIV#'; |
456 | } |
457 | $self->_objectify_words( $teitext, $wordlist, $uncorrlist, $place_str ); |
458 | } # foreach <div/> |
459 | |
460 | # But maybe we don't have any divs. Just paragraphs. |
461 | unless( @divs ) { |
462 | $self->_objectify_words( $teitext, $wordlist, $uncorrlist ); |
463 | } |
464 | } |
465 | |
466 | sub _objectify_words { |
467 | my( $self, $element, $wordlist, $uncorrlist, $divmarker ) = @_; |
468 | |
469 | my $xpc = _xpc_for_el( $element ); |
470 | my $xpexpr = '.' . $tags{p} . '|.' . $tags{lg}; |
471 | my @pgraphs = $xpc->findnodes( $xpexpr ); |
472 | return () unless @pgraphs; |
473 | # Set up an expression to look for words and segs |
474 | $xpexpr = '.' . $tags{w} . '|.' . $tags{seg}; |
475 | foreach my $pg ( @pgraphs ) { |
476 | # If this paragraph is the descendant of a note element, |
477 | # skip it. |
478 | my @noop_container = $xpc->findnodes( 'ancestor::note', $pg ); |
479 | next if scalar @noop_container; |
480 | # Get the text of each node |
481 | my $first_word = 1; |
482 | # Hunt down each wrapped word/seg, and make an object (or two objects) |
483 | # of it, if necessary. |
484 | foreach my $c ( $xpc->findnodes( $xpexpr, $pg ) ) { |
b39fb0b3 |
485 | my( $text, $uncorr ) = _get_word_strings( $c ); |
fae52efd |
486 | # try { |
487 | # ( $text, $uncorr ) = _get_word_object( $c ); |
488 | # } catch( Text::Tradition::Error $e |
489 | # where { $_->has_tag( 'lb' ) } ) { |
490 | # next; |
491 | # } |
492 | unless( defined $text || defined $uncorr ) { |
493 | print STDERR "WARNING: no text in node " . $c->nodeName |
494 | . "\n" unless $c->nodeName eq 'lb'; |
495 | next; |
496 | } |
497 | print STDERR "DEBUG: space found in element node " |
498 | . $c->nodeName . "\n" if $text =~ /\s/ || $uncorr =~ /\s/; |
499 | |
500 | my $ctr = @$wordlist > @$uncorrlist ? @$wordlist : @$uncorrlist; |
501 | while( $self->tradition->collation->reading( $self->sigil.'r'.$ctr ) ) { |
502 | $ctr++; |
503 | } |
504 | my $id = $self->sigil . 'r' . $ctr; |
505 | my( $word, $acword ); |
506 | if( $text ) { |
507 | $word = $self->tradition->collation->add_reading( |
508 | { 'id' => $id, 'text' => $text }); |
509 | } |
510 | if( $uncorr && $uncorr ne $text ) { |
511 | $id .= '_ac'; |
512 | $acword = $self->tradition->collation->add_reading( |
513 | { 'id' => $id, 'text' => $uncorr }); |
514 | } elsif( $uncorr ) { |
515 | $acword = $word; |
516 | } |
517 | |
518 | # if( $first_word ) { |
519 | # $first_word = 0; |
520 | # # Set the relevant sectioning markers |
521 | # if( $divmarker ) { |
522 | # $w->add_placeholder( $divmarker ); |
523 | # $divmarker = undef; |
524 | # } |
525 | # $w->add_placeholder( '#PG#' ); |
526 | # } |
527 | push( @$wordlist, $word ) if $word; |
528 | push( @$uncorrlist, $acword ) if $acword; |
529 | } |
530 | } |
531 | } |
532 | |
533 | # Given a word or segment node, make a Reading object for the word |
534 | # therein. Make two Reading objects if there is an 'uncorrected' vs. |
535 | # 'corrected' state. |
536 | |
537 | sub _get_word_strings { |
538 | my( $node ) = @_; |
539 | my( $text, $uncorrtext ); |
540 | # We can have an lb or pb in the middle of a word; if we do, the |
541 | # whitespace (including \n) after the break becomes insignificant |
542 | # and we want to nuke it. |
543 | my $strip_leading_space = 0; |
544 | my $word_excluded = 0; |
545 | my $xpc = _xpc_for_el( $node ); |
546 | # TODO This does not cope with nested add/dels. |
b39fb0b3 |
547 | my @addition = $xpc->findnodes( 'ancestor::' . substr( $tags{add}, 2 ) ); |
548 | my @deletion = $xpc->findnodes( 'ancestor::' . substr( $tags{del}, 2 ) ); |
fae52efd |
549 | foreach my $c ($node->childNodes() ) { |
550 | if( $c->nodeName eq 'num' |
551 | && defined $c->getAttribute( 'value' ) ) { |
552 | # Push the number. |
553 | $text .= $c->getAttribute( 'value' ) unless @deletion; |
554 | $uncorrtext .= $c->getAttribute( 'value' ) unless @addition; |
555 | # If this is just after a line/page break, return to normal behavior. |
556 | $strip_leading_space = 0; |
557 | } elsif ( $c->nodeName =~ /^[lp]b$/ ) { |
558 | # Set a flag that strips leading whitespace until we |
559 | # get to the next bit of non-whitespace. |
560 | $strip_leading_space = 1; |
561 | } elsif ( $c->nodeName eq 'fw' # for catchwords |
562 | || $c->nodeName eq 'sic' |
563 | || $c->nodeName eq 'note' #TODO: decide how to deal with notes |
564 | || $c->textContent eq '' |
565 | || ref( $c ) eq 'XML::LibXML::Comment' ) { |
566 | $word_excluded = 1 if $c->nodeName =~ /^(fw|sic)$/; |
567 | next; |
568 | } elsif( $c->nodeName eq 'add' ) { |
b39fb0b3 |
569 | my( $use, $discard ) = _get_word_strings( $c ); |
fae52efd |
570 | $text .= $use; |
571 | } elsif( $c->nodeName eq 'del' ) { |
b39fb0b3 |
572 | my( $discard, $use ) = _get_word_strings( $c ); |
fae52efd |
573 | $uncorrtext .= $use; |
574 | } else { |
b39fb0b3 |
575 | my ( $tagtxt, $taguncorr ); |
fae52efd |
576 | if( ref( $c ) eq 'XML::LibXML::Text' ) { |
577 | # A text node. |
578 | $tagtxt = $c->textContent; |
b39fb0b3 |
579 | $taguncorr = $c->textContent; |
fae52efd |
580 | } else { |
b39fb0b3 |
581 | ( $tagtxt, $taguncorr ) = _get_word_strings( $c ); |
fae52efd |
582 | } |
583 | if( $strip_leading_space ) { |
584 | $tagtxt =~ s/^[\s\n]+//s; |
b39fb0b3 |
585 | $taguncorr =~ s/^[\s\n]+//s; |
fae52efd |
586 | # Unset the flag as soon as we see non-whitespace. |
587 | $strip_leading_space = 0 if $tagtxt; |
588 | } |
589 | $text .= $tagtxt; |
b39fb0b3 |
590 | $uncorrtext .= $taguncorr; |
fae52efd |
591 | } |
592 | } |
593 | throw( ident => "text not found", |
594 | tags => [ $node->nodeName ], |
595 | message => "No text found in node " . $node->toString(0) ) |
596 | unless $text || $uncorrtext || $word_excluded || $node->toString(0) =~/gap/; |
597 | return( $text, $uncorrtext ); |
598 | } |
599 | |
600 | sub _split_words { |
601 | my( $self, $string, $c ) = @_; |
602 | my @raw_words = split( /\s+/, $string ); |
603 | my @words; |
604 | foreach my $w ( @raw_words ) { |
605 | my $id = $self->sigil . 'r'. $c++; |
e92d4229 |
606 | my %opts = ( 'text' => $w, 'id' => $id ); |
fae52efd |
607 | my $w_obj = $self->tradition->collation->add_reading( \%opts ); |
608 | # Skip any words that have been canonized out of existence. |
609 | next if( length( $w_obj->text ) == 0 ); |
610 | push( @words, $w_obj ); |
611 | } |
612 | return @words; |
613 | } |
614 | |
615 | sub _init_from_json { |
616 | my( $self ) = shift; |
617 | my $wit; |
618 | if( $self->has_object ) { |
619 | $wit = $self->object; |
65ed66b9 |
620 | } elsif( $self->has_string ) { |
621 | $wit = from_json( $self->string ); |
622 | } elsif( $self->has_file ) { |
623 | my $ok = open( INPUT, $self->file ); |
624 | unless( $ok ) { |
625 | throw( ident => "bad source", |
626 | message => 'Could not open ' . $self->file . ' for reading' ); |
627 | } |
628 | binmode( INPUT, ':encoding(UTF-8)' ); |
629 | my @lines = <INPUT>; |
630 | close INPUT; |
631 | $wit = from_json( join( '', @lines ) ); |
fae52efd |
632 | } |
633 | |
65ed66b9 |
634 | if( exists $wit->{'id'} ) { |
635 | $self->_set_sigil( $wit->{'id'} ); |
636 | } elsif( !$self->has_sigil ) { |
637 | throw( ident => 'missing sigil', |
638 | message => 'Could not find witness sigil (id) in JSON spec' ); |
639 | } |
fae52efd |
640 | $self->identifier( $wit->{'name'} ); |
641 | my @words; |
642 | my @layerwords; |
038d6b50 |
643 | my( @text, @layertext ); |
fae52efd |
644 | if( exists $wit->{'content'} ) { |
645 | # We need to tokenize the text ourselves. |
646 | @words = _split_words( $self, $wit->{'content'} ); |
647 | } elsif( exists $wit->{'tokens'} ) { |
648 | # We have a bunch of pretokenized words. |
649 | my $ctr = 0; |
650 | foreach my $token ( @{$wit->{'tokens'}} ) { |
651 | my $w_obj = $self->tradition->collation->add_reading({ |
65ed66b9 |
652 | 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ }); |
fae52efd |
653 | push( @words, $w_obj ); |
038d6b50 |
654 | push( @text, $token->{'t'} ); # TODO unless...? |
fae52efd |
655 | } |
656 | ## TODO rethink this JSOn mechanism |
657 | if( exists $wit->{'layertokens'} ) { |
658 | foreach my $token ( @{$wit->{'layertokens'}} ) { |
659 | my $w_obj = $self->tradition->collation->add_reading({ |
65ed66b9 |
660 | 'text' => $token->{'t'}, 'id' => $self->sigil . 'r' . $ctr++ }); |
fae52efd |
661 | push( @layerwords, $w_obj ); |
038d6b50 |
662 | push( @layertext, $token->{'t'} ); |
7158714d |
663 | } |
fae52efd |
664 | } |
7158714d |
665 | } |
038d6b50 |
666 | $self->text( \@text ); |
667 | $self->layertext( \@layertext ) if @layertext; |
fae52efd |
668 | $self->path( \@words ); |
669 | $self->uncorrected_path( \@layerwords ) if @layerwords; |
7158714d |
670 | } |
671 | |
fae52efd |
672 | sub _init_from_plaintext { |
673 | my( $self ) = @_; |
bf7d52b5 |
674 | unless( $self->has_sigil ) { |
675 | throw( "No sigil defined for the plaintext witness" ); |
676 | } |
fae52efd |
677 | my $str; |
678 | if( $self->has_file ) { |
679 | my $ok = open( INPUT, $self->file ); |
680 | unless( $ok ) { |
681 | throw( ident => "bad source", |
682 | message => 'Could not open ' . $self->file . ' for reading' ); |
683 | } |
684 | binmode( INPUT, ':encoding(UTF-8)' ); |
685 | my @lines = <INPUT>; |
686 | close INPUT; |
687 | $str = join( '', @lines ); |
688 | } elsif( $self->has_object ) { # ...seriously? |
689 | $str = ${$self->object}; |
690 | } else { |
691 | $str = $self->string; |
692 | } |
693 | |
694 | # TODO allow a different word separation expression |
695 | my @text = split( /\s+/, $str ); |
696 | $self->text( \@text ); |
697 | my @words = _split_words( $self, $str ); |
698 | $self->path( \@words ); |
699 | } |
700 | |
701 | sub throw { |
702 | Text::Tradition::Error->throw( |
703 | 'ident' => 'Witness parsing error', |
704 | 'message' => $_[0], |
705 | ); |
706 | } |
707 | |
708 | sub _xpc_for_el { |
709 | my $el = shift; |
710 | my $xpc = XML::LibXML::XPathContext->new( $el ); |
711 | if( $el->namespaceURI ) { |
712 | $xpc->registerNs( 'tei', $el->namespaceURI ); |
713 | } |
714 | return $xpc; |
715 | } |
716 | |
f025e303 |
717 | =head2 export_as_json |
718 | |
719 | Exports the witness as a JSON structure, with the following keys: |
720 | |
721 | =over 4 |
722 | |
723 | =item * id - The witness sigil |
724 | |
725 | =item * name - The witness identifier |
726 | |
727 | =item * tokens - An array of hashes of the form { "t":"WORD" } |
728 | |
729 | =back |
730 | |
731 | =begin testing |
732 | |
733 | use Text::Tradition; |
fae52efd |
734 | my $trad = Text::Tradition->new(); |
f025e303 |
735 | |
fae52efd |
736 | my @text = qw/ Thhis is a line of text /; |
737 | my $wit = $trad->add_witness( |
f025e303 |
738 | 'sigil' => 'A', |
fae52efd |
739 | 'string' => join( ' ', @text ), |
740 | 'sourcetype' => 'plaintext', |
f025e303 |
741 | 'identifier' => 'test witness', |
742 | ); |
743 | my $jsonstruct = $wit->export_as_json; |
744 | is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" ); |
745 | is( $jsonstruct->{'name'}, 'test witness', "got the right identifier" ); |
746 | is( scalar @{$jsonstruct->{'tokens'}}, 6, "got six text tokens" ); |
747 | foreach my $idx ( 0 .. $#text ) { |
748 | is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $text[$idx], "tokens look OK" ); |
749 | } |
750 | |
751 | my @ctext = qw( when april with his showers sweet with fruit the drought of march |
752 | has pierced unto the root ); |
fae52efd |
753 | $trad = Text::Tradition->new( |
f025e303 |
754 | 'input' => 'CollateX', |
755 | 'file' => 't/data/Collatex-16.xml' ); |
756 | |
757 | $jsonstruct = $trad->witness('A')->export_as_json; |
758 | is( $jsonstruct->{'id'}, 'A', "got the right witness sigil" ); |
759 | is( $jsonstruct->{'name'}, undef, "got undef for missing identifier" ); |
760 | is( scalar @{$jsonstruct->{'tokens'}}, 17, "got all text tokens" ); |
761 | foreach my $idx ( 0 .. $#ctext ) { |
762 | is( $jsonstruct->{'tokens'}->[$idx]->{'t'}, $ctext[$idx], "tokens look OK" ); |
763 | } |
764 | |
fae52efd |
765 | ## TODO test layertext export |
766 | |
f025e303 |
767 | =end testing |
768 | |
769 | =cut |
770 | |
771 | sub export_as_json { |
772 | my $self = shift; |
773 | my @wordlist = map { { 't' => $_ || '' } } @{$self->text}; |
fae52efd |
774 | my $obj = { |
f025e303 |
775 | 'id' => $self->sigil, |
776 | 'tokens' => \@wordlist, |
777 | 'name' => $self->identifier, |
778 | }; |
fae52efd |
779 | if( $self->is_layered ) { |
248276a2 |
780 | my @lwlist = map { { 't' => $_ || '' } } @{$self->layertext}; |
fae52efd |
781 | $obj->{'layertokens'} = \@lwlist; |
782 | } |
783 | return $obj; |
f025e303 |
784 | } |
785 | |
dd3b58b0 |
786 | no Moose; |
787 | __PACKAGE__->meta->make_immutable; |
7158714d |
788 | |
789 | =head1 BUGS / TODO |
790 | |
791 | =over |
792 | |
06e7cbc7 |
793 | =item * Figure out how to serialize a witness |
794 | |
fae52efd |
795 | =item * Support encodings other than UTF-8 |
7158714d |
796 | |
797 | =back |
798 | |
799 | =head1 LICENSE |
800 | |
801 | This package is free software and is provided "as is" without express |
802 | or implied warranty. You can redistribute it and/or modify it under |
803 | the same terms as Perl itself. |
804 | |
805 | =head1 AUTHOR |
806 | |
807 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |