Commit | Line | Data |
f6066bac |
1 | package Text::Tradition::Parser::TEI; |
2 | |
3 | use strict; |
4 | use warnings; |
910a0a6d |
5 | use Text::Tradition::Parser::Util qw( collate_variants ); |
f6066bac |
6 | use XML::LibXML; |
7 | use XML::LibXML::XPathContext; |
8 | |
9 | =head1 NAME |
10 | |
11 | Text::Tradition::Parser::TEI |
12 | |
13 | =head1 DESCRIPTION |
14 | |
15 | Parser module for Text::Tradition, given a TEI parallel-segmentation |
16 | file that describes a text and its variants. |
17 | |
18 | =head1 METHODS |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $tei_string ); |
25 | |
26 | Takes an initialized tradition and a string containing the TEI; |
27 | creates the appropriate nodes and edges on the graph, as well as |
28 | the appropriate witness objects. |
29 | |
30 | =cut |
31 | |
910a0a6d |
32 | my $text = {}; # Hash of arrays, one per eventual witness we find. |
910a0a6d |
33 | my $substitutions = {}; # Keep track of merged readings |
34 | my $app_anchors = {}; # Track apparatus references |
35 | my $app_ac = {}; # Save a.c. readings |
eca16057 |
36 | my $app_count; # Keep track of how many apps we have |
910a0a6d |
37 | |
38 | # Create the package variables for tag names. |
39 | |
40 | # Would really like to do this with varname variables, but apparently this |
41 | # is considered a bad idea. The long way round then. |
42 | my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM ) |
43 | = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' ); |
44 | sub make_tagnames { |
45 | my( $ns ) = @_; |
46 | if( $ns ) { |
47 | $LISTWIT = "$ns:$LISTWIT"; |
48 | $WITNESS = "$ns:$WITNESS"; |
49 | $TEXT = "$ns:$TEXT"; |
50 | $W = "$ns:$W"; |
51 | $SEG = "$ns:$SEG"; |
52 | $APP = "$ns:$APP"; |
53 | $RDG = "$ns:$RDG"; |
54 | $LEM = "$ns:$LEM"; |
55 | } |
56 | } |
57 | |
58 | # Parse the TEI file. |
f6066bac |
59 | sub parse { |
dfc37e38 |
60 | my( $tradition, $opts ) = @_; |
f6066bac |
61 | |
62 | # First, parse the XML. |
63 | my $parser = XML::LibXML->new(); |
dfc37e38 |
64 | my $doc; |
65 | if( exists $opts->{'string'} ) { |
66 | $doc = $parser->parse_string( $opts->{'string'} ); |
67 | } elsif ( exists $opts->{'file'} ) { |
68 | $doc = $parser->parse_file( $opts->{'file'} ); |
69 | } else { |
70 | warn "Could not find string or file option to parse"; |
71 | return; |
72 | } |
f6066bac |
73 | my $tei = $doc->documentElement(); |
f2b9605f |
74 | my $xpc = XML::LibXML::XPathContext->new( $tei ); |
910a0a6d |
75 | my $ns; |
76 | if( $tei->namespaceURI ) { |
77 | $ns = 'tei'; |
78 | $xpc->registerNs( $ns, $tei->namespaceURI ); |
79 | } |
80 | make_tagnames( $ns ); |
81 | |
f6066bac |
82 | # Then get the witnesses and create the witness objects. |
910a0a6d |
83 | foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) { |
84 | my $sig = $wit_el->getAttribute( 'xml:id' ); |
85 | my $source = $wit_el->toString(); |
86 | $tradition->add_witness( sigil => $sig, source => $source ); |
f6066bac |
87 | } |
910a0a6d |
88 | map { $text->{$_->sigil} = [] } $tradition->witnesses; |
eca16057 |
89 | |
910a0a6d |
90 | # Look for all word/seg node IDs and note their pre-existence. |
91 | my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); |
92 | save_preexisting_nodeids( @attrs ); |
93 | |
eca16057 |
94 | # Count up how many apps we have. |
95 | my @apps = $xpc->findnodes( "//$APP" ); |
96 | $app_count = scalar( @apps ); |
97 | |
910a0a6d |
98 | # Now go through the children of the text element and pull out the |
99 | # actual text. |
100 | foreach my $xml_el ( $xpc->findnodes( "//$TEXT" ) ) { |
101 | foreach my $xn ( $xml_el->childNodes ) { |
102 | _get_readings( $tradition, $xn ); |
103 | } |
104 | } |
105 | # Our $text global now has lists of readings, one per witness. |
106 | # Join them up. |
107 | my $c = $tradition->collation; |
108 | foreach my $sig ( keys %$text ) { |
109 | next if $sig eq 'base'; # Skip base text readings with no witnesses. |
110 | # Determine the list of readings for |
111 | my $sequence = $text->{$sig}; |
112 | my @real_sequence = ( $c->start ); |
113 | push( @$sequence, $c->end ); |
114 | my $source = $c->start; |
115 | foreach( _clean_sequence( $sig, $sequence ) ) { |
116 | my $rdg = _return_rdg( $_ ); |
117 | push( @real_sequence, $rdg ); |
118 | $c->add_path( $source, $rdg, $sig ); |
119 | $source = $rdg; |
120 | } |
121 | $tradition->witness( $sig )->path( \@real_sequence ); |
122 | # See if we need to make an a.c. version of the witness. |
123 | if( exists $app_ac->{$sig} ) { |
124 | my @uncorrected; |
125 | push( @uncorrected, @real_sequence ); |
126 | foreach my $app ( keys %{$app_ac->{$sig}} ) { |
127 | my $start = _return_rdg( $app_anchors->{$app}->{$sig}->{'start'} ); |
128 | my $end = _return_rdg( $app_anchors->{$app}->{$sig}->{'end'} ); |
129 | my @new = map { _return_rdg( $_ ) } @{$app_ac->{$sig}->{$app}}; |
130 | _replace_sequence( \@uncorrected, $start, $end, @new ); |
131 | } |
132 | my $source = $c->start; |
133 | foreach my $rdg ( @uncorrected ) { |
134 | my $has_base = grep { $_->label eq $sig } $source->edges_to( $rdg ); |
135 | if( $rdg ne $c->start && !$has_base ) { |
136 | print STDERR sprintf( "Adding path %s from %s -> %s\n", |
137 | $sig.$c->ac_label, $source->name, $rdg->name ); |
138 | $c->add_path( $source, $rdg, $sig.$c->ac_label ); |
139 | } |
140 | $source = $rdg; |
141 | } |
78fab1cf |
142 | print STDERR "Adding a.c. version for witness $sig\n"; |
910a0a6d |
143 | $tradition->witness( $sig )->uncorrected_path( \@uncorrected ); |
144 | } |
145 | } |
146 | # Delete readings that are no longer part of the graph. |
147 | # TODO think this is useless actually |
148 | foreach ( keys %$substitutions ) { |
149 | $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); |
150 | } |
c9bf3dbf |
151 | $tradition->collation->calculate_ranks(); |
3bc0cd18 |
152 | |
153 | # Now that we have ranks, see if we have distinct nodes with identical |
154 | # text and identical rank that can be merged. |
0e476982 |
155 | $tradition->collation->flatten_ranks(); |
910a0a6d |
156 | } |
157 | |
158 | sub _clean_sequence { |
159 | my( $wit, $sequence ) = @_; |
160 | my @clean_sequence; |
161 | foreach my $rdg ( @$sequence ) { |
162 | if( $rdg =~ /^PH-(.*)$/ ) { |
163 | # It is a placeholder. Keep it only if we need it. |
164 | my $app_id = $1; |
78fab1cf |
165 | if( exists $app_ac->{$wit} && |
166 | exists $app_ac->{$wit}->{$app_id} ) { |
910a0a6d |
167 | print STDERR "Retaining empty placeholder for $app_id\n"; |
168 | push( @clean_sequence, $rdg ); |
169 | } |
170 | } else { |
171 | push( @clean_sequence, $rdg ); |
172 | } |
f6066bac |
173 | } |
910a0a6d |
174 | return @clean_sequence; |
175 | } |
f6066bac |
176 | |
910a0a6d |
177 | sub _replace_sequence { |
178 | my( $arr, $start, $end, @new ) = @_; |
179 | my( $start_idx, $end_idx ); |
180 | foreach my $i ( 0 .. $#{$arr} ) { |
181 | $start_idx = $i if( $arr->[$i]->name eq $start ); |
182 | if( $arr->[$i]->name eq $end ) { |
183 | $end_idx = $i; |
184 | last; |
185 | } |
186 | } |
187 | unless( $start_idx && $end_idx ) { |
188 | warn "Could not find start and end"; |
189 | return; |
f2b9605f |
190 | } |
910a0a6d |
191 | my $length = $end_idx - $start_idx + 1; |
192 | splice( @$arr, $start_idx, $length, @new ); |
193 | } |
f6066bac |
194 | |
910a0a6d |
195 | sub _return_rdg { |
196 | my( $rdg ) = @_; |
197 | # If we were passed a reading name, return the name. If we were |
198 | # passed a reading object, return the object. |
199 | my $wantobj = ref( $rdg ) eq 'Text::Tradition::Collation::Reading'; |
200 | my $real = $rdg; |
201 | if( exists $substitutions->{ $wantobj ? $rdg->name : $rdg } ) { |
202 | $real = $substitutions->{ $wantobj ? $rdg->name : $rdg }; |
203 | $real = $real->name unless $wantobj; |
204 | } |
205 | return $real; |
f6066bac |
206 | } |
207 | |
910a0a6d |
208 | ## Recursive helper function to help us navigate through nested XML, |
209 | ## picking out the text. $tradition is the tradition, needed for |
210 | ## making readings; $xn is the XML node currently being looked at, |
211 | ## $in_var is a flag to say that we are inside a variant, $ac is a |
212 | ## flag to say that we are inside an ante-correctionem reading, and |
213 | ## @cur_wits is the list of witnesses to which this XML node applies. |
214 | ## Returns the list of readings, if any, created on the run. |
215 | |
216 | { |
3bc0cd18 |
217 | my %active_wits; |
910a0a6d |
218 | my $current_app; |
eca16057 |
219 | my $seen_apps; |
910a0a6d |
220 | |
221 | sub _get_readings { |
222 | my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_; |
3bc0cd18 |
223 | @cur_wits = grep { $active_wits{$_} } keys %active_wits unless $in_var; |
910a0a6d |
224 | |
225 | my @new_readings; |
226 | if( $xn->nodeType == XML_TEXT_NODE ) { |
227 | # Some words, thus make some readings. |
228 | my $str = $xn->data; |
229 | return unless $str =~ /\S/; # skip whitespace-only text nodes |
230 | #print STDERR "Handling text node " . $str . "\n"; |
231 | # Check that all the witnesses we have are active. |
232 | foreach my $c ( @cur_wits ) { |
3bc0cd18 |
233 | warn "$c is not among active wits" unless $active_wits{$c}; |
910a0a6d |
234 | } |
235 | $str =~ s/^\s+//; |
236 | my $final = $str =~ s/\s+$//; |
237 | foreach my $w ( split( /\s+/, $str ) ) { |
238 | # For now, skip punctuation. |
239 | next if $w !~ /[[:alnum:]]/; |
240 | my $rdg = make_reading( $tradition->collation, $w ); |
241 | push( @new_readings, $rdg ); |
242 | unless( $in_var ) { |
910a0a6d |
243 | $rdg->make_common; |
244 | } |
245 | foreach ( @cur_wits ) { |
246 | warn "Empty wit!" unless $_; |
247 | warn "Empty reading!" unless $rdg; |
248 | push( @{$text->{$_}}, $rdg ) unless $ac; |
249 | } |
250 | } |
251 | } elsif( $xn->nodeName eq 'w' ) { |
252 | # Everything in this tag is one word. Also save any original XML ID. |
253 | #print STDERR "Handling word " . $xn->toString . "\n"; |
254 | # Check that all the witnesses we have are active. |
255 | foreach my $c ( @cur_wits ) { |
3bc0cd18 |
256 | warn "$c is not among active wits" unless $active_wits{$c}; |
910a0a6d |
257 | } |
258 | my $xml_id = $xn->getAttribute( 'xml:id' ); |
259 | my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); |
260 | push( @new_readings, $rdg ); |
261 | unless( $in_var ) { |
910a0a6d |
262 | $rdg->make_common; |
263 | } |
264 | foreach( @cur_wits ) { |
265 | warn "Empty wit!" unless $_; |
266 | warn "Empty reading!" unless $rdg; |
267 | push( @{$text->{$_}}, $rdg ) unless $ac; |
268 | } |
269 | } elsif ( $xn->nodeName eq 'app' ) { |
eca16057 |
270 | $seen_apps++; |
910a0a6d |
271 | $current_app = $xn->getAttribute( 'xml:id' ); |
272 | # print STDERR "Handling app $current_app\n"; |
273 | # Keep the reading sets in this app. |
274 | my @sets; |
275 | # Recurse through all children (i.e. rdgs) for sets of words. |
276 | foreach ( $xn->childNodes ) { |
277 | my @rdg_set = _get_readings( $tradition, $_, $in_var, $ac, @cur_wits ); |
278 | push( @sets, \@rdg_set ) if @rdg_set; |
279 | } |
280 | # Now collate these sets if we have more than one. |
281 | my $subs = collate_variants( $tradition->collation, @sets ) if @sets > 1; |
282 | map { $substitutions->{$_} = $subs->{$_} } keys %$subs; |
283 | # TODO Look through substitutions to see if we can make anything common now. |
284 | # Return the entire set of unique readings. |
285 | my %unique; |
286 | foreach my $s ( @sets ) { |
287 | map { $unique{$_->name} = $_ } @$s; |
288 | } |
289 | push( @new_readings, values( %unique ) ); |
290 | # Exit the current app. |
291 | $current_app = ''; |
292 | } elsif ( $xn->nodeName eq 'lem' || $xn->nodeName eq 'rdg' ) { |
293 | # Alter the current witnesses and recurse. |
294 | #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; |
3bc0cd18 |
295 | # TODO handle p.c. and s.l. designations too |
910a0a6d |
296 | $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; |
297 | my @rdg_wits = get_sigla( $xn ); |
298 | @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings |
299 | my @words; |
300 | foreach ( $xn->childNodes ) { |
301 | my @rdg_set = _get_readings( $tradition, $_, 1, $ac, @rdg_wits ); |
302 | push( @words, @rdg_set ) if @rdg_set; |
303 | } |
304 | # If we have more than one word in a reading, it should become a segment. |
305 | # $tradition->collation->add_segment( @words ) if @words > 1; |
306 | |
307 | if( $ac ) { |
308 | # Add the reading set to the a.c. readings. |
309 | foreach ( @rdg_wits ) { |
310 | $app_ac->{$_}->{$current_app} = \@words; |
311 | } |
312 | } else { |
313 | # Add the reading set to the app anchors for each witness |
314 | # or put in placeholders for empty p.c. readings |
315 | foreach ( @rdg_wits ) { |
316 | my $start = @words ? $words[0]->name : "PH-$current_app"; |
317 | my $end = @words ? $words[-1]->name : "PH-$current_app"; |
318 | $app_anchors->{$current_app}->{$_}->{'start'} = $start; |
319 | $app_anchors->{$current_app}->{$_}->{'end'} = $end; |
320 | push( @{$text->{$_}}, $start ) unless @words; |
321 | } |
322 | } |
323 | push( @new_readings, @words ); |
324 | } elsif( $xn->nodeName eq 'witStart' ) { |
325 | # Add the relevant wit(s) to the active list. |
326 | #print STDERR "Handling witStart\n"; |
3bc0cd18 |
327 | map { $active_wits{$_} = 1 } @cur_wits; |
328 | # Record a lacuna in all non-active witnesses if this is |
329 | # the first app. Get the full list from $text. |
330 | if( $seen_apps == 1 ) { |
331 | my $i = 0; |
332 | foreach my $sig ( keys %$text ) { |
333 | next if $active_wits{$sig}; |
334 | my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); |
335 | $i++; |
336 | push( @{$text->{$sig}}, $l ); |
337 | } |
338 | } |
910a0a6d |
339 | } elsif( $xn->nodeName eq 'witEnd' ) { |
340 | # Take the relevant wit(s) out of the list. |
341 | #print STDERR "Handling witEnd\n"; |
3bc0cd18 |
342 | map { $active_wits{$_} = undef } @cur_wits; |
eca16057 |
343 | # Record a lacuna, unless this is the last app. |
344 | unless( $seen_apps == $app_count ) { |
345 | foreach my $i ( 0 .. $#cur_wits ) { |
346 | my $w = $cur_wits[$i]; |
347 | my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); |
348 | push( @{$text->{$w}}, $l ); |
349 | } |
350 | } |
910a0a6d |
351 | } elsif( $xn->nodeName eq 'witDetail' ) { |
352 | # Ignore these for now. |
353 | return; |
354 | } else { |
355 | # Recurse as if this tag weren't there. |
356 | #print STDERR "Recursing on tag " . $xn->nodeName . "\n"; |
357 | foreach( $xn->childNodes ) { |
358 | push( @new_readings, _get_readings( $tradition, $_, $in_var, $ac, @cur_wits ) ); |
359 | } |
360 | } |
361 | return @new_readings; |
362 | } |
363 | |
364 | } |
365 | |
366 | # Helper to extract a list of witness sigla from a reading element. |
f6066bac |
367 | sub get_sigla { |
368 | my( $rdg ) = @_; |
369 | # Cope if we have been handed a NodeList. There is only |
370 | # one reading here. |
371 | if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) { |
910a0a6d |
372 | $rdg = $rdg->shift; |
f6066bac |
373 | } |
374 | |
375 | my @wits; |
376 | if( ref( $rdg ) eq 'XML::LibXML::Element' ) { |
910a0a6d |
377 | my $witstr = $rdg->getAttribute( 'wit' ); |
378 | $witstr =~ s/^\s+//; |
379 | $witstr =~ s/\s+$//; |
380 | @wits = split( /\s+/, $witstr ); |
381 | map { $_ =~ s/^\#// } @wits; |
f6066bac |
382 | } |
383 | return @wits; |
384 | } |
385 | |
910a0a6d |
386 | # Helper with its counters to actually make the readings. |
f2b9605f |
387 | { |
388 | my $word_ctr = 0; |
389 | my %used_nodeids; |
390 | |
910a0a6d |
391 | sub save_preexisting_nodeids { |
392 | foreach( @_ ) { |
393 | $used_nodeids{$_->getValue()} = 1; |
394 | } |
395 | } |
396 | |
f2b9605f |
397 | sub make_reading { |
910a0a6d |
398 | my( $graph, $word, $xml_id ) = @_; |
399 | if( $xml_id ) { |
400 | if( exists $used_nodeids{$xml_id} ) { |
401 | if( $used_nodeids{$xml_id} != 1 ) { |
402 | warn "Already used assigned XML ID somewhere else!"; |
403 | $xml_id = undef; |
404 | } |
405 | } else { |
406 | warn "Undetected pre-existing XML ID"; |
407 | } |
408 | } |
409 | if( !$xml_id ) { |
410 | until( $xml_id ) { |
411 | my $try_id = 'w'.$word_ctr++; |
412 | next if exists $used_nodeids{$try_id}; |
413 | $xml_id = $try_id; |
414 | } |
415 | } |
416 | my $rdg = $graph->add_reading( $xml_id ); |
417 | $rdg->text( $word ); |
418 | $used_nodeids{$xml_id} = $rdg; |
419 | return $rdg; |
f2b9605f |
420 | } |
421 | } |
422 | |
f6066bac |
423 | 1; |