Commit | Line | Data |
6f4946fb |
1 | package Text::Tradition::Parser::CTE; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use XML::LibXML; |
6 | use XML::LibXML::XPathContext; |
7 | |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::CTE |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a TEI file exported from |
15 | Classical Text Editor. |
16 | |
17 | =head1 METHODS |
18 | |
19 | =over |
20 | |
21 | =item B<parse> |
22 | |
23 | my @apparatus = read( $xml_file ); |
24 | |
25 | Takes a Tradition object and a TEI file exported from Classical Text |
26 | Editor; initializes the Tradition from the file. |
27 | |
28 | =cut |
29 | |
30 | my %seg_readings; # Save the XML IDs for apparatus anchors. |
31 | my %sigil_for; # Save the XML IDs for witnesses. |
32 | my %note_start; # Save the readings where an apparatus entry is attached. |
33 | |
34 | sub parse { |
35 | my( $tradition, $xml_str ) = @_; |
36 | my $c = $tradition->collation; # Some shorthand |
37 | |
38 | # First, parse the XML. |
39 | my $parser = XML::LibXML->new(); |
40 | my $doc = $parser->parse_string( $xml_str ); |
41 | my $tei = $doc->documentElement(); |
42 | my $xpc = XML::LibXML::XPathContext->new( $tei ); |
43 | |
44 | # CTE uses a DTD rather than any xmlns-based parsing. Thus we |
45 | # need no namespace foo. |
46 | |
47 | # Get the witnesses and create the witness objects. |
48 | foreach my $wit_el ( $xpc->findnodes( '//sourceDesc/listWit/witness' ) ) { |
49 | # The witness xml:id is used internally, and is *not* the sigil name. |
50 | my $id= $wit_el->getAttribute( 'xml:id' ); |
51 | $id =~ s/^M/sig/; # Stupid but there you go. |
52 | my @sig_parts = $xpc->findnodes( './abbr/descendant::text()', $wit_el ); |
53 | my $sig = join( '', grep { /\w/ } @sig_parts ); |
54 | $tradition->add_witness( sigil => $sig, source => $wit_el->toString() ); |
55 | $sigil_for{$id} = $sig; |
56 | } |
57 | |
58 | # Now go through the text and find the base tokens. Tokens are |
59 | # either plain text to be split on whitespace, or they are wrapped |
60 | # in <hi/> or <seg/> elements. |
61 | my @base_text; |
62 | my $ctr = 1; |
63 | foreach my $pg_el ( $xpc->findnodes( '/TEI/text/p' ) ) { |
64 | foreach my $xn ( $pg_el->childNodes ) { |
65 | push( @base_text, _get_readings( $tradition, $xn ) ); |
66 | } |
67 | } |
68 | |
69 | # String together the base. |
70 | my $source = $c->start; |
71 | foreach my $b ( @base_text ) { |
72 | $c->add_path( $source, $b, $c->baselabel ); |
73 | $source = $b; |
74 | } |
75 | $c->add_path( $source, $c->add_reading('#END#'), $c->baselabel ); |
76 | |
77 | # Now go through the text and find all the apparatus notes, and parse them. |
78 | foreach my $note_el( $xpc->findnodes( '//note[attribute::type = "a1"]' ) ) { |
79 | my $app_start = $note_start{$note_el}; |
80 | my $apparatus = _parse_note( $note_el, $c, $app_start ); |
81 | } |
82 | } |
83 | |
84 | |
85 | ## Recursive little helper function to help us navigate through nested |
86 | ## XML, picking out the text. |
87 | |
88 | sub _get_readings { |
89 | my( $tradition, $xn ) = @_; |
90 | my @readings; |
91 | if( $xn->nodeType == XML_TEXT_NODE ) { |
92 | my $str = $xn->data; |
93 | $str =~ s/^\s+//; |
94 | foreach my $w ( split( /\s+/, $str ) ) { |
95 | my $rdg = $tradition->collation->add_reading( 'n'.$ctr++ ); |
96 | $rdg->text( $w ); |
97 | push( @readings, $rdg ); |
98 | } |
99 | } elsif( $xn->nodeName eq 'hi' ) { |
100 | foreach( $xn->childNodes ) { |
101 | # Recurse as if the hi weren't there. |
102 | push( @readings, _get_readings( $tradition, $_ ) ); |
103 | } |
104 | } elsif( $xn->nodeName eq 'seg' ) { |
105 | # Read the reading, but also add the word in question as an anchor. |
106 | my $seg_id = $xn->getAttribute( 'xml:id' ); |
107 | my @r; |
108 | foreach( $xn->childNodes ) { |
109 | push( @r, _get_readings( $tradition, $_ ) ); |
110 | } |
111 | warn "More than one reading found in seg $seg_id" unless @r == 1; |
112 | $seg_readings{'#'.$seg_id} = $r[0]; |
113 | push( @readings, @r ); |
114 | } elsif( $xn->nodeName eq 'note' ) { |
115 | # Save where we found this note. |
116 | $note_start{$xn} = $readings[-1]; |
117 | } |
118 | return @readings; |
119 | } |
120 | |
121 | ## Helper function to parse apparatus entries. This could get nasty, I mean fun. |
122 | sub _parse_note { |
123 | my( $xn, $c, $app_start ) = @_; |
124 | my $app_end = $seg_readings{$xn->getAttribute( 'targetEnd' )}; |
125 | my $lemma = join( ' ', map { $_->text } $c->reading_sequence( $app_start, $app_end ) ); |
126 | |
127 | my %seen_wits; |
128 | # TODO A list of active witnesses should be passed really. |
129 | map { $seen_wits{$_} = 0 } vals( %sigil_for ); |
130 | |
131 | # The note has a <p/> tag, then <mentioned/>, then 0-1 text nodes, |
132 | # then an assortment of <hi/> or <abbr/> elements. If the hi |
133 | # contains an abbr, then it goes before, otherwise it probably |
134 | # goes after. |
135 | my @p = $xn->getChildrenByTagName( 'p' ); |
136 | warn "More than one pg in note" unless @p == 1; |
137 | |
138 | # Strip the <hi/> elements. |
139 | my @childnodes; |
140 | foreach ( $p[0]->childNodes ) { |
141 | if( $_->nodeName eq 'hi' ) { |
142 | push( @childnodes, $_->childNodes ); |
143 | } else { |
144 | push( @childnodes, $_ ); |
145 | } |
146 | } |
147 | |
148 | # Go through and try to parse the sucker. |
149 | my $apparatus; |
150 | my $curr_rdg = ''; |
151 | my $reading_sigla = 0; |
152 | my @curr_wits; |
153 | foreach my $pxn ( $p[0]->childNodes ) { |
154 | next if $pxn->nodeName eq 'mentioned'; # Redundant for us. |
155 | if( $pxn->nodeType == XML_TEXT_NODE ) { |
156 | my $pxn_str = $pxn->data; |
157 | $pxn_str =~ s/^\s+//; |
158 | $pxn_str =~ s/\s+$//; |
159 | my @parts = split( /,\s*/, $pxn_str ); |
160 | if( @parts > 1 ) { |
161 | # Comma separation means that we are starting a new reading. |
162 | my $last = shift @parts; |
163 | if( $last =~ /^\s*a\.\s*c\.\s*$/ ) { |
164 | my $sig = pop @curr_wits; |
165 | $sig .= '_ac'; |
166 | push( @curr_wits, $sig ); |
167 | } |
168 | $pxn_str = join( ', ', @parts ); |
169 | # Trigger a reading interpretation. |
170 | $reading_sigla = 1; |
171 | } |
172 | if( $reading_sigla ) { |
173 | my @wits = keys %curr_wits; |
174 | $apparatus->{ interpret( $curr_rdg, $lemma ) } = \@wits; |
175 | $curr_rdg = ''; |
176 | $reading_sigla = 0; |
177 | @curr_wits = (); |
178 | } |
179 | |
180 | if( $pxn_str =~ /^\s*a\.\s*c\.\s*$/ ) { |
181 | my $sig = pop @curr_wits; |
182 | $sig .= '_ac'; |
183 | push( @curr_wits, $sig ); |
184 | } else { |
185 | $curr_rdg .= $pxn_str; |
186 | } |
187 | } elsif( $pxn->nodeName eq 'abbr' ) { |
188 | # It is a witness, stick it in @curr_wits |
189 | my $wit = $sigil_for{$pxn->getAttribute( 'n' )} |
190 | push( @curr_wits, $wit ) unless $curr_wits[-1] eq $wit; |
191 | $seen_wits{$wit} += 1; # Keep track of a 'seen' count in case there is an a.c. |
192 | $reading_sigla = 1; |
193 | } |
194 | } |
195 | $apparatus->{ interpret( $curr_rdg, $lemma ) } = \@wits if $curr_rdg; |
196 | $apparatus->{ $lemma } = grep { $seen_wits{$_} == 0 } keys %seen_wits; |
197 | |
198 | return $apparatus; |
199 | } |
200 | |
201 | |
202 | sub interpret { |
203 | # A utility function to change apparatus-ese into a full variant. |
204 | my( $reading, $lemma ) = @_; |
205 | return $reading if $reading eq $lemma; |
206 | my $oldreading = $reading; |
207 | $lemma =~ s/\s+[[:punct:]]+$//; |
208 | $reading =~ s/\s*\(?sic([\s\w.]+)?\)?$//; |
209 | my @words = split( /\s+/, $lemma ); |
210 | if( $reading =~ /^(.*) praem.$/ ) { |
211 | $reading = "$1 $lemma"; |
212 | } elsif( $reading =~ /^(.*) add.$/ ) { |
213 | $reading = "$lemma $1"; |
214 | } elsif( $reading eq 'om.' ) { |
215 | $reading = ''; |
216 | } elsif( $reading eq 'inv.' ) { |
217 | # Hope it is two words. |
218 | print STDERR "WARNING: want to invert a lemma that is not two words\n" |
219 | unless scalar( @words ) == 2; |
220 | $reading = join( ' ', reverse( @words ) ); |
221 | } elsif( $reading eq 'iter.' ) { |
222 | # Repeat the lemma |
223 | $reading = "$lemma $lemma"; |
224 | } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) { |
225 | # The first and last N words captured should replace the first and |
226 | # last N words of the lemma. |
227 | my @begin = split( /\s+/, $1 ); |
228 | my @end = split( /\s+/, $2 ); |
229 | if( scalar( @begin ) + scalar ( @end ) > scalar( @words ) ) { |
230 | # Something is wrong and we can't do the splice. |
231 | print STDERR "ERROR: $lemma is too short to accommodate $oldreading\n"; |
232 | } else { |
233 | splice( @words, 0, scalar @begin, @begin ); |
234 | splice( @words, -(scalar @end), scalar @end, @end ); |
235 | $reading = join( ' ', @words ); |
236 | } |
237 | } |
238 | print STDERR "Interpreted $oldreading as $reading given $lemma\n"; |
239 | return $reading; |
240 | } |
241 | |
242 | =back |
243 | |
244 | =head1 LICENSE |
245 | |
246 | This package is free software and is provided "as is" without express |
247 | or implied warranty. You can redistribute it and/or modify it under |
248 | the same terms as Perl itself. |
249 | |
250 | =head1 AUTHOR |
251 | |
252 | Tara L Andrews, aurum@cpan.org |
253 | |
254 | =cut |
255 | |
256 | 1; |
257 | |