Commit | Line | Data |
32014ec9 |
1 | package Text::Tradition::Parser::Self; |
2 | |
3 | use strict; |
4 | use warnings; |
1f7aa795 |
5 | use Text::Tradition::Parser::GraphML qw/ graphml_parse /; |
00c5bf0b |
6 | use TryCatch; |
32014ec9 |
7 | |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::GraphML |
11 | |
e867486f |
12 | =head1 SYNOPSIS |
13 | |
14 | use Text::Tradition; |
15 | |
16 | my $t_from_file = Text::Tradition->new( |
17 | 'name' => 'my text', |
18 | 'input' => 'Self', |
19 | 'file' => '/path/to/tradition.xml' |
20 | ); |
21 | |
22 | my $t_from_string = Text::Tradition->new( |
23 | 'name' => 'my text', |
24 | 'input' => 'Self', |
25 | 'string' => $tradition_xml, |
26 | ); |
27 | |
32014ec9 |
28 | =head1 DESCRIPTION |
29 | |
30 | Parser module for Text::Tradition to read in its own GraphML output format. |
e867486f |
31 | GraphML is a relatively simple graph description language; a 'graph' element |
32 | can have 'node' and 'edge' elements, and each of these can have simple 'data' |
33 | elements for attributes to be saved. |
32014ec9 |
34 | |
e867486f |
35 | The graph itself has attributes as in the Collation object: |
36 | |
37 | =over |
38 | |
39 | =item * linear |
40 | |
41 | =item * ac_label |
42 | |
43 | =item * baselabel |
44 | |
45 | =item * wit_list_separator |
46 | |
47 | =back |
48 | |
49 | The node objects have the following attributes: |
32014ec9 |
50 | |
51 | =over |
52 | |
e867486f |
53 | =item * name |
54 | |
55 | =item * reading |
56 | |
57 | =item * identical |
58 | |
59 | =item * rank |
60 | |
61 | =item * class |
62 | |
63 | =back |
64 | |
65 | The edge objects have the following attributes: |
66 | |
67 | =over |
68 | |
69 | =item * class |
70 | |
71 | =item * witness (for 'path' class edges) |
72 | |
73 | =item * extra (for 'path' class edges) |
74 | |
75 | =item * relationship (for 'relationship' class edges) |
76 | |
77 | =item * equal_rank (for 'relationship' class edges) |
32014ec9 |
78 | |
e867486f |
79 | =item * non_correctable (for 'relationship' class edges) |
32014ec9 |
80 | |
e867486f |
81 | =item * non_independent (for 'relationship' class edges) |
82 | |
83 | =back |
84 | |
85 | =head1 METHODS |
86 | |
87 | =head2 B<parse> |
88 | |
89 | parse( $graph, $opts ); |
90 | |
91 | Takes an initialized Text::Tradition object and a set of options; creates |
92 | the appropriate nodes and edges on the graph. The options hash should |
93 | include either a 'file' argument or a 'string' argument, depending on the |
94 | source of the XML to be parsed. |
95 | |
96 | =begin testing |
97 | |
951ddfe8 |
98 | use Safe::Isa; |
9fef629b |
99 | use Test::Warn; |
e867486f |
100 | use Text::Tradition; |
951ddfe8 |
101 | use TryCatch; |
e867486f |
102 | binmode STDOUT, ":utf8"; |
103 | binmode STDERR, ":utf8"; |
104 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
105 | |
106 | my $tradition = 't/data/florilegium_graphml.xml'; |
107 | my $t = Text::Tradition->new( |
108 | 'name' => 'inline', |
109 | 'input' => 'Self', |
110 | 'file' => $tradition, |
111 | ); |
112 | |
951ddfe8 |
113 | ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" ); |
e867486f |
114 | if( $t ) { |
115 | is( scalar $t->collation->readings, 319, "Collation has all readings" ); |
255875b8 |
116 | is( scalar $t->collation->paths, 376, "Collation has all paths" ); |
e867486f |
117 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
118 | } |
119 | |
2a812726 |
120 | # TODO add a relationship, add a stemma, write graphml, reparse it, check that |
121 | # the new data is there |
e92d4229 |
122 | my $language_enabled = $t->can('language'); |
123 | if( $language_enabled ) { |
124 | $t->language('Greek'); |
125 | } |
37bf09f4 |
126 | my $stemma_enabled = $t->can('add_stemma'); |
951ddfe8 |
127 | if( $stemma_enabled ) { |
128 | $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' ); |
129 | } |
bbd064a9 |
130 | $t->collation->add_relationship( 'w12', 'w13', |
131 | { 'type' => 'grammatical', 'scope' => 'global', |
132 | 'annotation' => 'This is some note' } ); |
133 | ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" ); |
134 | my $graphml_str = $t->collation->as_graphml; |
135 | |
136 | my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str ); |
951ddfe8 |
137 | ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" ); |
bbd064a9 |
138 | if( $newt ) { |
139 | is( scalar $newt->collation->readings, 319, "Collation has all readings" ); |
140 | is( scalar $newt->collation->paths, 376, "Collation has all paths" ); |
141 | is( scalar $newt->witnesses, 13, "Collation has all witnesses" ); |
142 | is( scalar $newt->collation->relationships, 1, "Collation has added relationship" ); |
e92d4229 |
143 | if( $language_enabled ) { |
144 | is( $newt->language, 'Greek', "Tradition has correct language setting" ); |
145 | } |
bbd064a9 |
146 | my $rel = $newt->collation->get_relationship( 'w12', 'w13' ); |
147 | ok( $rel, "Found set relationship" ); |
148 | is( $rel->annotation, 'This is some note', "Relationship has its properties" ); |
951ddfe8 |
149 | if( $stemma_enabled ) { |
150 | is( scalar $newt->stemmata, 1, "Tradition has its stemma" ); |
151 | is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" ); |
152 | } |
bbd064a9 |
153 | } |
154 | |
951ddfe8 |
155 | # Test warning if we can |
156 | unless( $stemma_enabled ) { |
157 | my $nst; |
158 | warnings_exist { |
159 | $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' ); |
160 | } [qr/DROPPING stemmata/], |
161 | "Got expected stemma drop warning on parse"; |
162 | } |
bbd064a9 |
163 | |
e867486f |
164 | =end testing |
32014ec9 |
165 | |
166 | =cut |
144d845b |
167 | use Data::Dump; |
32014ec9 |
168 | sub parse { |
dfc37e38 |
169 | my( $tradition, $opts ) = @_; |
2626f709 |
170 | |
171 | # Collation data is in the first graph; relationship-specific stuff |
172 | # is in the second. |
173 | my( $graph_data, $rel_data ) = graphml_parse( $opts ); |
144d845b |
174 | |
32014ec9 |
175 | my $collation = $tradition->collation; |
176 | my %witnesses; |
e309421a |
177 | |
0068967c |
178 | # print STDERR "Setting graph globals\n"; |
e3196b2a |
179 | $tradition->name( $graph_data->{'name'} ); |
144d845b |
180 | |
2626f709 |
181 | my $use_version; |
bbd064a9 |
182 | my $tmeta = $tradition->meta; |
183 | my $cmeta = $collation->meta; |
255875b8 |
184 | foreach my $gkey ( keys %{$graph_data->{'global'}} ) { |
185 | my $val = $graph_data->{'global'}->{$gkey}; |
186 | if( $gkey eq 'version' ) { |
187 | $use_version = $val; |
9fef629b |
188 | } elsif( $gkey eq 'stemmata' ) { |
951ddfe8 |
189 | # Make sure we can handle stemmata |
9fef629b |
190 | # Parse the stemmata into objects |
37bf09f4 |
191 | if( $tradition->can('add_stemma') ) { |
951ddfe8 |
192 | foreach my $dotstr ( split( /\n/, $val ) ) { |
193 | $tradition->add_stemma( 'dot' => $dotstr ); |
194 | } |
37bf09f4 |
195 | } else { |
196 | warn "Analysis module not installed; DROPPING stemmata"; |
2a812726 |
197 | } |
e92d4229 |
198 | } elsif( $gkey eq 'language' ) { |
199 | if( $tradition->can('language') ) { |
200 | $tradition->language( $val ); |
201 | } else { |
202 | warn "Morphology module not installed; DROPPING language"; |
203 | } |
9fef629b |
204 | } elsif( $gkey eq 'user' ) { |
205 | # Assign the tradition to the user if we can |
206 | if( exists $opts->{'userstore'} ) { |
1df4baa9 |
207 | my $userdir = delete $opts->{'userstore'}; |
9fef629b |
208 | my $user = $userdir->find_user( { username => $val } ); |
209 | if( $user ) { |
210 | $user->add_tradition( $tradition ); |
211 | } else { |
212 | warn( "Found no user with ID $val; DROPPING user assignment" ); |
213 | } |
214 | } else { |
215 | warn( "DROPPING user assignment without a specified userstore" ); |
216 | } |
bbd064a9 |
217 | } elsif( $tmeta->has_attribute( $gkey ) ) { |
218 | $tradition->$gkey( $val ); |
255875b8 |
219 | } else { |
220 | $collation->$gkey( $val ); |
221 | } |
222 | } |
e309421a |
223 | |
10e4b1ac |
224 | # Add the nodes to the graph. |
225 | # Note any reading IDs that were changed in order to comply with XML |
226 | # name restrictions; we have to hardcode start & end. |
227 | my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' ); |
32014ec9 |
228 | |
bbd064a9 |
229 | # print STDERR "Adding collation readings\n"; |
2626f709 |
230 | foreach my $n ( @{$graph_data->{'nodes'}} ) { |
0174d6a9 |
231 | # If it is the start or end node, we already have one, so |
232 | # grab the rank and go. |
144d845b |
233 | if( defined $n->{'is_start'} ) { |
a445ce40 |
234 | $collation->start->rank($n->{'rank'}); |
235 | next; |
144d845b |
236 | } |
bbd064a9 |
237 | if( defined $n->{'is_end'} ) { |
238 | $collation->end->rank( $n->{'rank'} ); |
0174d6a9 |
239 | next; |
240 | } |
bbd064a9 |
241 | my $gnode = $collation->add_reading( $n ); |
10e4b1ac |
242 | if( $gnode->id ne $n->{'id'} ) { |
243 | $namechange{$n->{'id'}} = $gnode->id; |
244 | } |
32014ec9 |
245 | } |
910a0a6d |
246 | |
32014ec9 |
247 | # Now add the edges. |
bbd064a9 |
248 | # print STDERR "Adding collation path edges\n"; |
32014ec9 |
249 | foreach my $e ( @{$graph_data->{'edges'}} ) { |
10e4b1ac |
250 | my $sourceid = exists $namechange{$e->{'source'}->{'id'}} |
251 | ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; |
252 | my $targetid = exists $namechange{$e->{'target'}->{'id'}} |
253 | ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; |
254 | my $from = $collation->reading( $sourceid ); |
255 | my $to = $collation->reading( $targetid ); |
bbd064a9 |
256 | |
257 | warn "No witness label on path edge!" unless $e->{'witness'}; |
258 | my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' ); |
259 | $collation->add_path( $from, $to, $label ); |
260 | |
2626f709 |
261 | # Add the witness if we don't have it already. |
bbd064a9 |
262 | unless( $witnesses{$e->{'witness'}} ) { |
82fa4d57 |
263 | $tradition->add_witness( |
264 | sigil => $e->{'witness'}, 'sourcetype' => 'collation' ); |
bbd064a9 |
265 | $witnesses{$e->{'witness'}} = 1; |
255875b8 |
266 | } |
bbd064a9 |
267 | $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'}; |
32014ec9 |
268 | } |
2626f709 |
269 | |
270 | ## Done with the main graph, now look at the relationships. |
271 | # Nodes are added via the call to add_reading above. We only need |
272 | # add the relationships themselves. |
273 | # TODO check that scoping does trt |
bf6e338d |
274 | $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels |
275 | foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) { |
10e4b1ac |
276 | my $sourceid = exists $namechange{$e->{'source'}->{'id'}} |
277 | ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; |
278 | my $targetid = exists $namechange{$e->{'target'}->{'id'}} |
279 | ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; |
280 | my $from = $collation->reading( $sourceid ); |
281 | my $to = $collation->reading( $targetid ); |
bbd064a9 |
282 | delete $e->{'source'}; |
283 | delete $e->{'target'}; |
284 | # The remaining keys are relationship attributes. |
285 | # Backward compatibility... |
286 | if( $use_version eq '2.0' || $use_version eq '3.0' ) { |
287 | delete $e->{'class'}; |
288 | $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'}; |
289 | } |
290 | # Add the specified relationship unless we already have done. |
fdfa59a7 |
291 | my $rel_exists; |
bbd064a9 |
292 | if( $e->{'scope'} ne 'local' ) { |
293 | my $relobj = $collation->get_relationship( $from, $to ); |
294 | if( $relobj && $relobj->scope eq $e->{'scope'} |
295 | && $relobj->type eq $e->{'type'} ) { |
fdfa59a7 |
296 | $rel_exists = 1; |
297 | } |
298 | } |
00c5bf0b |
299 | try { |
300 | $collation->add_relationship( $from, $to, $e ) unless $rel_exists; |
301 | } catch( Text::Tradition::Error $e ) { |
302 | warn "DROPPING $from -> $to: " . $e->message; |
303 | } |
2626f709 |
304 | } |
861c3e27 |
305 | |
306 | # Save the text for each witness so that we can ensure consistency |
307 | # later on |
bbd064a9 |
308 | $collation->text_from_paths(); |
32014ec9 |
309 | } |
310 | |
bf6e338d |
311 | ## Return the relationship that comes first in priority. |
312 | my %LAYERS = ( |
313 | 'collated' => 1, |
314 | 'orthographic' => 2, |
315 | 'spelling' => 3, |
316 | ); |
317 | |
318 | sub _layersort_rel { |
319 | my( $a, $b ) = @_; |
320 | my $key = exists $a->{'type'} ? 'type' : 'relationship'; |
321 | my $at = $LAYERS{$a->{$key}} || 99; |
322 | my $bt = $LAYERS{$b->{$key}} || 99; |
323 | return $at <=> $bt; |
324 | } |
325 | |
e867486f |
326 | 1; |
327 | |
328 | =head1 BUGS / TODO |
329 | |
330 | =over |
331 | |
332 | =item * Make this into a stream parser with GraphML |
333 | |
334 | =item * Simply field -> attribute correspondence for nodes and edges |
335 | |
336 | =item * Share key name constants with Collation.pm |
337 | |
32014ec9 |
338 | =back |
339 | |
340 | =head1 LICENSE |
341 | |
342 | This package is free software and is provided "as is" without express |
343 | or implied warranty. You can redistribute it and/or modify it under |
344 | the same terms as Perl itself. |
345 | |
346 | =head1 AUTHOR |
347 | |
e867486f |
348 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |