Commit | Line | Data |
32014ec9 |
1 | package Text::Tradition::Parser::Self; |
2 | |
3 | use strict; |
4 | use warnings; |
1f7aa795 |
5 | use Text::Tradition::Parser::GraphML qw/ graphml_parse /; |
9fef629b |
6 | use Text::Tradition::UserStore; |
00c5bf0b |
7 | use TryCatch; |
32014ec9 |
8 | |
9 | =head1 NAME |
10 | |
11 | Text::Tradition::Parser::GraphML |
12 | |
e867486f |
13 | =head1 SYNOPSIS |
14 | |
15 | use Text::Tradition; |
16 | |
17 | my $t_from_file = Text::Tradition->new( |
18 | 'name' => 'my text', |
19 | 'input' => 'Self', |
20 | 'file' => '/path/to/tradition.xml' |
21 | ); |
22 | |
23 | my $t_from_string = Text::Tradition->new( |
24 | 'name' => 'my text', |
25 | 'input' => 'Self', |
26 | 'string' => $tradition_xml, |
27 | ); |
28 | |
32014ec9 |
29 | =head1 DESCRIPTION |
30 | |
31 | Parser module for Text::Tradition to read in its own GraphML output format. |
e867486f |
32 | GraphML is a relatively simple graph description language; a 'graph' element |
33 | can have 'node' and 'edge' elements, and each of these can have simple 'data' |
34 | elements for attributes to be saved. |
32014ec9 |
35 | |
e867486f |
36 | The graph itself has attributes as in the Collation object: |
37 | |
38 | =over |
39 | |
40 | =item * linear |
41 | |
42 | =item * ac_label |
43 | |
44 | =item * baselabel |
45 | |
46 | =item * wit_list_separator |
47 | |
48 | =back |
49 | |
50 | The node objects have the following attributes: |
32014ec9 |
51 | |
52 | =over |
53 | |
e867486f |
54 | =item * name |
55 | |
56 | =item * reading |
57 | |
58 | =item * identical |
59 | |
60 | =item * rank |
61 | |
62 | =item * class |
63 | |
64 | =back |
65 | |
66 | The edge objects have the following attributes: |
67 | |
68 | =over |
69 | |
70 | =item * class |
71 | |
72 | =item * witness (for 'path' class edges) |
73 | |
74 | =item * extra (for 'path' class edges) |
75 | |
76 | =item * relationship (for 'relationship' class edges) |
77 | |
78 | =item * equal_rank (for 'relationship' class edges) |
32014ec9 |
79 | |
e867486f |
80 | =item * non_correctable (for 'relationship' class edges) |
32014ec9 |
81 | |
e867486f |
82 | =item * non_independent (for 'relationship' class edges) |
83 | |
84 | =back |
85 | |
86 | =head1 METHODS |
87 | |
88 | =head2 B<parse> |
89 | |
90 | parse( $graph, $opts ); |
91 | |
92 | Takes an initialized Text::Tradition object and a set of options; creates |
93 | the appropriate nodes and edges on the graph. The options hash should |
94 | include either a 'file' argument or a 'string' argument, depending on the |
95 | source of the XML to be parsed. |
96 | |
97 | =begin testing |
98 | |
9fef629b |
99 | use File::Temp; |
951ddfe8 |
100 | use Safe::Isa; |
9fef629b |
101 | use Test::Warn; |
e867486f |
102 | use Text::Tradition; |
951ddfe8 |
103 | use TryCatch; |
e867486f |
104 | binmode STDOUT, ":utf8"; |
105 | binmode STDERR, ":utf8"; |
106 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
107 | |
108 | my $tradition = 't/data/florilegium_graphml.xml'; |
109 | my $t = Text::Tradition->new( |
110 | 'name' => 'inline', |
111 | 'input' => 'Self', |
112 | 'file' => $tradition, |
113 | ); |
114 | |
951ddfe8 |
115 | ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" ); |
e867486f |
116 | if( $t ) { |
117 | is( scalar $t->collation->readings, 319, "Collation has all readings" ); |
255875b8 |
118 | is( scalar $t->collation->paths, 376, "Collation has all paths" ); |
e867486f |
119 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
120 | } |
121 | |
2a812726 |
122 | # TODO add a relationship, add a stemma, write graphml, reparse it, check that |
123 | # the new data is there |
bbd064a9 |
124 | $t->language('Greek'); |
951ddfe8 |
125 | my $stemma_enabled; |
126 | try { |
127 | $stemma_enabled = $t->enable_stemmata; |
128 | } catch { |
129 | ok( 1, "Skipping stemma tests without Analysis module" ); |
130 | } |
131 | if( $stemma_enabled ) { |
132 | $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' ); |
133 | } |
bbd064a9 |
134 | $t->collation->add_relationship( 'w12', 'w13', |
135 | { 'type' => 'grammatical', 'scope' => 'global', |
136 | 'annotation' => 'This is some note' } ); |
137 | ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" ); |
138 | my $graphml_str = $t->collation->as_graphml; |
139 | |
140 | my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str ); |
951ddfe8 |
141 | ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" ); |
bbd064a9 |
142 | if( $newt ) { |
143 | is( scalar $newt->collation->readings, 319, "Collation has all readings" ); |
144 | is( scalar $newt->collation->paths, 376, "Collation has all paths" ); |
145 | is( scalar $newt->witnesses, 13, "Collation has all witnesses" ); |
146 | is( scalar $newt->collation->relationships, 1, "Collation has added relationship" ); |
147 | is( $newt->language, 'Greek', "Tradition has correct language setting" ); |
148 | my $rel = $newt->collation->get_relationship( 'w12', 'w13' ); |
149 | ok( $rel, "Found set relationship" ); |
150 | is( $rel->annotation, 'This is some note', "Relationship has its properties" ); |
951ddfe8 |
151 | if( $stemma_enabled ) { |
152 | is( scalar $newt->stemmata, 1, "Tradition has its stemma" ); |
153 | is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" ); |
154 | } |
bbd064a9 |
155 | } |
156 | |
9fef629b |
157 | # Test user save / restore |
158 | my $fh = File::Temp->new(); |
159 | my $file = $fh->filename; |
160 | $fh->close; |
161 | my $dsn = "dbi:SQLite:dbname=$file"; |
1df4baa9 |
162 | my $userstore = Text::Tradition::Directory->new( { dsn => $dsn, |
9fef629b |
163 | extra_args => { create => 1 } } ); |
164 | my $scope = $userstore->new_scope(); |
1df4baa9 |
165 | my $testuser = $userstore->create_user( { url => 'http://example.com' } ); |
951ddfe8 |
166 | ok( $testuser->$_isa('Text::Tradition::User'), "Created test user via userstore" ); |
9fef629b |
167 | $testuser->add_tradition( $newt ); |
168 | is( $newt->user->id, $testuser->id, "Assigned tradition to test user" ); |
169 | $graphml_str = $newt->collation->as_graphml; |
170 | my $usert; |
171 | warning_is { |
172 | $usert = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str ); |
173 | } 'DROPPING user assignment without a specified userstore', |
174 | "Got expected user drop warning on parse"; |
175 | $usert = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str, |
1df4baa9 |
176 | 'userstore' => $userstore ); |
9fef629b |
177 | is( $usert->user->id, $testuser->id, "Parsed tradition with userstore points to correct user" ); |
178 | |
951ddfe8 |
179 | # Test warning if we can |
180 | unless( $stemma_enabled ) { |
181 | my $nst; |
182 | warnings_exist { |
183 | $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' ); |
184 | } [qr/DROPPING stemmata/], |
185 | "Got expected stemma drop warning on parse"; |
186 | } |
bbd064a9 |
187 | |
e867486f |
188 | =end testing |
32014ec9 |
189 | |
190 | =cut |
144d845b |
191 | use Data::Dump; |
32014ec9 |
192 | sub parse { |
dfc37e38 |
193 | my( $tradition, $opts ) = @_; |
2626f709 |
194 | |
195 | # Collation data is in the first graph; relationship-specific stuff |
196 | # is in the second. |
197 | my( $graph_data, $rel_data ) = graphml_parse( $opts ); |
144d845b |
198 | |
32014ec9 |
199 | my $collation = $tradition->collation; |
200 | my %witnesses; |
e309421a |
201 | |
0068967c |
202 | # print STDERR "Setting graph globals\n"; |
e3196b2a |
203 | $tradition->name( $graph_data->{'name'} ); |
144d845b |
204 | |
2626f709 |
205 | my $use_version; |
bbd064a9 |
206 | my $tmeta = $tradition->meta; |
207 | my $cmeta = $collation->meta; |
255875b8 |
208 | foreach my $gkey ( keys %{$graph_data->{'global'}} ) { |
209 | my $val = $graph_data->{'global'}->{$gkey}; |
210 | if( $gkey eq 'version' ) { |
211 | $use_version = $val; |
9fef629b |
212 | } elsif( $gkey eq 'stemmata' ) { |
951ddfe8 |
213 | # Make sure we can handle stemmata |
214 | my $stemma_enabled; |
215 | try { |
216 | $stemma_enabled = $tradition->enable_stemmata; |
217 | } catch { |
218 | warn "Analysis module not installed; DROPPING stemmata"; |
219 | } |
9fef629b |
220 | # Parse the stemmata into objects |
951ddfe8 |
221 | if( $stemma_enabled ) { |
222 | foreach my $dotstr ( split( /\n/, $val ) ) { |
223 | $tradition->add_stemma( 'dot' => $dotstr ); |
224 | } |
2a812726 |
225 | } |
9fef629b |
226 | } elsif( $gkey eq 'user' ) { |
227 | # Assign the tradition to the user if we can |
228 | if( exists $opts->{'userstore'} ) { |
1df4baa9 |
229 | my $userdir = delete $opts->{'userstore'}; |
9fef629b |
230 | my $user = $userdir->find_user( { username => $val } ); |
231 | if( $user ) { |
232 | $user->add_tradition( $tradition ); |
233 | } else { |
234 | warn( "Found no user with ID $val; DROPPING user assignment" ); |
235 | } |
236 | } else { |
237 | warn( "DROPPING user assignment without a specified userstore" ); |
238 | } |
bbd064a9 |
239 | } elsif( $tmeta->has_attribute( $gkey ) ) { |
240 | $tradition->$gkey( $val ); |
255875b8 |
241 | } else { |
242 | $collation->$gkey( $val ); |
243 | } |
244 | } |
e309421a |
245 | |
10e4b1ac |
246 | # Add the nodes to the graph. |
247 | # Note any reading IDs that were changed in order to comply with XML |
248 | # name restrictions; we have to hardcode start & end. |
249 | my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' ); |
32014ec9 |
250 | |
bbd064a9 |
251 | # print STDERR "Adding collation readings\n"; |
2626f709 |
252 | foreach my $n ( @{$graph_data->{'nodes'}} ) { |
0174d6a9 |
253 | # If it is the start or end node, we already have one, so |
254 | # grab the rank and go. |
144d845b |
255 | if( defined $n->{'is_start'} ) { |
f3f26624 |
256 | # warn Data::Dump::dump($n); |
257 | # warn $collation->start->id; |
144d845b |
258 | $collation->start->rank($n->{'rank'}); |
259 | next; |
260 | } |
bbd064a9 |
261 | if( defined $n->{'is_end'} ) { |
f3f26624 |
262 | # warn Data::Dump::dump($n); |
bbd064a9 |
263 | $collation->end->rank( $n->{'rank'} ); |
0174d6a9 |
264 | next; |
265 | } |
bbd064a9 |
266 | my $gnode = $collation->add_reading( $n ); |
10e4b1ac |
267 | if( $gnode->id ne $n->{'id'} ) { |
268 | $namechange{$n->{'id'}} = $gnode->id; |
269 | } |
32014ec9 |
270 | } |
910a0a6d |
271 | |
32014ec9 |
272 | # Now add the edges. |
bbd064a9 |
273 | # print STDERR "Adding collation path edges\n"; |
32014ec9 |
274 | foreach my $e ( @{$graph_data->{'edges'}} ) { |
10e4b1ac |
275 | my $sourceid = exists $namechange{$e->{'source'}->{'id'}} |
276 | ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; |
277 | my $targetid = exists $namechange{$e->{'target'}->{'id'}} |
278 | ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; |
279 | my $from = $collation->reading( $sourceid ); |
280 | my $to = $collation->reading( $targetid ); |
bbd064a9 |
281 | |
282 | warn "No witness label on path edge!" unless $e->{'witness'}; |
283 | my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' ); |
284 | $collation->add_path( $from, $to, $label ); |
285 | |
2626f709 |
286 | # Add the witness if we don't have it already. |
bbd064a9 |
287 | unless( $witnesses{$e->{'witness'}} ) { |
82fa4d57 |
288 | $tradition->add_witness( |
289 | sigil => $e->{'witness'}, 'sourcetype' => 'collation' ); |
bbd064a9 |
290 | $witnesses{$e->{'witness'}} = 1; |
255875b8 |
291 | } |
bbd064a9 |
292 | $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'}; |
32014ec9 |
293 | } |
2626f709 |
294 | |
295 | ## Done with the main graph, now look at the relationships. |
296 | # Nodes are added via the call to add_reading above. We only need |
297 | # add the relationships themselves. |
298 | # TODO check that scoping does trt |
bf6e338d |
299 | $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels |
300 | foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) { |
10e4b1ac |
301 | my $sourceid = exists $namechange{$e->{'source'}->{'id'}} |
302 | ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; |
303 | my $targetid = exists $namechange{$e->{'target'}->{'id'}} |
304 | ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; |
305 | my $from = $collation->reading( $sourceid ); |
306 | my $to = $collation->reading( $targetid ); |
bbd064a9 |
307 | delete $e->{'source'}; |
308 | delete $e->{'target'}; |
309 | # The remaining keys are relationship attributes. |
310 | # Backward compatibility... |
311 | if( $use_version eq '2.0' || $use_version eq '3.0' ) { |
312 | delete $e->{'class'}; |
313 | $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'}; |
314 | } |
315 | # Add the specified relationship unless we already have done. |
fdfa59a7 |
316 | my $rel_exists; |
bbd064a9 |
317 | if( $e->{'scope'} ne 'local' ) { |
318 | my $relobj = $collation->get_relationship( $from, $to ); |
319 | if( $relobj && $relobj->scope eq $e->{'scope'} |
320 | && $relobj->type eq $e->{'type'} ) { |
fdfa59a7 |
321 | $rel_exists = 1; |
322 | } |
323 | } |
00c5bf0b |
324 | try { |
325 | $collation->add_relationship( $from, $to, $e ) unless $rel_exists; |
326 | } catch( Text::Tradition::Error $e ) { |
327 | warn "DROPPING $from -> $to: " . $e->message; |
328 | } |
2626f709 |
329 | } |
861c3e27 |
330 | |
331 | # Save the text for each witness so that we can ensure consistency |
332 | # later on |
bbd064a9 |
333 | $collation->text_from_paths(); |
32014ec9 |
334 | } |
335 | |
bf6e338d |
336 | ## Return the relationship that comes first in priority. |
337 | my %LAYERS = ( |
338 | 'collated' => 1, |
339 | 'orthographic' => 2, |
340 | 'spelling' => 3, |
341 | ); |
342 | |
343 | sub _layersort_rel { |
344 | my( $a, $b ) = @_; |
345 | my $key = exists $a->{'type'} ? 'type' : 'relationship'; |
346 | my $at = $LAYERS{$a->{$key}} || 99; |
347 | my $bt = $LAYERS{$b->{$key}} || 99; |
348 | return $at <=> $bt; |
349 | } |
350 | |
e867486f |
351 | 1; |
352 | |
353 | =head1 BUGS / TODO |
354 | |
355 | =over |
356 | |
357 | =item * Make this into a stream parser with GraphML |
358 | |
359 | =item * Simply field -> attribute correspondence for nodes and edges |
360 | |
361 | =item * Share key name constants with Collation.pm |
362 | |
32014ec9 |
363 | =back |
364 | |
365 | =head1 LICENSE |
366 | |
367 | This package is free software and is provided "as is" without express |
368 | or implied warranty. You can redistribute it and/or modify it under |
369 | the same terms as Perl itself. |
370 | |
371 | =head1 AUTHOR |
372 | |
e867486f |
373 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |