we cannot save coderefs, so stop trying; self parser fixes for new relationship regime
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
00c5bf0b 6use TryCatch;
32014ec9 7
8=head1 NAME
9
10Text::Tradition::Parser::GraphML
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'Self',
19 'file' => '/path/to/tradition.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Self',
25 'string' => $tradition_xml,
26 );
27
32014ec9 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 31GraphML is a relatively simple graph description language; a 'graph' element
32can have 'node' and 'edge' elements, and each of these can have simple 'data'
33elements for attributes to be saved.
32014ec9 34
e867486f 35The graph itself has attributes as in the Collation object:
36
37=over
38
39=item * linear
40
41=item * ac_label
42
43=item * baselabel
44
45=item * wit_list_separator
46
47=back
48
49The node objects have the following attributes:
32014ec9 50
51=over
52
e867486f 53=item * name
54
55=item * reading
56
57=item * identical
58
59=item * rank
60
61=item * class
62
63=back
64
65The edge objects have the following attributes:
66
67=over
68
69=item * class
70
71=item * witness (for 'path' class edges)
72
73=item * extra (for 'path' class edges)
74
75=item * relationship (for 'relationship' class edges)
76
77=item * equal_rank (for 'relationship' class edges)
32014ec9 78
e867486f 79=item * non_correctable (for 'relationship' class edges)
32014ec9 80
e867486f 81=item * non_independent (for 'relationship' class edges)
82
83=back
84
85=head1 METHODS
86
87=head2 B<parse>
88
89parse( $graph, $opts );
90
91Takes an initialized Text::Tradition object and a set of options; creates
92the appropriate nodes and edges on the graph. The options hash should
93include either a 'file' argument or a 'string' argument, depending on the
94source of the XML to be parsed.
95
96=begin testing
97
951ddfe8 98use Safe::Isa;
9fef629b 99use Test::Warn;
e867486f 100use Text::Tradition;
951ddfe8 101use TryCatch;
e867486f 102binmode STDOUT, ":utf8";
103binmode STDERR, ":utf8";
104eval { no warnings; binmode $DB::OUT, ":utf8"; };
105
106my $tradition = 't/data/florilegium_graphml.xml';
107my $t = Text::Tradition->new(
108 'name' => 'inline',
109 'input' => 'Self',
110 'file' => $tradition,
111 );
112
951ddfe8 113ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" );
e867486f 114if( $t ) {
115 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 116 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 117 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
118}
119
2a812726 120# TODO add a relationship, add a stemma, write graphml, reparse it, check that
121# the new data is there
e92d4229 122my $language_enabled = $t->can('language');
123if( $language_enabled ) {
124 $t->language('Greek');
125}
37bf09f4 126my $stemma_enabled = $t->can('add_stemma');
951ddfe8 127if( $stemma_enabled ) {
128 $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
129}
bbd064a9 130$t->collation->add_relationship( 'w12', 'w13',
131 { 'type' => 'grammatical', 'scope' => 'global',
132 'annotation' => 'This is some note' } );
133ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
134my $graphml_str = $t->collation->as_graphml;
135
136my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
951ddfe8 137ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" );
bbd064a9 138if( $newt ) {
139 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
140 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
141 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
142 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
e92d4229 143 if( $language_enabled ) {
144 is( $newt->language, 'Greek', "Tradition has correct language setting" );
145 }
bbd064a9 146 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
147 ok( $rel, "Found set relationship" );
148 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
951ddfe8 149 if( $stemma_enabled ) {
150 is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
151 is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
152 }
bbd064a9 153}
154
951ddfe8 155# Test warning if we can
156unless( $stemma_enabled ) {
157 my $nst;
158 warnings_exist {
159 $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' );
160 } [qr/DROPPING stemmata/],
161 "Got expected stemma drop warning on parse";
162}
bbd064a9 163
e867486f 164=end testing
32014ec9 165
166=cut
144d845b 167use Data::Dump;
32014ec9 168sub parse {
dfc37e38 169 my( $tradition, $opts ) = @_;
2626f709 170
171 # Collation data is in the first graph; relationship-specific stuff
172 # is in the second.
173 my( $graph_data, $rel_data ) = graphml_parse( $opts );
144d845b 174
32014ec9 175 my $collation = $tradition->collation;
176 my %witnesses;
e309421a 177
0068967c 178 # print STDERR "Setting graph globals\n";
e3196b2a 179 $tradition->name( $graph_data->{'name'} );
144d845b 180
2626f709 181 my $use_version;
255875b8 182 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
183 my $val = $graph_data->{'global'}->{$gkey};
184 if( $gkey eq 'version' ) {
185 $use_version = $val;
9fef629b 186 } elsif( $gkey eq 'stemmata' ) {
951ddfe8 187 # Make sure we can handle stemmata
9fef629b 188 # Parse the stemmata into objects
37bf09f4 189 if( $tradition->can('add_stemma') ) {
951ddfe8 190 foreach my $dotstr ( split( /\n/, $val ) ) {
191 $tradition->add_stemma( 'dot' => $dotstr );
192 }
37bf09f4 193 } else {
194 warn "Analysis module not installed; DROPPING stemmata";
2a812726 195 }
9fef629b 196 } elsif( $gkey eq 'user' ) {
197 # Assign the tradition to the user if we can
198 if( exists $opts->{'userstore'} ) {
1df4baa9 199 my $userdir = delete $opts->{'userstore'};
9fef629b 200 my $user = $userdir->find_user( { username => $val } );
201 if( $user ) {
202 $user->add_tradition( $tradition );
203 } else {
204 warn( "Found no user with ID $val; DROPPING user assignment" );
205 }
206 } else {
207 warn( "DROPPING user assignment without a specified userstore" );
208 }
8943ff68 209 } elsif( $tradition->can( $gkey ) ) {
bbd064a9 210 $tradition->$gkey( $val );
8943ff68 211 } elsif( $collation->can( $gkey ) ) {
255875b8 212 $collation->$gkey( $val );
8943ff68 213 } else {
214 warn( "DROPPING unsupported attribute $gkey" );
255875b8 215 }
216 }
e309421a 217
10e4b1ac 218 # Add the nodes to the graph.
219 # Note any reading IDs that were changed in order to comply with XML
220 # name restrictions; we have to hardcode start & end.
221 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 222
bbd064a9 223 # print STDERR "Adding collation readings\n";
2626f709 224 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 225 # If it is the start or end node, we already have one, so
226 # grab the rank and go.
144d845b 227 if( defined $n->{'is_start'} ) {
a445ce40 228 $collation->start->rank($n->{'rank'});
229 next;
144d845b 230 }
bbd064a9 231 if( defined $n->{'is_end'} ) {
232 $collation->end->rank( $n->{'rank'} );
0174d6a9 233 next;
234 }
bbd064a9 235 my $gnode = $collation->add_reading( $n );
10e4b1ac 236 if( $gnode->id ne $n->{'id'} ) {
237 $namechange{$n->{'id'}} = $gnode->id;
238 }
32014ec9 239 }
910a0a6d 240
32014ec9 241 # Now add the edges.
bbd064a9 242 # print STDERR "Adding collation path edges\n";
32014ec9 243 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 244 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
245 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
246 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
247 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
248 my $from = $collation->reading( $sourceid );
249 my $to = $collation->reading( $targetid );
bbd064a9 250
251 warn "No witness label on path edge!" unless $e->{'witness'};
252 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
253 $collation->add_path( $from, $to, $label );
254
2626f709 255 # Add the witness if we don't have it already.
bbd064a9 256 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 257 $tradition->add_witness(
258 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 259 $witnesses{$e->{'witness'}} = 1;
255875b8 260 }
bbd064a9 261 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 262 }
2626f709 263
264 ## Done with the main graph, now look at the relationships.
265 # Nodes are added via the call to add_reading above. We only need
266 # add the relationships themselves.
267 # TODO check that scoping does trt
c7bd2768 268 $tradition->_init_done( 1 ); # so that relationships get validated
bf6e338d 269 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
c7bd2768 270 # Backward compatibility...
271 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
272 foreach my $e ( @{$rel_data->{'edges'}} ) {
273 delete $e->{'class'};
274 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
275 }
276 }
277
278 my $rg = $collation->relations;
279 foreach my $e ( sort { _apply_relationship_order( $a, $b, $rg ) }
280 @{$rel_data->{'edges'}} ) {
10e4b1ac 281 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
282 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
283 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
284 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
285 my $from = $collation->reading( $sourceid );
286 my $to = $collation->reading( $targetid );
bbd064a9 287 delete $e->{'source'};
288 delete $e->{'target'};
289 # The remaining keys are relationship attributes.
bbd064a9 290 # Add the specified relationship unless we already have done.
fdfa59a7 291 my $rel_exists;
bbd064a9 292 if( $e->{'scope'} ne 'local' ) {
293 my $relobj = $collation->get_relationship( $from, $to );
294 if( $relobj && $relobj->scope eq $e->{'scope'}
295 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 296 $rel_exists = 1;
c7bd2768 297 } else {
298 # Don't propagate the relationship; all the propagations are
299 # already in the XML.
300 $e->{'thispaironly'} = 1;
fdfa59a7 301 }
302 }
00c5bf0b 303 try {
304 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
c7bd2768 305 } catch( Text::Tradition::Error $err ) {
306 warn "DROPPING " . $e->{type} . " rel on $from -> $to: " . $err->message;
00c5bf0b 307 }
2626f709 308 }
861c3e27 309
310 # Save the text for each witness so that we can ensure consistency
311 # later on
bbd064a9 312 $collation->text_from_paths();
32014ec9 313}
314
c7bd2768 315# Helper sort function for applying the saved relationships in a
316# sensible order.
317sub _apply_relationship_order {
318 my( $a, $b, $rg ) = @_;
319 my $at = $rg->type( $a->{type} ); my $bt = $rg->type( $b->{type} );
320 # Apply strong relationships before weak
321 return -1 if $bt->is_weak && !$at->is_weak;
322 return 1 if $at->is_weak && !$bt->is_weak;
323 # Apply local before global
324 return -1 if $a->{scope} eq 'local' && $b->{scope} ne 'local';
325 return 1 if $b->{scope} eq 'local' && $a->{scope} ne 'local';
326 # Apply more tightly bound relationships first
327 return $at->bindlevel <=> $bt->bindlevel;
bf6e338d 328}
329
e867486f 3301;
331
332=head1 BUGS / TODO
333
334=over
335
336=item * Make this into a stream parser with GraphML
337
32014ec9 338=back
339
340=head1 LICENSE
341
342This package is free software and is provided "as is" without express
343or implied warranty. You can redistribute it and/or modify it under
344the same terms as Perl itself.
345
346=head1 AUTHOR
347
e867486f 348Tara L Andrews E<lt>aurum@cpan.orgE<gt>