make sure all tests work in all combos, save the broken Directory deletion; POD doc...
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
00c5bf0b 6use TryCatch;
32014ec9 7
8=head1 NAME
9
10Text::Tradition::Parser::GraphML
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'Self',
19 'file' => '/path/to/tradition.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Self',
25 'string' => $tradition_xml,
26 );
27
32014ec9 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 31GraphML is a relatively simple graph description language; a 'graph' element
32can have 'node' and 'edge' elements, and each of these can have simple 'data'
33elements for attributes to be saved.
32014ec9 34
e867486f 35The graph itself has attributes as in the Collation object:
36
37=over
38
39=item * linear
40
41=item * ac_label
42
43=item * baselabel
44
45=item * wit_list_separator
46
47=back
48
49The node objects have the following attributes:
32014ec9 50
51=over
52
e867486f 53=item * name
54
55=item * reading
56
57=item * identical
58
59=item * rank
60
61=item * class
62
63=back
64
65The edge objects have the following attributes:
66
67=over
68
69=item * class
70
71=item * witness (for 'path' class edges)
72
73=item * extra (for 'path' class edges)
74
75=item * relationship (for 'relationship' class edges)
76
77=item * equal_rank (for 'relationship' class edges)
32014ec9 78
e867486f 79=item * non_correctable (for 'relationship' class edges)
32014ec9 80
e867486f 81=item * non_independent (for 'relationship' class edges)
82
83=back
84
85=head1 METHODS
86
87=head2 B<parse>
88
89parse( $graph, $opts );
90
91Takes an initialized Text::Tradition object and a set of options; creates
92the appropriate nodes and edges on the graph. The options hash should
93include either a 'file' argument or a 'string' argument, depending on the
94source of the XML to be parsed.
95
96=begin testing
97
951ddfe8 98use Safe::Isa;
9fef629b 99use Test::Warn;
e867486f 100use Text::Tradition;
951ddfe8 101use TryCatch;
e867486f 102binmode STDOUT, ":utf8";
103binmode STDERR, ":utf8";
104eval { no warnings; binmode $DB::OUT, ":utf8"; };
105
106my $tradition = 't/data/florilegium_graphml.xml';
107my $t = Text::Tradition->new(
108 'name' => 'inline',
109 'input' => 'Self',
110 'file' => $tradition,
111 );
112
951ddfe8 113ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" );
e867486f 114if( $t ) {
115 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 116 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 117 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
118}
119
2a812726 120# TODO add a relationship, add a stemma, write graphml, reparse it, check that
121# the new data is there
e92d4229 122my $language_enabled = $t->can('language');
123if( $language_enabled ) {
124 $t->language('Greek');
125}
37bf09f4 126my $stemma_enabled = $t->can('add_stemma');
951ddfe8 127if( $stemma_enabled ) {
128 $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
129}
bbd064a9 130$t->collation->add_relationship( 'w12', 'w13',
131 { 'type' => 'grammatical', 'scope' => 'global',
132 'annotation' => 'This is some note' } );
133ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
134my $graphml_str = $t->collation->as_graphml;
135
136my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
951ddfe8 137ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" );
bbd064a9 138if( $newt ) {
139 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
140 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
141 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
142 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
e92d4229 143 if( $language_enabled ) {
144 is( $newt->language, 'Greek', "Tradition has correct language setting" );
145 }
bbd064a9 146 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
147 ok( $rel, "Found set relationship" );
148 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
951ddfe8 149 if( $stemma_enabled ) {
150 is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
151 is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
152 }
bbd064a9 153}
154
951ddfe8 155# Test warning if we can
156unless( $stemma_enabled ) {
157 my $nst;
158 warnings_exist {
159 $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' );
160 } [qr/DROPPING stemmata/],
161 "Got expected stemma drop warning on parse";
162}
bbd064a9 163
e867486f 164=end testing
32014ec9 165
166=cut
144d845b 167use Data::Dump;
32014ec9 168sub parse {
dfc37e38 169 my( $tradition, $opts ) = @_;
2626f709 170
171 # Collation data is in the first graph; relationship-specific stuff
172 # is in the second.
173 my( $graph_data, $rel_data ) = graphml_parse( $opts );
144d845b 174
32014ec9 175 my $collation = $tradition->collation;
176 my %witnesses;
e309421a 177
0068967c 178 # print STDERR "Setting graph globals\n";
e3196b2a 179 $tradition->name( $graph_data->{'name'} );
144d845b 180
2626f709 181 my $use_version;
255875b8 182 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
183 my $val = $graph_data->{'global'}->{$gkey};
184 if( $gkey eq 'version' ) {
185 $use_version = $val;
9fef629b 186 } elsif( $gkey eq 'stemmata' ) {
951ddfe8 187 # Make sure we can handle stemmata
9fef629b 188 # Parse the stemmata into objects
37bf09f4 189 if( $tradition->can('add_stemma') ) {
951ddfe8 190 foreach my $dotstr ( split( /\n/, $val ) ) {
191 $tradition->add_stemma( 'dot' => $dotstr );
192 }
37bf09f4 193 } else {
194 warn "Analysis module not installed; DROPPING stemmata";
2a812726 195 }
9fef629b 196 } elsif( $gkey eq 'user' ) {
197 # Assign the tradition to the user if we can
198 if( exists $opts->{'userstore'} ) {
1df4baa9 199 my $userdir = delete $opts->{'userstore'};
9fef629b 200 my $user = $userdir->find_user( { username => $val } );
201 if( $user ) {
202 $user->add_tradition( $tradition );
203 } else {
204 warn( "Found no user with ID $val; DROPPING user assignment" );
205 }
206 } else {
207 warn( "DROPPING user assignment without a specified userstore" );
208 }
8943ff68 209 } elsif( $tradition->can( $gkey ) ) {
bbd064a9 210 $tradition->$gkey( $val );
8943ff68 211 } elsif( $collation->can( $gkey ) ) {
255875b8 212 $collation->$gkey( $val );
8943ff68 213 } else {
214 warn( "DROPPING unsupported attribute $gkey" );
255875b8 215 }
216 }
e309421a 217
10e4b1ac 218 # Add the nodes to the graph.
219 # Note any reading IDs that were changed in order to comply with XML
220 # name restrictions; we have to hardcode start & end.
221 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 222
bbd064a9 223 # print STDERR "Adding collation readings\n";
2626f709 224 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 225 # If it is the start or end node, we already have one, so
226 # grab the rank and go.
144d845b 227 if( defined $n->{'is_start'} ) {
a445ce40 228 $collation->start->rank($n->{'rank'});
229 next;
144d845b 230 }
bbd064a9 231 if( defined $n->{'is_end'} ) {
232 $collation->end->rank( $n->{'rank'} );
0174d6a9 233 next;
234 }
bbd064a9 235 my $gnode = $collation->add_reading( $n );
10e4b1ac 236 if( $gnode->id ne $n->{'id'} ) {
237 $namechange{$n->{'id'}} = $gnode->id;
238 }
32014ec9 239 }
910a0a6d 240
32014ec9 241 # Now add the edges.
bbd064a9 242 # print STDERR "Adding collation path edges\n";
32014ec9 243 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 244 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
245 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
246 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
247 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
248 my $from = $collation->reading( $sourceid );
249 my $to = $collation->reading( $targetid );
bbd064a9 250
251 warn "No witness label on path edge!" unless $e->{'witness'};
252 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
253 $collation->add_path( $from, $to, $label );
254
2626f709 255 # Add the witness if we don't have it already.
bbd064a9 256 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 257 $tradition->add_witness(
258 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 259 $witnesses{$e->{'witness'}} = 1;
255875b8 260 }
bbd064a9 261 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 262 }
2626f709 263
264 ## Done with the main graph, now look at the relationships.
265 # Nodes are added via the call to add_reading above. We only need
266 # add the relationships themselves.
267 # TODO check that scoping does trt
bf6e338d 268 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
269 foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
10e4b1ac 270 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
271 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
272 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
273 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
274 my $from = $collation->reading( $sourceid );
275 my $to = $collation->reading( $targetid );
bbd064a9 276 delete $e->{'source'};
277 delete $e->{'target'};
278 # The remaining keys are relationship attributes.
279 # Backward compatibility...
280 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
281 delete $e->{'class'};
282 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
283 }
284 # Add the specified relationship unless we already have done.
fdfa59a7 285 my $rel_exists;
bbd064a9 286 if( $e->{'scope'} ne 'local' ) {
287 my $relobj = $collation->get_relationship( $from, $to );
288 if( $relobj && $relobj->scope eq $e->{'scope'}
289 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 290 $rel_exists = 1;
291 }
292 }
00c5bf0b 293 try {
294 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
295 } catch( Text::Tradition::Error $e ) {
296 warn "DROPPING $from -> $to: " . $e->message;
297 }
2626f709 298 }
861c3e27 299
300 # Save the text for each witness so that we can ensure consistency
301 # later on
bbd064a9 302 $collation->text_from_paths();
32014ec9 303}
304
bf6e338d 305## Return the relationship that comes first in priority.
306my %LAYERS = (
307 'collated' => 1,
308 'orthographic' => 2,
309 'spelling' => 3,
310 );
311
312sub _layersort_rel {
313 my( $a, $b ) = @_;
314 my $key = exists $a->{'type'} ? 'type' : 'relationship';
315 my $at = $LAYERS{$a->{$key}} || 99;
316 my $bt = $LAYERS{$b->{$key}} || 99;
317 return $at <=> $bt;
318}
319
e867486f 3201;
321
322=head1 BUGS / TODO
323
324=over
325
326=item * Make this into a stream parser with GraphML
327
328=item * Simply field -> attribute correspondence for nodes and edges
329
330=item * Share key name constants with Collation.pm
331
32014ec9 332=back
333
334=head1 LICENSE
335
336This package is free software and is provided "as is" without express
337or implied warranty. You can redistribute it and/or modify it under
338the same terms as Perl itself.
339
340=head1 AUTHOR
341
e867486f 342Tara L Andrews E<lt>aurum@cpan.orgE<gt>