save any defined stemmata in GraphML
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
00c5bf0b 6use TryCatch;
32014ec9 7
8=head1 NAME
9
10Text::Tradition::Parser::GraphML
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'Self',
19 'file' => '/path/to/tradition.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Self',
25 'string' => $tradition_xml,
26 );
27
32014ec9 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 31GraphML is a relatively simple graph description language; a 'graph' element
32can have 'node' and 'edge' elements, and each of these can have simple 'data'
33elements for attributes to be saved.
32014ec9 34
e867486f 35The graph itself has attributes as in the Collation object:
36
37=over
38
39=item * linear
40
41=item * ac_label
42
43=item * baselabel
44
45=item * wit_list_separator
46
47=back
48
49The node objects have the following attributes:
32014ec9 50
51=over
52
e867486f 53=item * name
54
55=item * reading
56
57=item * identical
58
59=item * rank
60
61=item * class
62
63=back
64
65The edge objects have the following attributes:
66
67=over
68
69=item * class
70
71=item * witness (for 'path' class edges)
72
73=item * extra (for 'path' class edges)
74
75=item * relationship (for 'relationship' class edges)
76
77=item * equal_rank (for 'relationship' class edges)
32014ec9 78
e867486f 79=item * non_correctable (for 'relationship' class edges)
32014ec9 80
e867486f 81=item * non_independent (for 'relationship' class edges)
82
83=back
84
85=head1 METHODS
86
87=head2 B<parse>
88
89parse( $graph, $opts );
90
91Takes an initialized Text::Tradition object and a set of options; creates
92the appropriate nodes and edges on the graph. The options hash should
93include either a 'file' argument or a 'string' argument, depending on the
94source of the XML to be parsed.
95
96=begin testing
97
98use Text::Tradition;
99binmode STDOUT, ":utf8";
100binmode STDERR, ":utf8";
101eval { no warnings; binmode $DB::OUT, ":utf8"; };
102
103my $tradition = 't/data/florilegium_graphml.xml';
104my $t = Text::Tradition->new(
105 'name' => 'inline',
106 'input' => 'Self',
107 'file' => $tradition,
108 );
109
bbd064a9 110is( ref( $t ), 'Text::Tradition', "Parsed GraphML version 2" );
e867486f 111if( $t ) {
112 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 113 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 114 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
115}
116
2a812726 117# TODO add a relationship, add a stemma, write graphml, reparse it, check that
118# the new data is there
bbd064a9 119$t->language('Greek');
2a812726 120$t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
bbd064a9 121$t->collation->add_relationship( 'w12', 'w13',
122 { 'type' => 'grammatical', 'scope' => 'global',
123 'annotation' => 'This is some note' } );
124ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
125my $graphml_str = $t->collation->as_graphml;
126
127my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
128is( ref( $newt ), 'Text::Tradition', "Parsed current GraphML version" );
129if( $newt ) {
130 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
131 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
132 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
133 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
134 is( $newt->language, 'Greek', "Tradition has correct language setting" );
135 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
136 ok( $rel, "Found set relationship" );
137 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
2a812726 138 is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
139 is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
bbd064a9 140}
141
142
e867486f 143=end testing
32014ec9 144
145=cut
146
32014ec9 147sub parse {
dfc37e38 148 my( $tradition, $opts ) = @_;
2626f709 149
150 # Collation data is in the first graph; relationship-specific stuff
151 # is in the second.
152 my( $graph_data, $rel_data ) = graphml_parse( $opts );
94c00c71 153
32014ec9 154 my $collation = $tradition->collation;
155 my %witnesses;
e309421a 156
0068967c 157 # print STDERR "Setting graph globals\n";
e3196b2a 158 $tradition->name( $graph_data->{'name'} );
2626f709 159 my $use_version;
bbd064a9 160 my $tmeta = $tradition->meta;
161 my $cmeta = $collation->meta;
255875b8 162 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
163 my $val = $graph_data->{'global'}->{$gkey};
164 if( $gkey eq 'version' ) {
165 $use_version = $val;
2a812726 166 } elsif( $gkey eq 'stemmata' ) { # Special case, yuck
167 foreach my $dotstr ( split( /\n/, $val ) ) {
168 $tradition->add_stemma( 'dot' => $dotstr );
169 }
bbd064a9 170 } elsif( $tmeta->has_attribute( $gkey ) ) {
171 $tradition->$gkey( $val );
255875b8 172 } else {
173 $collation->$gkey( $val );
174 }
175 }
e309421a 176
10e4b1ac 177 # Add the nodes to the graph.
178 # Note any reading IDs that were changed in order to comply with XML
179 # name restrictions; we have to hardcode start & end.
180 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 181
bbd064a9 182 # print STDERR "Adding collation readings\n";
2626f709 183 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 184 # If it is the start or end node, we already have one, so
185 # grab the rank and go.
bbd064a9 186 next if( defined $n->{'is_start'} );
187 if( defined $n->{'is_end'} ) {
188 $collation->end->rank( $n->{'rank'} );
0174d6a9 189 next;
190 }
bbd064a9 191 my $gnode = $collation->add_reading( $n );
10e4b1ac 192 if( $gnode->id ne $n->{'id'} ) {
193 $namechange{$n->{'id'}} = $gnode->id;
194 }
32014ec9 195 }
910a0a6d 196
32014ec9 197 # Now add the edges.
bbd064a9 198 # print STDERR "Adding collation path edges\n";
32014ec9 199 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 200 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
201 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
202 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
203 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
204 my $from = $collation->reading( $sourceid );
205 my $to = $collation->reading( $targetid );
bbd064a9 206
207 warn "No witness label on path edge!" unless $e->{'witness'};
208 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
209 $collation->add_path( $from, $to, $label );
210
2626f709 211 # Add the witness if we don't have it already.
bbd064a9 212 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 213 $tradition->add_witness(
214 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 215 $witnesses{$e->{'witness'}} = 1;
255875b8 216 }
bbd064a9 217 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 218 }
2626f709 219
220 ## Done with the main graph, now look at the relationships.
221 # Nodes are added via the call to add_reading above. We only need
222 # add the relationships themselves.
223 # TODO check that scoping does trt
bf6e338d 224 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
225 foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
10e4b1ac 226 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
227 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
228 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
229 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
230 my $from = $collation->reading( $sourceid );
231 my $to = $collation->reading( $targetid );
bbd064a9 232 delete $e->{'source'};
233 delete $e->{'target'};
234 # The remaining keys are relationship attributes.
235 # Backward compatibility...
236 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
237 delete $e->{'class'};
238 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
239 }
240 # Add the specified relationship unless we already have done.
fdfa59a7 241 my $rel_exists;
bbd064a9 242 if( $e->{'scope'} ne 'local' ) {
243 my $relobj = $collation->get_relationship( $from, $to );
244 if( $relobj && $relobj->scope eq $e->{'scope'}
245 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 246 $rel_exists = 1;
247 }
248 }
00c5bf0b 249 try {
250 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
251 } catch( Text::Tradition::Error $e ) {
252 warn "DROPPING $from -> $to: " . $e->message;
253 }
2626f709 254 }
861c3e27 255
256 # Save the text for each witness so that we can ensure consistency
257 # later on
bbd064a9 258 $collation->text_from_paths();
32014ec9 259}
260
bf6e338d 261## Return the relationship that comes first in priority.
262my %LAYERS = (
263 'collated' => 1,
264 'orthographic' => 2,
265 'spelling' => 3,
266 );
267
268sub _layersort_rel {
269 my( $a, $b ) = @_;
270 my $key = exists $a->{'type'} ? 'type' : 'relationship';
271 my $at = $LAYERS{$a->{$key}} || 99;
272 my $bt = $LAYERS{$b->{$key}} || 99;
273 return $at <=> $bt;
274}
275
e867486f 2761;
277
278=head1 BUGS / TODO
279
280=over
281
282=item * Make this into a stream parser with GraphML
283
284=item * Simply field -> attribute correspondence for nodes and edges
285
286=item * Share key name constants with Collation.pm
287
32014ec9 288=back
289
290=head1 LICENSE
291
292This package is free software and is provided "as is" without express
293or implied warranty. You can redistribute it and/or modify it under
294the same terms as Perl itself.
295
296=head1 AUTHOR
297
e867486f 298Tara L Andrews E<lt>aurum@cpan.orgE<gt>