reading IDs must be XML names; now used in SVG node IDs
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
00c5bf0b 6use TryCatch;
32014ec9 7
8=head1 NAME
9
10Text::Tradition::Parser::GraphML
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'Self',
19 'file' => '/path/to/tradition.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Self',
25 'string' => $tradition_xml,
26 );
27
32014ec9 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 31GraphML is a relatively simple graph description language; a 'graph' element
32can have 'node' and 'edge' elements, and each of these can have simple 'data'
33elements for attributes to be saved.
32014ec9 34
e867486f 35The graph itself has attributes as in the Collation object:
36
37=over
38
39=item * linear
40
41=item * ac_label
42
43=item * baselabel
44
45=item * wit_list_separator
46
47=back
48
49The node objects have the following attributes:
32014ec9 50
51=over
52
e867486f 53=item * name
54
55=item * reading
56
57=item * identical
58
59=item * rank
60
61=item * class
62
63=back
64
65The edge objects have the following attributes:
66
67=over
68
69=item * class
70
71=item * witness (for 'path' class edges)
72
73=item * extra (for 'path' class edges)
74
75=item * relationship (for 'relationship' class edges)
76
77=item * equal_rank (for 'relationship' class edges)
32014ec9 78
e867486f 79=item * non_correctable (for 'relationship' class edges)
32014ec9 80
e867486f 81=item * non_independent (for 'relationship' class edges)
82
83=back
84
85=head1 METHODS
86
87=head2 B<parse>
88
89parse( $graph, $opts );
90
91Takes an initialized Text::Tradition object and a set of options; creates
92the appropriate nodes and edges on the graph. The options hash should
93include either a 'file' argument or a 'string' argument, depending on the
94source of the XML to be parsed.
95
96=begin testing
97
98use Text::Tradition;
99binmode STDOUT, ":utf8";
100binmode STDERR, ":utf8";
101eval { no warnings; binmode $DB::OUT, ":utf8"; };
102
103my $tradition = 't/data/florilegium_graphml.xml';
104my $t = Text::Tradition->new(
105 'name' => 'inline',
106 'input' => 'Self',
107 'file' => $tradition,
108 );
109
bbd064a9 110is( ref( $t ), 'Text::Tradition', "Parsed GraphML version 2" );
e867486f 111if( $t ) {
112 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 113 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 114 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
115}
116
bbd064a9 117# TODO add a relationship, write graphml, reparse it, check that the rel
118# is still there
119$t->language('Greek');
120$t->collation->add_relationship( 'w12', 'w13',
121 { 'type' => 'grammatical', 'scope' => 'global',
122 'annotation' => 'This is some note' } );
123ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
124my $graphml_str = $t->collation->as_graphml;
125
126my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
127is( ref( $newt ), 'Text::Tradition', "Parsed current GraphML version" );
128if( $newt ) {
129 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
130 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
131 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
132 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
133 is( $newt->language, 'Greek', "Tradition has correct language setting" );
134 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
135 ok( $rel, "Found set relationship" );
136 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
137}
138
139
e867486f 140=end testing
32014ec9 141
142=cut
143
32014ec9 144sub parse {
dfc37e38 145 my( $tradition, $opts ) = @_;
2626f709 146
147 # Collation data is in the first graph; relationship-specific stuff
148 # is in the second.
149 my( $graph_data, $rel_data ) = graphml_parse( $opts );
94c00c71 150
32014ec9 151 my $collation = $tradition->collation;
152 my %witnesses;
e309421a 153
0068967c 154 # print STDERR "Setting graph globals\n";
e3196b2a 155 $tradition->name( $graph_data->{'name'} );
2626f709 156 my $use_version;
bbd064a9 157 my $tmeta = $tradition->meta;
158 my $cmeta = $collation->meta;
255875b8 159 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
160 my $val = $graph_data->{'global'}->{$gkey};
161 if( $gkey eq 'version' ) {
162 $use_version = $val;
bbd064a9 163 } elsif( $tmeta->has_attribute( $gkey ) ) {
164 $tradition->$gkey( $val );
255875b8 165 } else {
166 $collation->$gkey( $val );
167 }
168 }
e309421a 169
10e4b1ac 170 # Add the nodes to the graph.
171 # Note any reading IDs that were changed in order to comply with XML
172 # name restrictions; we have to hardcode start & end.
173 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 174
bbd064a9 175 # print STDERR "Adding collation readings\n";
2626f709 176 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 177 # If it is the start or end node, we already have one, so
178 # grab the rank and go.
bbd064a9 179 next if( defined $n->{'is_start'} );
180 if( defined $n->{'is_end'} ) {
181 $collation->end->rank( $n->{'rank'} );
0174d6a9 182 next;
183 }
bbd064a9 184 my $gnode = $collation->add_reading( $n );
10e4b1ac 185 if( $gnode->id ne $n->{'id'} ) {
186 $namechange{$n->{'id'}} = $gnode->id;
187 }
32014ec9 188 }
910a0a6d 189
32014ec9 190 # Now add the edges.
bbd064a9 191 # print STDERR "Adding collation path edges\n";
32014ec9 192 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 193 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
194 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
195 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
196 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
197 my $from = $collation->reading( $sourceid );
198 my $to = $collation->reading( $targetid );
bbd064a9 199
200 warn "No witness label on path edge!" unless $e->{'witness'};
201 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
202 $collation->add_path( $from, $to, $label );
203
2626f709 204 # Add the witness if we don't have it already.
bbd064a9 205 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 206 $tradition->add_witness(
207 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 208 $witnesses{$e->{'witness'}} = 1;
255875b8 209 }
bbd064a9 210 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 211 }
2626f709 212
213 ## Done with the main graph, now look at the relationships.
214 # Nodes are added via the call to add_reading above. We only need
215 # add the relationships themselves.
216 # TODO check that scoping does trt
bf6e338d 217 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
218 foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
10e4b1ac 219 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
220 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
221 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
222 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
223 my $from = $collation->reading( $sourceid );
224 my $to = $collation->reading( $targetid );
bbd064a9 225 delete $e->{'source'};
226 delete $e->{'target'};
227 # The remaining keys are relationship attributes.
228 # Backward compatibility...
229 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
230 delete $e->{'class'};
231 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
232 }
233 # Add the specified relationship unless we already have done.
fdfa59a7 234 my $rel_exists;
bbd064a9 235 if( $e->{'scope'} ne 'local' ) {
236 my $relobj = $collation->get_relationship( $from, $to );
237 if( $relobj && $relobj->scope eq $e->{'scope'}
238 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 239 $rel_exists = 1;
240 }
241 }
00c5bf0b 242 try {
243 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
244 } catch( Text::Tradition::Error $e ) {
245 warn "DROPPING $from -> $to: " . $e->message;
246 }
2626f709 247 }
861c3e27 248
249 # Save the text for each witness so that we can ensure consistency
250 # later on
bbd064a9 251 $collation->text_from_paths();
32014ec9 252}
253
bf6e338d 254## Return the relationship that comes first in priority.
255my %LAYERS = (
256 'collated' => 1,
257 'orthographic' => 2,
258 'spelling' => 3,
259 );
260
261sub _layersort_rel {
262 my( $a, $b ) = @_;
263 my $key = exists $a->{'type'} ? 'type' : 'relationship';
264 my $at = $LAYERS{$a->{$key}} || 99;
265 my $bt = $LAYERS{$b->{$key}} || 99;
266 return $at <=> $bt;
267}
268
e867486f 2691;
270
271=head1 BUGS / TODO
272
273=over
274
275=item * Make this into a stream parser with GraphML
276
277=item * Simply field -> attribute correspondence for nodes and edges
278
279=item * Share key name constants with Collation.pm
280
32014ec9 281=back
282
283=head1 LICENSE
284
285This package is free software and is provided "as is" without express
286or implied warranty. You can redistribute it and/or modify it under
287the same terms as Perl itself.
288
289=head1 AUTHOR
290
e867486f 291Tara L Andrews E<lt>aurum@cpan.orgE<gt>