split out persistence / DB functionality
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
00c5bf0b 6use TryCatch;
32014ec9 7
8=head1 NAME
9
10Text::Tradition::Parser::GraphML
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'Self',
19 'file' => '/path/to/tradition.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Self',
25 'string' => $tradition_xml,
26 );
27
32014ec9 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 31GraphML is a relatively simple graph description language; a 'graph' element
32can have 'node' and 'edge' elements, and each of these can have simple 'data'
33elements for attributes to be saved.
32014ec9 34
e867486f 35The graph itself has attributes as in the Collation object:
36
37=over
38
39=item * linear
40
41=item * ac_label
42
43=item * baselabel
44
45=item * wit_list_separator
46
47=back
48
49The node objects have the following attributes:
32014ec9 50
51=over
52
e867486f 53=item * name
54
55=item * reading
56
57=item * identical
58
59=item * rank
60
61=item * class
62
63=back
64
65The edge objects have the following attributes:
66
67=over
68
69=item * class
70
71=item * witness (for 'path' class edges)
72
73=item * extra (for 'path' class edges)
74
75=item * relationship (for 'relationship' class edges)
76
77=item * equal_rank (for 'relationship' class edges)
32014ec9 78
e867486f 79=item * non_correctable (for 'relationship' class edges)
32014ec9 80
e867486f 81=item * non_independent (for 'relationship' class edges)
82
83=back
84
85=head1 METHODS
86
87=head2 B<parse>
88
89parse( $graph, $opts );
90
91Takes an initialized Text::Tradition object and a set of options; creates
92the appropriate nodes and edges on the graph. The options hash should
93include either a 'file' argument or a 'string' argument, depending on the
94source of the XML to be parsed.
95
96=begin testing
97
951ddfe8 98use Safe::Isa;
9fef629b 99use Test::Warn;
e867486f 100use Text::Tradition;
951ddfe8 101use TryCatch;
e867486f 102binmode STDOUT, ":utf8";
103binmode STDERR, ":utf8";
104eval { no warnings; binmode $DB::OUT, ":utf8"; };
105
106my $tradition = 't/data/florilegium_graphml.xml';
107my $t = Text::Tradition->new(
108 'name' => 'inline',
109 'input' => 'Self',
110 'file' => $tradition,
111 );
112
951ddfe8 113ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" );
e867486f 114if( $t ) {
115 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 116 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 117 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
118}
119
2a812726 120# TODO add a relationship, add a stemma, write graphml, reparse it, check that
121# the new data is there
e92d4229 122my $language_enabled = $t->can('language');
123if( $language_enabled ) {
124 $t->language('Greek');
125}
37bf09f4 126my $stemma_enabled = $t->can('add_stemma');
951ddfe8 127if( $stemma_enabled ) {
128 $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
129}
bbd064a9 130$t->collation->add_relationship( 'w12', 'w13',
131 { 'type' => 'grammatical', 'scope' => 'global',
132 'annotation' => 'This is some note' } );
133ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
134my $graphml_str = $t->collation->as_graphml;
135
136my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
951ddfe8 137ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" );
bbd064a9 138if( $newt ) {
139 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
140 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
141 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
142 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
e92d4229 143 if( $language_enabled ) {
144 is( $newt->language, 'Greek', "Tradition has correct language setting" );
145 }
bbd064a9 146 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
147 ok( $rel, "Found set relationship" );
148 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
951ddfe8 149 if( $stemma_enabled ) {
150 is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
151 is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
152 }
bbd064a9 153}
154
951ddfe8 155# Test warning if we can
156unless( $stemma_enabled ) {
157 my $nst;
158 warnings_exist {
159 $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' );
160 } [qr/DROPPING stemmata/],
161 "Got expected stemma drop warning on parse";
162}
bbd064a9 163
e867486f 164=end testing
32014ec9 165
166=cut
144d845b 167use Data::Dump;
32014ec9 168sub parse {
dfc37e38 169 my( $tradition, $opts ) = @_;
2626f709 170
171 # Collation data is in the first graph; relationship-specific stuff
172 # is in the second.
173 my( $graph_data, $rel_data ) = graphml_parse( $opts );
144d845b 174
32014ec9 175 my $collation = $tradition->collation;
176 my %witnesses;
e309421a 177
0068967c 178 # print STDERR "Setting graph globals\n";
e3196b2a 179 $tradition->name( $graph_data->{'name'} );
144d845b 180
2626f709 181 my $use_version;
bbd064a9 182 my $tmeta = $tradition->meta;
183 my $cmeta = $collation->meta;
255875b8 184 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
185 my $val = $graph_data->{'global'}->{$gkey};
186 if( $gkey eq 'version' ) {
187 $use_version = $val;
9fef629b 188 } elsif( $gkey eq 'stemmata' ) {
951ddfe8 189 # Make sure we can handle stemmata
9fef629b 190 # Parse the stemmata into objects
37bf09f4 191 if( $tradition->can('add_stemma') ) {
951ddfe8 192 foreach my $dotstr ( split( /\n/, $val ) ) {
193 $tradition->add_stemma( 'dot' => $dotstr );
194 }
37bf09f4 195 } else {
196 warn "Analysis module not installed; DROPPING stemmata";
2a812726 197 }
e92d4229 198 } elsif( $gkey eq 'language' ) {
199 if( $tradition->can('language') ) {
200 $tradition->language( $val );
201 } else {
202 warn "Morphology module not installed; DROPPING language";
203 }
9fef629b 204 } elsif( $gkey eq 'user' ) {
205 # Assign the tradition to the user if we can
206 if( exists $opts->{'userstore'} ) {
1df4baa9 207 my $userdir = delete $opts->{'userstore'};
9fef629b 208 my $user = $userdir->find_user( { username => $val } );
209 if( $user ) {
210 $user->add_tradition( $tradition );
211 } else {
212 warn( "Found no user with ID $val; DROPPING user assignment" );
213 }
214 } else {
215 warn( "DROPPING user assignment without a specified userstore" );
216 }
bbd064a9 217 } elsif( $tmeta->has_attribute( $gkey ) ) {
218 $tradition->$gkey( $val );
255875b8 219 } else {
220 $collation->$gkey( $val );
221 }
222 }
e309421a 223
10e4b1ac 224 # Add the nodes to the graph.
225 # Note any reading IDs that were changed in order to comply with XML
226 # name restrictions; we have to hardcode start & end.
227 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 228
bbd064a9 229 # print STDERR "Adding collation readings\n";
2626f709 230 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 231 # If it is the start or end node, we already have one, so
232 # grab the rank and go.
144d845b 233 if( defined $n->{'is_start'} ) {
a445ce40 234 $collation->start->rank($n->{'rank'});
235 next;
144d845b 236 }
bbd064a9 237 if( defined $n->{'is_end'} ) {
238 $collation->end->rank( $n->{'rank'} );
0174d6a9 239 next;
240 }
bbd064a9 241 my $gnode = $collation->add_reading( $n );
10e4b1ac 242 if( $gnode->id ne $n->{'id'} ) {
243 $namechange{$n->{'id'}} = $gnode->id;
244 }
32014ec9 245 }
910a0a6d 246
32014ec9 247 # Now add the edges.
bbd064a9 248 # print STDERR "Adding collation path edges\n";
32014ec9 249 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 250 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
251 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
252 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
253 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
254 my $from = $collation->reading( $sourceid );
255 my $to = $collation->reading( $targetid );
bbd064a9 256
257 warn "No witness label on path edge!" unless $e->{'witness'};
258 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
259 $collation->add_path( $from, $to, $label );
260
2626f709 261 # Add the witness if we don't have it already.
bbd064a9 262 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 263 $tradition->add_witness(
264 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 265 $witnesses{$e->{'witness'}} = 1;
255875b8 266 }
bbd064a9 267 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 268 }
2626f709 269
270 ## Done with the main graph, now look at the relationships.
271 # Nodes are added via the call to add_reading above. We only need
272 # add the relationships themselves.
273 # TODO check that scoping does trt
bf6e338d 274 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
275 foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
10e4b1ac 276 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
277 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
278 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
279 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
280 my $from = $collation->reading( $sourceid );
281 my $to = $collation->reading( $targetid );
bbd064a9 282 delete $e->{'source'};
283 delete $e->{'target'};
284 # The remaining keys are relationship attributes.
285 # Backward compatibility...
286 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
287 delete $e->{'class'};
288 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
289 }
290 # Add the specified relationship unless we already have done.
fdfa59a7 291 my $rel_exists;
bbd064a9 292 if( $e->{'scope'} ne 'local' ) {
293 my $relobj = $collation->get_relationship( $from, $to );
294 if( $relobj && $relobj->scope eq $e->{'scope'}
295 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 296 $rel_exists = 1;
297 }
298 }
00c5bf0b 299 try {
300 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
301 } catch( Text::Tradition::Error $e ) {
302 warn "DROPPING $from -> $to: " . $e->message;
303 }
2626f709 304 }
861c3e27 305
306 # Save the text for each witness so that we can ensure consistency
307 # later on
bbd064a9 308 $collation->text_from_paths();
32014ec9 309}
310
bf6e338d 311## Return the relationship that comes first in priority.
312my %LAYERS = (
313 'collated' => 1,
314 'orthographic' => 2,
315 'spelling' => 3,
316 );
317
318sub _layersort_rel {
319 my( $a, $b ) = @_;
320 my $key = exists $a->{'type'} ? 'type' : 'relationship';
321 my $at = $LAYERS{$a->{$key}} || 99;
322 my $bt = $LAYERS{$b->{$key}} || 99;
323 return $at <=> $bt;
324}
325
e867486f 3261;
327
328=head1 BUGS / TODO
329
330=over
331
332=item * Make this into a stream parser with GraphML
333
334=item * Simply field -> attribute correspondence for nodes and edges
335
336=item * Share key name constants with Collation.pm
337
32014ec9 338=back
339
340=head1 LICENSE
341
342This package is free software and is provided "as is" without express
343or implied warranty. You can redistribute it and/or modify it under
344the same terms as Perl itself.
345
346=head1 AUTHOR
347
e867486f 348Tara L Andrews E<lt>aurum@cpan.orgE<gt>