split off stemma analysis modules from base Tradition layer
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
9fef629b 6use Text::Tradition::UserStore;
00c5bf0b 7use TryCatch;
32014ec9 8
9=head1 NAME
10
11Text::Tradition::Parser::GraphML
12
e867486f 13=head1 SYNOPSIS
14
15 use Text::Tradition;
16
17 my $t_from_file = Text::Tradition->new(
18 'name' => 'my text',
19 'input' => 'Self',
20 'file' => '/path/to/tradition.xml'
21 );
22
23 my $t_from_string = Text::Tradition->new(
24 'name' => 'my text',
25 'input' => 'Self',
26 'string' => $tradition_xml,
27 );
28
32014ec9 29=head1 DESCRIPTION
30
31Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 32GraphML is a relatively simple graph description language; a 'graph' element
33can have 'node' and 'edge' elements, and each of these can have simple 'data'
34elements for attributes to be saved.
32014ec9 35
e867486f 36The graph itself has attributes as in the Collation object:
37
38=over
39
40=item * linear
41
42=item * ac_label
43
44=item * baselabel
45
46=item * wit_list_separator
47
48=back
49
50The node objects have the following attributes:
32014ec9 51
52=over
53
e867486f 54=item * name
55
56=item * reading
57
58=item * identical
59
60=item * rank
61
62=item * class
63
64=back
65
66The edge objects have the following attributes:
67
68=over
69
70=item * class
71
72=item * witness (for 'path' class edges)
73
74=item * extra (for 'path' class edges)
75
76=item * relationship (for 'relationship' class edges)
77
78=item * equal_rank (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_correctable (for 'relationship' class edges)
32014ec9 81
e867486f 82=item * non_independent (for 'relationship' class edges)
83
84=back
85
86=head1 METHODS
87
88=head2 B<parse>
89
90parse( $graph, $opts );
91
92Takes an initialized Text::Tradition object and a set of options; creates
93the appropriate nodes and edges on the graph. The options hash should
94include either a 'file' argument or a 'string' argument, depending on the
95source of the XML to be parsed.
96
97=begin testing
98
9fef629b 99use File::Temp;
951ddfe8 100use Safe::Isa;
9fef629b 101use Test::Warn;
e867486f 102use Text::Tradition;
951ddfe8 103use TryCatch;
e867486f 104binmode STDOUT, ":utf8";
105binmode STDERR, ":utf8";
106eval { no warnings; binmode $DB::OUT, ":utf8"; };
107
108my $tradition = 't/data/florilegium_graphml.xml';
109my $t = Text::Tradition->new(
110 'name' => 'inline',
111 'input' => 'Self',
112 'file' => $tradition,
113 );
114
951ddfe8 115ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" );
e867486f 116if( $t ) {
117 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 118 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 119 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
120}
121
2a812726 122# TODO add a relationship, add a stemma, write graphml, reparse it, check that
123# the new data is there
bbd064a9 124$t->language('Greek');
951ddfe8 125my $stemma_enabled;
126try {
127 $stemma_enabled = $t->enable_stemmata;
128} catch {
129 ok( 1, "Skipping stemma tests without Analysis module" );
130}
131if( $stemma_enabled ) {
132 $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
133}
bbd064a9 134$t->collation->add_relationship( 'w12', 'w13',
135 { 'type' => 'grammatical', 'scope' => 'global',
136 'annotation' => 'This is some note' } );
137ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
138my $graphml_str = $t->collation->as_graphml;
139
140my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
951ddfe8 141ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" );
bbd064a9 142if( $newt ) {
143 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
144 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
145 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
146 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
147 is( $newt->language, 'Greek', "Tradition has correct language setting" );
148 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
149 ok( $rel, "Found set relationship" );
150 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
951ddfe8 151 if( $stemma_enabled ) {
152 is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
153 is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
154 }
bbd064a9 155}
156
9fef629b 157# Test user save / restore
158my $fh = File::Temp->new();
159my $file = $fh->filename;
160$fh->close;
161my $dsn = "dbi:SQLite:dbname=$file";
1df4baa9 162my $userstore = Text::Tradition::Directory->new( { dsn => $dsn,
9fef629b 163 extra_args => { create => 1 } } );
164my $scope = $userstore->new_scope();
1df4baa9 165my $testuser = $userstore->create_user( { url => 'http://example.com' } );
951ddfe8 166ok( $testuser->$_isa('Text::Tradition::User'), "Created test user via userstore" );
9fef629b 167$testuser->add_tradition( $newt );
168is( $newt->user->id, $testuser->id, "Assigned tradition to test user" );
169$graphml_str = $newt->collation->as_graphml;
170my $usert;
171warning_is {
172 $usert = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
173} 'DROPPING user assignment without a specified userstore',
174 "Got expected user drop warning on parse";
175$usert = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str,
1df4baa9 176 'userstore' => $userstore );
9fef629b 177is( $usert->user->id, $testuser->id, "Parsed tradition with userstore points to correct user" );
178
951ddfe8 179# Test warning if we can
180unless( $stemma_enabled ) {
181 my $nst;
182 warnings_exist {
183 $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' );
184 } [qr/DROPPING stemmata/],
185 "Got expected stemma drop warning on parse";
186}
bbd064a9 187
e867486f 188=end testing
32014ec9 189
190=cut
144d845b 191use Data::Dump;
32014ec9 192sub parse {
dfc37e38 193 my( $tradition, $opts ) = @_;
2626f709 194
195 # Collation data is in the first graph; relationship-specific stuff
196 # is in the second.
197 my( $graph_data, $rel_data ) = graphml_parse( $opts );
144d845b 198
32014ec9 199 my $collation = $tradition->collation;
200 my %witnesses;
e309421a 201
0068967c 202 # print STDERR "Setting graph globals\n";
e3196b2a 203 $tradition->name( $graph_data->{'name'} );
144d845b 204
2626f709 205 my $use_version;
bbd064a9 206 my $tmeta = $tradition->meta;
207 my $cmeta = $collation->meta;
255875b8 208 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
209 my $val = $graph_data->{'global'}->{$gkey};
210 if( $gkey eq 'version' ) {
211 $use_version = $val;
9fef629b 212 } elsif( $gkey eq 'stemmata' ) {
951ddfe8 213 # Make sure we can handle stemmata
214 my $stemma_enabled;
215 try {
216 $stemma_enabled = $tradition->enable_stemmata;
217 } catch {
218 warn "Analysis module not installed; DROPPING stemmata";
219 }
9fef629b 220 # Parse the stemmata into objects
951ddfe8 221 if( $stemma_enabled ) {
222 foreach my $dotstr ( split( /\n/, $val ) ) {
223 $tradition->add_stemma( 'dot' => $dotstr );
224 }
2a812726 225 }
9fef629b 226 } elsif( $gkey eq 'user' ) {
227 # Assign the tradition to the user if we can
228 if( exists $opts->{'userstore'} ) {
1df4baa9 229 my $userdir = delete $opts->{'userstore'};
9fef629b 230 my $user = $userdir->find_user( { username => $val } );
231 if( $user ) {
232 $user->add_tradition( $tradition );
233 } else {
234 warn( "Found no user with ID $val; DROPPING user assignment" );
235 }
236 } else {
237 warn( "DROPPING user assignment without a specified userstore" );
238 }
bbd064a9 239 } elsif( $tmeta->has_attribute( $gkey ) ) {
240 $tradition->$gkey( $val );
255875b8 241 } else {
242 $collation->$gkey( $val );
243 }
244 }
e309421a 245
10e4b1ac 246 # Add the nodes to the graph.
247 # Note any reading IDs that were changed in order to comply with XML
248 # name restrictions; we have to hardcode start & end.
249 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 250
bbd064a9 251 # print STDERR "Adding collation readings\n";
2626f709 252 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 253 # If it is the start or end node, we already have one, so
254 # grab the rank and go.
144d845b 255 if( defined $n->{'is_start'} ) {
f3f26624 256# warn Data::Dump::dump($n);
257# warn $collation->start->id;
144d845b 258 $collation->start->rank($n->{'rank'});
259 next;
260 }
bbd064a9 261 if( defined $n->{'is_end'} ) {
f3f26624 262# warn Data::Dump::dump($n);
bbd064a9 263 $collation->end->rank( $n->{'rank'} );
0174d6a9 264 next;
265 }
bbd064a9 266 my $gnode = $collation->add_reading( $n );
10e4b1ac 267 if( $gnode->id ne $n->{'id'} ) {
268 $namechange{$n->{'id'}} = $gnode->id;
269 }
32014ec9 270 }
910a0a6d 271
32014ec9 272 # Now add the edges.
bbd064a9 273 # print STDERR "Adding collation path edges\n";
32014ec9 274 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 275 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
276 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
277 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
278 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
279 my $from = $collation->reading( $sourceid );
280 my $to = $collation->reading( $targetid );
bbd064a9 281
282 warn "No witness label on path edge!" unless $e->{'witness'};
283 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
284 $collation->add_path( $from, $to, $label );
285
2626f709 286 # Add the witness if we don't have it already.
bbd064a9 287 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 288 $tradition->add_witness(
289 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 290 $witnesses{$e->{'witness'}} = 1;
255875b8 291 }
bbd064a9 292 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 293 }
2626f709 294
295 ## Done with the main graph, now look at the relationships.
296 # Nodes are added via the call to add_reading above. We only need
297 # add the relationships themselves.
298 # TODO check that scoping does trt
bf6e338d 299 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
300 foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
10e4b1ac 301 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
302 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
303 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
304 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
305 my $from = $collation->reading( $sourceid );
306 my $to = $collation->reading( $targetid );
bbd064a9 307 delete $e->{'source'};
308 delete $e->{'target'};
309 # The remaining keys are relationship attributes.
310 # Backward compatibility...
311 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
312 delete $e->{'class'};
313 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
314 }
315 # Add the specified relationship unless we already have done.
fdfa59a7 316 my $rel_exists;
bbd064a9 317 if( $e->{'scope'} ne 'local' ) {
318 my $relobj = $collation->get_relationship( $from, $to );
319 if( $relobj && $relobj->scope eq $e->{'scope'}
320 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 321 $rel_exists = 1;
322 }
323 }
00c5bf0b 324 try {
325 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
326 } catch( Text::Tradition::Error $e ) {
327 warn "DROPPING $from -> $to: " . $e->message;
328 }
2626f709 329 }
861c3e27 330
331 # Save the text for each witness so that we can ensure consistency
332 # later on
bbd064a9 333 $collation->text_from_paths();
32014ec9 334}
335
bf6e338d 336## Return the relationship that comes first in priority.
337my %LAYERS = (
338 'collated' => 1,
339 'orthographic' => 2,
340 'spelling' => 3,
341 );
342
343sub _layersort_rel {
344 my( $a, $b ) = @_;
345 my $key = exists $a->{'type'} ? 'type' : 'relationship';
346 my $at = $LAYERS{$a->{$key}} || 99;
347 my $bt = $LAYERS{$b->{$key}} || 99;
348 return $at <=> $bt;
349}
350
e867486f 3511;
352
353=head1 BUGS / TODO
354
355=over
356
357=item * Make this into a stream parser with GraphML
358
359=item * Simply field -> attribute correspondence for nodes and edges
360
361=item * Share key name constants with Collation.pm
362
32014ec9 363=back
364
365=head1 LICENSE
366
367This package is free software and is provided "as is" without express
368or implied warranty. You can redistribute it and/or modify it under
369the same terms as Perl itself.
370
371=head1 AUTHOR
372
e867486f 373Tara L Andrews E<lt>aurum@cpan.orgE<gt>