make test DB generation script work with new world order
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
00c5bf0b 6use TryCatch;
32014ec9 7
8=head1 NAME
9
10Text::Tradition::Parser::GraphML
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'Self',
19 'file' => '/path/to/tradition.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'Self',
25 'string' => $tradition_xml,
26 );
27
32014ec9 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 31GraphML is a relatively simple graph description language; a 'graph' element
32can have 'node' and 'edge' elements, and each of these can have simple 'data'
33elements for attributes to be saved.
32014ec9 34
e867486f 35The graph itself has attributes as in the Collation object:
36
37=over
38
39=item * linear
40
41=item * ac_label
42
43=item * baselabel
44
45=item * wit_list_separator
46
47=back
48
49The node objects have the following attributes:
32014ec9 50
51=over
52
e867486f 53=item * name
54
55=item * reading
56
57=item * identical
58
59=item * rank
60
61=item * class
62
63=back
64
65The edge objects have the following attributes:
66
67=over
68
69=item * class
70
71=item * witness (for 'path' class edges)
72
73=item * extra (for 'path' class edges)
74
75=item * relationship (for 'relationship' class edges)
76
77=item * equal_rank (for 'relationship' class edges)
32014ec9 78
e867486f 79=item * non_correctable (for 'relationship' class edges)
32014ec9 80
e867486f 81=item * non_independent (for 'relationship' class edges)
82
83=back
84
85=head1 METHODS
86
87=head2 B<parse>
88
89parse( $graph, $opts );
90
91Takes an initialized Text::Tradition object and a set of options; creates
92the appropriate nodes and edges on the graph. The options hash should
93include either a 'file' argument or a 'string' argument, depending on the
94source of the XML to be parsed.
95
96=begin testing
97
9fef629b 98use File::Temp;
951ddfe8 99use Safe::Isa;
9fef629b 100use Test::Warn;
e867486f 101use Text::Tradition;
a445ce40 102use Text::Tradition::Directory;
951ddfe8 103use TryCatch;
e867486f 104binmode STDOUT, ":utf8";
105binmode STDERR, ":utf8";
106eval { no warnings; binmode $DB::OUT, ":utf8"; };
107
108my $tradition = 't/data/florilegium_graphml.xml';
109my $t = Text::Tradition->new(
110 'name' => 'inline',
111 'input' => 'Self',
112 'file' => $tradition,
113 );
114
951ddfe8 115ok( $t->$_isa('Text::Tradition'), "Parsed GraphML version 2" );
e867486f 116if( $t ) {
117 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 118 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 119 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
120}
121
2a812726 122# TODO add a relationship, add a stemma, write graphml, reparse it, check that
123# the new data is there
bbd064a9 124$t->language('Greek');
951ddfe8 125my $stemma_enabled;
126try {
127 $stemma_enabled = $t->enable_stemmata;
128} catch {
129 ok( 1, "Skipping stemma tests without Analysis module" );
130}
131if( $stemma_enabled ) {
132 $t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
133}
bbd064a9 134$t->collation->add_relationship( 'w12', 'w13',
135 { 'type' => 'grammatical', 'scope' => 'global',
136 'annotation' => 'This is some note' } );
137ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
138my $graphml_str = $t->collation->as_graphml;
139
140my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
951ddfe8 141ok( $newt->$_isa('Text::Tradition'), "Parsed current GraphML version" );
bbd064a9 142if( $newt ) {
143 is( scalar $newt->collation->readings, 319, "Collation has all readings" );
144 is( scalar $newt->collation->paths, 376, "Collation has all paths" );
145 is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
146 is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
147 is( $newt->language, 'Greek', "Tradition has correct language setting" );
148 my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
149 ok( $rel, "Found set relationship" );
150 is( $rel->annotation, 'This is some note', "Relationship has its properties" );
951ddfe8 151 if( $stemma_enabled ) {
152 is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
153 is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
154 }
bbd064a9 155}
156
9fef629b 157# Test user save / restore
158my $fh = File::Temp->new();
159my $file = $fh->filename;
160$fh->close;
161my $dsn = "dbi:SQLite:dbname=$file";
1df4baa9 162my $userstore = Text::Tradition::Directory->new( { dsn => $dsn,
9fef629b 163 extra_args => { create => 1 } } );
164my $scope = $userstore->new_scope();
1df4baa9 165my $testuser = $userstore->create_user( { url => 'http://example.com' } );
951ddfe8 166ok( $testuser->$_isa('Text::Tradition::User'), "Created test user via userstore" );
9fef629b 167$testuser->add_tradition( $newt );
168is( $newt->user->id, $testuser->id, "Assigned tradition to test user" );
169$graphml_str = $newt->collation->as_graphml;
170my $usert;
171warning_is {
172 $usert = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
173} 'DROPPING user assignment without a specified userstore',
174 "Got expected user drop warning on parse";
175$usert = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str,
1df4baa9 176 'userstore' => $userstore );
9fef629b 177is( $usert->user->id, $testuser->id, "Parsed tradition with userstore points to correct user" );
178
951ddfe8 179# Test warning if we can
180unless( $stemma_enabled ) {
181 my $nst;
182 warnings_exist {
183 $nst = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lexformat.xml' );
184 } [qr/DROPPING stemmata/],
185 "Got expected stemma drop warning on parse";
186}
bbd064a9 187
e867486f 188=end testing
32014ec9 189
190=cut
144d845b 191use Data::Dump;
32014ec9 192sub parse {
dfc37e38 193 my( $tradition, $opts ) = @_;
2626f709 194
195 # Collation data is in the first graph; relationship-specific stuff
196 # is in the second.
197 my( $graph_data, $rel_data ) = graphml_parse( $opts );
144d845b 198
32014ec9 199 my $collation = $tradition->collation;
200 my %witnesses;
e309421a 201
0068967c 202 # print STDERR "Setting graph globals\n";
e3196b2a 203 $tradition->name( $graph_data->{'name'} );
144d845b 204
2626f709 205 my $use_version;
bbd064a9 206 my $tmeta = $tradition->meta;
207 my $cmeta = $collation->meta;
255875b8 208 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
209 my $val = $graph_data->{'global'}->{$gkey};
210 if( $gkey eq 'version' ) {
211 $use_version = $val;
9fef629b 212 } elsif( $gkey eq 'stemmata' ) {
951ddfe8 213 # Make sure we can handle stemmata
214 my $stemma_enabled;
215 try {
216 $stemma_enabled = $tradition->enable_stemmata;
217 } catch {
218 warn "Analysis module not installed; DROPPING stemmata";
219 }
9fef629b 220 # Parse the stemmata into objects
951ddfe8 221 if( $stemma_enabled ) {
222 foreach my $dotstr ( split( /\n/, $val ) ) {
223 $tradition->add_stemma( 'dot' => $dotstr );
224 }
2a812726 225 }
9fef629b 226 } elsif( $gkey eq 'user' ) {
227 # Assign the tradition to the user if we can
228 if( exists $opts->{'userstore'} ) {
1df4baa9 229 my $userdir = delete $opts->{'userstore'};
9fef629b 230 my $user = $userdir->find_user( { username => $val } );
231 if( $user ) {
232 $user->add_tradition( $tradition );
233 } else {
234 warn( "Found no user with ID $val; DROPPING user assignment" );
235 }
236 } else {
237 warn( "DROPPING user assignment without a specified userstore" );
238 }
bbd064a9 239 } elsif( $tmeta->has_attribute( $gkey ) ) {
240 $tradition->$gkey( $val );
255875b8 241 } else {
242 $collation->$gkey( $val );
243 }
244 }
e309421a 245
10e4b1ac 246 # Add the nodes to the graph.
247 # Note any reading IDs that were changed in order to comply with XML
248 # name restrictions; we have to hardcode start & end.
249 my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
32014ec9 250
bbd064a9 251 # print STDERR "Adding collation readings\n";
a445ce40 252 my $need_morphology;
2626f709 253 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 254 # If it is the start or end node, we already have one, so
255 # grab the rank and go.
144d845b 256 if( defined $n->{'is_start'} ) {
a445ce40 257 $collation->start->rank($n->{'rank'});
258 next;
144d845b 259 }
bbd064a9 260 if( defined $n->{'is_end'} ) {
261 $collation->end->rank( $n->{'rank'} );
0174d6a9 262 next;
263 }
a445ce40 264 # HACKY but no better way yet
265 # If $n has a 'lexemes' property then we will need the morphology for
266 # the whole tradition.
267 $need_morphology = 1 if exists $n->{'lexemes'};
bbd064a9 268 my $gnode = $collation->add_reading( $n );
10e4b1ac 269 if( $gnode->id ne $n->{'id'} ) {
270 $namechange{$n->{'id'}} = $gnode->id;
271 }
32014ec9 272 }
a445ce40 273 # HACK continued - if any of the readings had morphology info, we
274 # must enable it for the whole tradition. Just eval it, as we will
275 # have already been warned if the morphology extension isn't installed.
276 if( $need_morphology ) {
277 eval { $tradition->enable_morphology };
278 }
910a0a6d 279
32014ec9 280 # Now add the edges.
bbd064a9 281 # print STDERR "Adding collation path edges\n";
32014ec9 282 foreach my $e ( @{$graph_data->{'edges'}} ) {
10e4b1ac 283 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
284 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
285 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
286 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
287 my $from = $collation->reading( $sourceid );
288 my $to = $collation->reading( $targetid );
bbd064a9 289
290 warn "No witness label on path edge!" unless $e->{'witness'};
291 my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
292 $collation->add_path( $from, $to, $label );
293
2626f709 294 # Add the witness if we don't have it already.
bbd064a9 295 unless( $witnesses{$e->{'witness'}} ) {
82fa4d57 296 $tradition->add_witness(
297 sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
bbd064a9 298 $witnesses{$e->{'witness'}} = 1;
255875b8 299 }
bbd064a9 300 $tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
32014ec9 301 }
2626f709 302
303 ## Done with the main graph, now look at the relationships.
304 # Nodes are added via the call to add_reading above. We only need
305 # add the relationships themselves.
306 # TODO check that scoping does trt
bf6e338d 307 $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
308 foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
10e4b1ac 309 my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
310 ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
311 my $targetid = exists $namechange{$e->{'target'}->{'id'}}
312 ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
313 my $from = $collation->reading( $sourceid );
314 my $to = $collation->reading( $targetid );
bbd064a9 315 delete $e->{'source'};
316 delete $e->{'target'};
317 # The remaining keys are relationship attributes.
318 # Backward compatibility...
319 if( $use_version eq '2.0' || $use_version eq '3.0' ) {
320 delete $e->{'class'};
321 $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
322 }
323 # Add the specified relationship unless we already have done.
fdfa59a7 324 my $rel_exists;
bbd064a9 325 if( $e->{'scope'} ne 'local' ) {
326 my $relobj = $collation->get_relationship( $from, $to );
327 if( $relobj && $relobj->scope eq $e->{'scope'}
328 && $relobj->type eq $e->{'type'} ) {
fdfa59a7 329 $rel_exists = 1;
330 }
331 }
00c5bf0b 332 try {
333 $collation->add_relationship( $from, $to, $e ) unless $rel_exists;
334 } catch( Text::Tradition::Error $e ) {
335 warn "DROPPING $from -> $to: " . $e->message;
336 }
2626f709 337 }
861c3e27 338
339 # Save the text for each witness so that we can ensure consistency
340 # later on
bbd064a9 341 $collation->text_from_paths();
32014ec9 342}
343
bf6e338d 344## Return the relationship that comes first in priority.
345my %LAYERS = (
346 'collated' => 1,
347 'orthographic' => 2,
348 'spelling' => 3,
349 );
350
351sub _layersort_rel {
352 my( $a, $b ) = @_;
353 my $key = exists $a->{'type'} ? 'type' : 'relationship';
354 my $at = $LAYERS{$a->{$key}} || 99;
355 my $bt = $LAYERS{$b->{$key}} || 99;
356 return $at <=> $bt;
357}
358
e867486f 3591;
360
361=head1 BUGS / TODO
362
363=over
364
365=item * Make this into a stream parser with GraphML
366
367=item * Simply field -> attribute correspondence for nodes and edges
368
369=item * Share key name constants with Collation.pm
370
32014ec9 371=back
372
373=head1 LICENSE
374
375This package is free software and is provided "as is" without express
376or implied warranty. You can redistribute it and/or modify it under
377the same terms as Perl itself.
378
379=head1 AUTHOR
380
e867486f 381Tara L Andrews E<lt>aurum@cpan.orgE<gt>