revert untested changes that were causing a segfault
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / RelationshipStore.pm
1 package Text::Tradition::Collation::RelationshipStore;
2
3 use strict;
4 use warnings;
5 use Text::Tradition::Error;
6 use Text::Tradition::Collation::Relationship;
7 use TryCatch;
8
9 use Moose;
10
11 =head1 NAME
12
13 Text::Tradition::Collation::RelationshipStore - Keeps track of the relationships
14 between readings in a given collation
15     
16 =head1 DESCRIPTION
17
18 Text::Tradition is a library for representation and analysis of collated
19 texts, particularly medieval ones.  The RelationshipStore is an internal object
20 of the collation, to keep track of the defined relationships (both specific and
21 general) between readings.
22
23 =begin testing
24
25 use Text::Tradition;
26 use TryCatch;
27
28 use_ok( 'Text::Tradition::Collation::RelationshipStore' );
29
30 # Add some relationships, and delete them
31
32 my $cxfile = 't/data/Collatex-16.xml';
33 my $t = Text::Tradition->new( 
34     'name'  => 'inline', 
35     'input' => 'CollateX',
36     'file'  => $cxfile,
37     );
38 my $c = $t->collation;
39
40 my @v1 = $c->add_relationship( 'n21', 'n22', { 'type' => 'lexical' } );
41 is( scalar @v1, 1, "Added a single relationship" );
42 is( $v1[0]->[0], 'n21', "Got correct node 1" );
43 is( $v1[0]->[1], 'n22', "Got correct node 2" );
44 my @v2 = $c->add_relationship( 'n24', 'n23', 
45         { 'type' => 'spelling', 'scope' => 'global' } );
46 is( scalar @v2, 2, "Added a global relationship with two instances" );
47 @v1 = $c->del_relationship( 'n22', 'n21' );
48 is( scalar @v1, 1, "Deleted first relationship" );
49 @v2 = $c->del_relationship( 'n12', 'n13' );
50 is( scalar @v2, 2, "Deleted second global relationship" );
51 my @v3 = $c->del_relationship( 'n1', 'n2' );
52 is( scalar @v3, 0, "Nothing deleted on non-existent relationship" );
53
54 =end testing
55
56 =head1 METHODS
57
58 =head2 new( collation => $collation );
59
60 Creates a new relationship store for the given collation.
61
62 =cut
63
64 has 'collation' => (
65         is => 'ro',
66         isa => 'Text::Tradition::Collation',
67         required => 1,
68         weak_ref => 1,
69         );
70
71 has 'scopedrels' => (
72         is => 'ro',
73         isa => 'HashRef[HashRef[Text::Tradition::Collation::Relationship]]',
74         default => sub { {} },
75         );
76
77 has 'graph' => (
78         is => 'ro',
79         isa => 'Graph',
80         default => sub { Graph->new( undirected => 1 ) },
81     handles => {
82         relationships => 'edges',
83         add_reading => 'add_vertex',
84         delete_reading => 'delete_vertex',
85     },
86         );
87         
88 =head2 get_relationship
89
90 Return the relationship object, if any, that exists between two readings.
91
92 =cut
93
94 sub get_relationship {
95         my $self = shift;
96         my @vector;
97         if( @_ == 1 && ref( $_[0] ) eq 'ARRAY' ) {
98                 # Dereference the edge arrayref that was passed.
99                 my $edge = shift;
100                 @vector = @$edge;
101         } else {
102                 @vector = @_;
103         }
104         my $relationship;
105         if( $self->graph->has_edge_attribute( @vector, 'object' ) ) {
106                 $relationship = $self->graph->get_edge_attribute( @vector, 'object' );
107         } 
108         return $relationship;
109 }
110
111 sub _set_relationship {
112         my( $self, $relationship, @vector ) = @_;
113         $self->graph->add_edge( @vector );
114         $self->graph->set_edge_attribute( @vector, 'object', $relationship );
115 }
116
117 =head2 create
118
119 Create a new relationship with the given options and return it.
120 Warn and return undef if the relationship cannot be created.
121
122 =cut
123
124 sub create {
125         my( $self, $options ) = @_;
126         # Check to see if a relationship exists between the two given readings
127         my $source = delete $options->{'orig_a'};
128         my $target = delete $options->{'orig_b'};
129         my $rel = $self->get_relationship( $source, $target );
130         if( $rel ) {
131                 if( $rel->type eq 'collated' ) {
132                         # Always replace a 'collated' relationship with a more descriptive
133                         # one, if asked.
134                         $self->del_relationship( $source, $target );
135                 } elsif( $rel->type ne $options->{'type'} ) {
136                         throw( "Another relationship of type " . $rel->type 
137                                 . " already exists between $source and $target" );
138                 } else {
139                         return $rel;
140                 }
141         }
142         
143         # Check to see if a nonlocal relationship is defined for the two readings
144         $rel = $self->scoped_relationship( $options->{'reading_a'}, 
145                 $options->{'reading_b'} );
146         if( $rel && $rel->type eq $options->{'type'} ) {
147                 return $rel;
148         } elsif( $rel ) {
149                 throw( sprintf( "Relationship of type %s with scope %s already defined for readings %s and %s", $rel->type, $rel->scope, $options->{'reading_a'}, $options->{'reading_b'} ) );
150         } else {
151                 $rel = Text::Tradition::Collation::Relationship->new( $options );
152                 $self->add_scoped_relationship( $rel ) if $rel->nonlocal;
153                 return $rel;
154         }
155 }
156
157 =head2 add_scoped_relationship( $rel )
158
159 Keep track of relationships defined between specific readings that are scoped
160 non-locally.  Key on whichever reading occurs first alphabetically.
161
162 =cut
163
164 sub add_scoped_relationship {
165         my( $self, $rel ) = @_;
166         my $rdga = $rel->type eq 'orthographic' ? $rel->reading_a : lc( $rel->reading_a );
167         my $rdgb = $rel->type eq 'orthographic' ? $rel->reading_b : lc( $rel->reading_b );      
168         my $r = $self->scoped_relationship( $rdga, $rdgb );
169         if( $r ) {
170                 warn sprintf( "Scoped relationship of type %s already exists between %s and %s",
171                         $r->type, $rdga, $rdgb );
172                 return;
173         }
174         my( $first, $second ) = sort ( $rdga, $rdgb );
175         $self->scopedrels->{$first}->{$second} = $rel;
176 }
177
178 =head2 scoped_relationship( $reading_a, $reading_b )
179
180 Returns the general (document-level or global) relationship that has been defined 
181 between the two reading strings. Returns undef if there is no general relationship.
182
183 =cut
184
185 sub scoped_relationship {
186         my( $self, $rdga, $rdgb ) = @_;
187         my( $first, $second ) = sort( $rdga, $rdgb );
188         if( exists $self->scopedrels->{$first}->{$second} ) {
189                 return $self->scopedrels->{$first}->{$second};
190         } else {
191                 return undef;
192         }
193 }
194
195 =head2 add_relationship( $self, $source, $sourcetext, $target, $targettext, $opts )
196
197 Adds the relationship specified in $opts (see Text::Tradition::Collation::Relationship 
198 for the possible options) between the readings given in $source and $target.  Sets
199 up a scoped relationship between $sourcetext and $targettext if the relationship is
200 scoped non-locally.
201
202 Returns a status boolean and a list of all reading pairs connected by the call to
203 add_relationship.
204
205 =cut
206
207 sub add_relationship {
208         my( $self, $source, $source_rdg, $target, $target_rdg, $options ) = @_;
209
210         my $relationship;
211         my $thispaironly;
212         if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) {
213                 $relationship = $options;
214                 $thispaironly = 1;  # If existing rel, set only where asked.
215         } else {
216                 # Check the options
217                 $options->{'scope'} = 'local' unless $options->{'scope'};
218                 $options->{'scope'} = 'local' if $options->{'type'} eq 'collated';
219                 $options->{'scope'} = 'local' if $options->{'type'} eq 'transposition';
220                 
221                 my( $is_valid, $reason ) = 
222                         $self->relationship_valid( $source, $target, $options->{'type'} );
223                 unless( $is_valid ) {
224                         throw( "Invalid relationship: $reason" );
225                 }
226                 
227                 # Try to create the relationship object.
228                 $options->{'reading_a'} = $source_rdg->text;
229                 $options->{'reading_b'} = $target_rdg->text;
230                 $options->{'orig_a'} = $source;
231                 $options->{'orig_b'} = $target;
232         if( $options->{'scope'} ne 'local' ) {
233                         # Is there a relationship with this a & b already?
234                         # Case-insensitive for non-orthographics.
235                         my $rdga = $options->{'type'} eq 'orthographic' 
236                                 ? $options->{'reading_a'} : lc( $options->{'reading_a'} );
237                         my $rdgb = $options->{'type'} eq 'orthographic' 
238                                 ? $options->{'reading_b'} : lc( $options->{'reading_b'} );
239                         my $otherrel = $self->scoped_relationship( $rdga, $rdgb );
240                         if( $otherrel && $otherrel->type eq $options->{type}
241                                 && $otherrel->scope eq $options->{scope} ) {
242                                 warn "Applying existing scoped relationship";
243                                 $relationship = $otherrel;
244                         }
245         }
246                 $relationship = $self->create( $options ) unless $relationship;  # Will throw on error
247     }
248
249
250         # Find all the pairs for which we need to set the relationship.
251         my @vectors = [ $source, $target ];
252     if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
253         push( @vectors, $self->_find_applicable( $relationship ) );
254     }
255         
256     # Now set the relationship(s).
257     my @pairs_set;
258     foreach my $v ( @vectors ) {
259                 my $rel = $self->get_relationship( @$v );
260         if( $rel && $rel ne $relationship ) {
261                 if( $rel->nonlocal ) {
262                         throw( "Found conflicting relationship at @$v" );
263                 } elsif( $rel->type ne 'collated' ) {
264                         # Replace a collation relationship; leave any other sort in place.
265                         my $r1ann = $rel->has_annotation ? $rel->annotation : '';
266                         my $r2ann = $relationship->has_annotation ? $relationship->annotation : '';
267                         unless( $rel->type eq $relationship->type && $r1ann eq $r2ann ) {
268                                         warn sprintf( "Not overriding local relationship %s with global %s " 
269                                                 . "set at %s -> %s (%s -> %s)", $rel->type, $relationship->type,
270                                                 @$v, $rel->reading_a, $rel->reading_b );
271                                         next;
272                                 }
273                 }
274         }
275         map { $self->_drop_collations( $_ ) } @$v;
276         $self->_set_relationship( $relationship, @$v );
277         push( @pairs_set, $v );
278     }
279     
280     return @pairs_set;
281 }
282
283 =head2 del_scoped_relationship( $reading_a, $reading_b )
284
285 Returns the general (document-level or global) relationship that has been defined 
286 between the two reading strings. Returns undef if there is no general relationship.
287
288 =cut
289
290 sub del_scoped_relationship {
291         my( $self, $rdga, $rdgb ) = @_;
292         my( $first, $second ) = sort( $rdga, $rdgb );
293         return delete $self->scopedrels->{$first}->{$second};
294 }
295
296 sub _find_applicable {
297         my( $self, $rel ) = @_;
298         my $c = $self->collation;
299         # TODO Someday we might use a case sensitive language.
300         my $lang = $c->tradition->language;
301         my @vectors;
302         my @identical_readings;
303         if( $rel->type eq 'orthographic' ) {
304                 @identical_readings = grep { $_->text eq $rel->reading_a } 
305                         $c->readings;
306         } else {
307                 @identical_readings = grep { lc( $_->text ) eq lc( $rel->reading_a ) }
308                         $c->readings;
309         }
310         foreach my $ir ( @identical_readings ) {
311                 my @itarget;
312                 if( $rel->type eq 'orthographic' ) {
313                         @itarget = grep { $_->rank == $ir->rank 
314                                                           && $_->text eq $rel->reading_b } $c->readings;
315                 } else {
316                         @itarget = grep { $_->rank == $ir->rank 
317                                                           && lc( $_->text ) eq lc( $rel->reading_b ) } $c->readings;
318                 }
319                 if( @itarget ) {
320                         # Warn if there is more than one hit with no orth link between them.
321                         my $itmain = shift @itarget;
322                         if( @itarget ) {
323                                 my %all_targets;
324                                 map { $all_targets{$_} = 1 } @itarget;
325                                 map { delete $all_targets{$_} } 
326                                         $self->related_readings( $itmain, 
327                                                 sub { $_[0]->type eq 'orthographic' } );
328                         warn "More than one unrelated reading with text " . $itmain->text
329                                 . " at rank " . $ir->rank . "!" if keys %all_targets;
330                         }
331                         push( @vectors, [ $ir->id, $itmain->id ] );
332                 }
333         }
334         return @vectors;
335 }
336
337 =head2 del_relationship( $source, $target )
338
339 Removes the relationship between the given readings. If the relationship is
340 non-local, removes the relationship everywhere in the graph.
341
342 =cut
343
344 sub del_relationship {
345         my( $self, $source, $target ) = @_;
346         my $rel = $self->get_relationship( $source, $target );
347         return () unless $rel; # Nothing to delete; return an empty set.
348         my @vectors = ( [ $source, $target ] );
349         $self->_remove_relationship( $source, $target );
350         if( $rel->nonlocal ) {
351                 # Remove the relationship wherever it occurs.
352                 # Remove the relationship wherever it occurs.
353                 my @rel_edges = grep { $self->get_relationship( @$_ ) == $rel }
354                         $self->relationships;
355                 foreach my $re ( @rel_edges ) {
356                         $self->_remove_relationship( @$re );
357                         push( @vectors, $re );
358                 }
359                 $self->del_scoped_relationship( $rel->reading_a, $rel->reading_b );
360         }
361         return @vectors;
362 }
363
364 sub _remove_relationship {
365         my( $self, @vector ) = @_;
366         $self->graph->delete_edge( @vector );
367 }
368         
369 =head2 relationship_valid( $source, $target, $type )
370
371 Checks whether a relationship of type $type may exist between the readings given
372 in $source and $target.  Returns a tuple of ( status, message ) where status is
373 a yes/no boolean and, if the answer is no, message gives the reason why.
374
375 =cut
376
377 sub relationship_valid {
378     my( $self, $source, $target, $rel ) = @_;
379     my $c = $self->collation;
380     if ( $rel eq 'transposition' || $rel eq 'repetition' ) {
381                 # Check that the two readings do (for a repetition) or do not (for
382                 # a transposition) appear in the same witness.
383                 # TODO this might be called before witness paths are set...
384                 my %seen_wits;
385                 map { $seen_wits{$_} = 1 } $c->reading_witnesses( $source );
386                 foreach my $w ( $c->reading_witnesses( $target ) ) {
387                         if( $seen_wits{$w} ) {
388                                 return ( 0, "Readings both occur in witness $w" ) 
389                                         if $rel eq 'transposition';
390                                 return ( 1, "ok" ) if $rel eq 'repetition';
391                         }
392                 }
393                 return $rel eq 'transposition' ? ( 1, "ok" )
394                         : ( 0, "Readings occur only in distinct witnesses" );
395         } else {
396                 # Check that linking the source and target in a relationship won't lead
397                 # to a path loop for any witness.  If they have the same rank then fine.
398                 return( 1, "ok" ) 
399                         if $c->reading( $source )->has_rank
400                                 && $c->reading( $target )->has_rank
401                                 && $c->reading( $source )->rank == $c->reading( $target )->rank;
402                 
403                 # Otherwise, first make a lookup table of all the
404                 # readings related to either the source or the target.
405                 my @proposed_related = ( $source, $target );
406                 # Drop the collation links of source and target, unless we want to
407                 # add a collation relationship.
408                 foreach my $r ( ( $source, $target ) ) {
409                         $self->_drop_collations( $r ) unless $rel eq 'collated';
410                         push( @proposed_related, $self->related_readings( $r, 'colocated' ) );
411                 }
412                 my %pr_ids;
413                 map { $pr_ids{ $_ } = 1 } @proposed_related;
414         
415                 # The cumulative predecessors and successors of the proposed-related readings
416                 # should not overlap.
417                 my %all_pred;
418                 my %all_succ;
419                 foreach my $pr ( keys %pr_ids ) {
420                         map { $all_pred{$_} = 1 } $c->sequence->all_predecessors( $pr );
421                         map { $all_succ{$_} = 1 } $c->sequence->all_successors( $pr );
422                 }
423                 foreach my $k ( keys %all_pred ) {
424                         return( 0, "Relationship would create witness loop" )
425                                 if exists $all_succ{$k};
426                 }
427                 foreach my $k ( keys %pr_ids ) {
428                         return( 0, "Relationship would create witness loop" )
429                                 if exists $all_pred{$k} || exists $all_succ{$k};
430                 }
431                 return ( 1, "ok" );
432         }
433 }
434
435 sub _drop_collations {
436         my( $self, $reading ) = @_;
437         foreach my $n ( $self->graph->neighbors( $reading ) ) {
438                 if( $self->get_relationship( $reading, $n )->type eq 'collated' ) {
439                         $self->del_relationship( $reading, $n );
440                 }
441         }
442 }
443
444 =head2 related_readings( $reading, $filter )
445
446 Returns a list of readings that are connected via relationship links to $reading.
447 If $filter is set to a subroutine ref, returns only those related readings where
448 $filter( $relationship ) returns a true value.
449
450 =cut
451
452 sub related_readings {
453         my( $self, $reading, $filter ) = @_;
454         my $return_object;
455         if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
456                 $reading = $reading->id;
457                 $return_object = 1;
458         }
459         my @answer;
460         if( $filter ) {
461                 # Backwards compat
462                 if( $filter eq 'colocated' ) {
463                         $filter = sub { $_[0]->colocated };
464                 }
465                 my %found = ( $reading => 1 );
466                 my $check = [ $reading ];
467                 my $iter = 0;
468                 while( @$check ) {
469                         my $more = [];
470                         foreach my $r ( @$check ) {
471                                 foreach my $nr ( $self->graph->neighbors( $r ) ) {
472                                         if( &$filter( $self->get_relationship( $r, $nr ) ) ) {
473                                                 push( @$more, $nr ) unless exists $found{$nr};
474                                                 $found{$nr} = 1;
475                                         }
476                                 }
477                         }
478                         $check = $more;
479                 }
480                 delete $found{$reading};
481                 @answer = keys %found;
482         } else {
483                 @answer = $self->graph->all_reachable( $reading );
484         }
485         if( $return_object ) {
486                 my $c = $self->collation;
487                 return map { $c->reading( $_ ) } @answer;
488         } else {
489                 return @answer;
490         }
491 }
492
493 =head2 merge_readings( $kept, $deleted );
494
495 Makes a best-effort merge of the relationship links between the given readings, and
496 stops tracking the to-be-deleted reading.
497
498 =cut
499
500 sub merge_readings {
501         my( $self, $kept, $deleted, $combined ) = @_;
502         foreach my $edge ( $self->graph->edges_at( $deleted ) ) {
503                 # Get the pair of kept / rel
504                 my @vector = ( $kept );
505                 push( @vector, $edge->[0] eq $deleted ? $edge->[1] : $edge->[0] );
506                 next if $vector[0] eq $vector[1]; # Don't add a self loop
507                 
508                 # If kept changes its text, drop the relationship.
509                 next if $combined;
510                         
511                 # If kept / rel already has a relationship, just keep the old
512                 my $rel = $self->get_relationship( @vector );
513                 next if $rel;
514                 
515                 # Otherwise, adopt the relationship that would be deleted.
516                 $rel = $self->get_relationship( @$edge );
517                 $self->_set_relationship( $rel, @vector );
518         }
519         $self->delete_reading( $deleted );
520 }
521
522 sub _as_graphml { 
523         my( $self, $graphml_ns, $xmlroot, $node_hash, $nodeid_key, $edge_keys ) = @_;
524         
525     my $rgraph = $xmlroot->addNewChild( $graphml_ns, 'graph' );
526         $rgraph->setAttribute( 'edgedefault', 'directed' );
527     $rgraph->setAttribute( 'id', 'relationships', );
528     $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
529     $rgraph->setAttribute( 'parse.edges', scalar($self->graph->edges) );
530     $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
531     $rgraph->setAttribute( 'parse.nodes', scalar($self->graph->vertices) );
532     $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
533     
534     # Add the vertices according to their XML IDs
535     my %rdg_lookup = ( reverse %$node_hash );
536     foreach my $n ( sort _by_xmlid keys( %rdg_lookup ) ) {
537         my $n_el = $rgraph->addNewChild( $graphml_ns, 'node' );
538         $n_el->setAttribute( 'id', $n );
539         _add_graphml_data( $n_el, $nodeid_key, $rdg_lookup{$n} );
540     }
541     
542     # Add the relationship edges, with their object information
543     my $edge_ctr = 0;
544     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->graph->edges ) {
545         # Add an edge and fill in its relationship info.
546         next unless( exists $node_hash->{$e->[0]} && exists $node_hash->{$e->[1]} );
547                 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
548                 $edge_el->setAttribute( 'source', $node_hash->{$e->[0]} );
549                 $edge_el->setAttribute( 'target', $node_hash->{$e->[1]} );
550                 $edge_el->setAttribute( 'id', 'e'.$edge_ctr++ );
551
552                 my $rel_obj = $self->get_relationship( @$e );
553                 foreach my $key ( keys %$edge_keys ) {
554                         my $value = $rel_obj->$key;
555                         _add_graphml_data( $edge_el, $edge_keys->{$key}, $value ) 
556                                 if defined $value;
557                 }
558         }
559 }
560
561 sub _by_xmlid {
562         my $tmp_a = $a;
563         my $tmp_b = $b;
564         $tmp_a =~ s/\D//g;
565         $tmp_b =~ s/\D//g;
566         return $tmp_a <=> $tmp_b;
567 }
568
569 sub _add_graphml_data {
570     my( $el, $key, $value ) = @_;
571     return unless defined $value;
572     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
573     $data_el->setAttribute( 'key', $key );
574     $data_el->appendText( $value );
575 }
576
577 sub throw {
578         Text::Tradition::Error->throw( 
579                 'ident' => 'Relationship error',
580                 'message' => $_[0],
581                 );
582 }
583
584 no Moose;
585 __PACKAGE__->meta->make_immutable;
586
587 1;