1e3af41461c2958bf21873a5e3f3969596c59d0f
[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation / RelationshipStore.pm
1 package Text::Tradition::Collation::RelationshipStore;
2
3 use strict;
4 use warnings;
5 use Text::Tradition::Error;
6 use Text::Tradition::Collation::Relationship;
7 use TryCatch;
8
9 use Moose;
10
11 =head1 NAME
12
13 Text::Tradition::Collation::RelationshipStore - Keeps track of the relationships
14 between readings in a given collation
15     
16 =head1 DESCRIPTION
17
18 Text::Tradition is a library for representation and analysis of collated
19 texts, particularly medieval ones.  The RelationshipStore is an internal object
20 of the collation, to keep track of the defined relationships (both specific and
21 general) between readings.
22
23 =begin testing
24
25 use Text::Tradition;
26 use TryCatch;
27
28 use_ok( 'Text::Tradition::Collation::RelationshipStore' );
29
30 # Add some relationships, and delete them
31
32 my $cxfile = 't/data/Collatex-16.xml';
33 my $t = Text::Tradition->new( 
34     'name'  => 'inline', 
35     'input' => 'CollateX',
36     'file'  => $cxfile,
37     );
38 my $c = $t->collation;
39
40 my @v1 = $c->add_relationship( 'n21', 'n22', { 'type' => 'lexical' } );
41 is( scalar @v1, 1, "Added a single relationship" );
42 is( $v1[0]->[0], 'n21', "Got correct node 1" );
43 is( $v1[0]->[1], 'n22', "Got correct node 2" );
44 my @v2 = $c->add_relationship( 'n24', 'n23', 
45         { 'type' => 'spelling', 'scope' => 'global' } );
46 is( scalar @v2, 2, "Added a global relationship with two instances" );
47 @v1 = $c->del_relationship( 'n22', 'n21' );
48 is( scalar @v1, 1, "Deleted first relationship" );
49 @v2 = $c->del_relationship( 'n12', 'n13' );
50 is( scalar @v2, 2, "Deleted second global relationship" );
51 my @v3 = $c->del_relationship( 'n1', 'n2' );
52 is( scalar @v3, 0, "Nothing deleted on non-existent relationship" );
53
54 =end testing
55
56 =head1 METHODS
57
58 =head2 new( collation => $collation );
59
60 Creates a new relationship store for the given collation.
61
62 =cut
63
64 has 'collation' => (
65         is => 'ro',
66         isa => 'Text::Tradition::Collation',
67         required => 1,
68         weak_ref => 1,
69         );
70
71 has 'scopedrels' => (
72         is => 'ro',
73         isa => 'HashRef[HashRef[Text::Tradition::Collation::Relationship]]',
74         default => sub { {} },
75         );
76
77 has 'graph' => (
78         is => 'ro',
79         isa => 'Graph',
80         default => sub { Graph->new( undirected => 1 ) },
81     handles => {
82         relationships => 'edges',
83         add_reading => 'add_vertex',
84         delete_reading => 'delete_vertex',
85     },
86         );
87         
88 =head2 get_relationship
89
90 Return the relationship object, if any, that exists between two readings.
91
92 =cut
93
94 sub get_relationship {
95         my $self = shift;
96         my @vector;
97         if( @_ == 1 && ref( $_[0] ) eq 'ARRAY' ) {
98                 # Dereference the edge arrayref that was passed.
99                 my $edge = shift;
100                 @vector = @$edge;
101         } else {
102                 @vector = @_;
103         }
104         my $relationship;
105         if( $self->graph->has_edge_attribute( @vector, 'object' ) ) {
106                 $relationship = $self->graph->get_edge_attribute( @vector, 'object' );
107         } 
108         return $relationship;
109 }
110
111 sub _set_relationship {
112         my( $self, $relationship, @vector ) = @_;
113         $self->graph->add_edge( @vector );
114         $self->graph->set_edge_attribute( @vector, 'object', $relationship );
115 }
116
117 =head2 create
118
119 Create a new relationship with the given options and return it.
120 Warn and return undef if the relationship cannot be created.
121
122 =cut
123
124 sub create {
125         my( $self, $options ) = @_;
126         # Check to see if a relationship exists between the two given readings
127         my $source = delete $options->{'orig_a'};
128         my $target = delete $options->{'orig_b'};
129         my $rel = $self->get_relationship( $source, $target );
130         if( $rel ) {
131                 if( $rel->type eq 'collated' ) {
132                         # Always replace a 'collated' relationship with a more descriptive
133                         # one, if asked.
134                         $self->del_relationship( $source, $target );
135                 } elsif( $rel->type ne $options->{'type'} ) {
136                         throw( "Another relationship of type " . $rel->type 
137                                 . " already exists between $source and $target" );
138                 } else {
139                         return $rel;
140                 }
141         }
142         
143         # Check to see if a nonlocal relationship is defined for the two readings
144         $rel = $self->scoped_relationship( $options->{'reading_a'}, 
145                 $options->{'reading_b'} );
146         if( $rel && $rel->type eq $options->{'type'} ) {
147                 return $rel;
148         } elsif( $rel ) {
149                 throw( sprintf( "Relationship of type %s with scope %s already defined for readings %s and %s", $rel->type, $rel->scope, $options->{'reading_a'}, $options->{'reading_b'} ) );
150         } else {
151                 $rel = Text::Tradition::Collation::Relationship->new( $options );
152                 $self->add_scoped_relationship( $rel ) if $rel->nonlocal;
153                 return $rel;
154         }
155 }
156
157 =head2 add_scoped_relationship( $rel )
158
159 Keep track of relationships defined between specific readings that are scoped
160 non-locally.  Key on whichever reading occurs first alphabetically.
161
162 =cut
163
164 sub add_scoped_relationship {
165         my( $self, $rel ) = @_;
166         my $rdga = $rel->type eq 'orthographic' ? $rel->reading_a : lc( $rel->reading_a );
167         my $rdgb = $rel->type eq 'orthographic' ? $rel->reading_b : lc( $rel->reading_b );      
168         my $r = $self->scoped_relationship( $rdga, $rdgb );
169         if( $r ) {
170                 warn sprintf( "Scoped relationship of type %s already exists between %s and %s",
171                         $r->type, $rdga, $rdgb );
172                 return;
173         }
174         my( $first, $second ) = sort ( $rdga, $rdgb );
175         $self->scopedrels->{$first}->{$second} = $rel;
176 }
177
178 =head2 scoped_relationship( $reading_a, $reading_b )
179
180 Returns the general (document-level or global) relationship that has been defined 
181 between the two reading strings. Returns undef if there is no general relationship.
182
183 =cut
184
185 sub scoped_relationship {
186         my( $self, $rdga, $rdgb ) = @_;
187         my( $first, $second ) = sort( $rdga, $rdgb );
188         if( exists $self->scopedrels->{$first}->{$second} ) {
189                 return $self->scopedrels->{$first}->{$second};
190         } else {
191                 return undef;
192         }
193 }
194
195 =head2 add_relationship( $self, $source, $sourcetext, $target, $targettext, $opts )
196
197 Adds the relationship specified in $opts (see Text::Tradition::Collation::Relationship 
198 for the possible options) between the readings given in $source and $target.  Sets
199 up a scoped relationship between $sourcetext and $targettext if the relationship is
200 scoped non-locally.
201
202 Returns a status boolean and a list of all reading pairs connected by the call to
203 add_relationship.
204
205 =cut
206
207 sub add_relationship {
208         my( $self, $source, $source_rdg, $target, $target_rdg, $options ) = @_;
209
210         my $relationship;
211         my $thispaironly;
212         if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) {
213                 $relationship = $options;
214                 $thispaironly = 1;  # If existing rel, set only where asked.
215         } else {
216                 # Check the options
217                 $options->{'scope'} = 'local' unless $options->{'scope'};
218                 $options->{'scope'} = 'local' if $options->{'type'} eq 'collated';
219                 $options->{'scope'} = 'local' if $options->{'type'} eq 'transposition';
220                 
221                 my( $is_valid, $reason ) = 
222                         $self->relationship_valid( $source, $target, $options->{'type'} );
223                 unless( $is_valid ) {
224                         throw( "Invalid relationship: $reason" );
225                 }
226                 
227                 # Try to create the relationship object.
228                 $options->{'reading_a'} = $source_rdg->text;
229                 $options->{'reading_b'} = $target_rdg->text;
230                 $options->{'orig_a'} = $source;
231                 $options->{'orig_b'} = $target;
232         if( $options->{'scope'} ne 'local' ) {
233                         # Is there a relationship with this a & b already?
234                         # Case-insensitive for non-orthographics.
235                         my $rdga = $options->{'type'} eq 'orthographic' 
236                                 ? $options->{'reading_a'} : lc( $options->{'reading_a'} );
237                         my $rdgb = $options->{'type'} eq 'orthographic' 
238                                 ? $options->{'reading_b'} : lc( $options->{'reading_b'} );
239                         my $otherrel = $self->scoped_relationship( $rdga, $rdgb );
240                         if( $otherrel && $otherrel->type eq $options->{type}
241                                 && $otherrel->scope eq $options->{scope} ) {
242                                 warn "Applying existing scoped relationship";
243                                 $relationship = $otherrel;
244                         }
245         }
246                 $relationship = $self->create( $options ) unless $relationship;  # Will throw on error
247     }
248
249
250         # Find all the pairs for which we need to set the relationship.
251         my @vectors = [ $source, $target ];
252     if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
253         push( @vectors, $self->_find_applicable( $relationship ) );
254     }
255         
256     # Now set the relationship(s).
257     my @pairs_set;
258     foreach my $v ( @vectors ) {
259                 my $rel = $self->get_relationship( @$v );
260         if( $rel && $rel ne $relationship ) {
261                 if( $rel->nonlocal ) {
262                         throw( "Found conflicting relationship at @$v" );
263                 } elsif( $rel->type ne 'collated' ) {
264                         # Replace a collation relationship; leave any other sort in place.
265                         warn "Not overriding local relationship set at @$v";
266                                 next;
267                 }
268         }
269         map { $self->_drop_collations( $_ ) } @$v;
270         $self->_set_relationship( $relationship, @$v );
271         push( @pairs_set, $v );
272     }
273     
274     return @pairs_set;
275 }
276
277 =head2 del_scoped_relationship( $reading_a, $reading_b )
278
279 Returns the general (document-level or global) relationship that has been defined 
280 between the two reading strings. Returns undef if there is no general relationship.
281
282 =cut
283
284 sub del_scoped_relationship {
285         my( $self, $rdga, $rdgb ) = @_;
286         my( $first, $second ) = sort( $rdga, $rdgb );
287         return delete $self->scopedrels->{$first}->{$second};
288 }
289
290 sub _find_applicable {
291         my( $self, $rel ) = @_;
292         my $c = $self->collation;
293         # TODO Someday we might use a case sensitive language.
294         my $lang = $c->tradition->language;
295         my @vectors;
296         my @identical_readings;
297         if( $rel->type eq 'orthographic' ) {
298                 @identical_readings = grep { $_->text eq $rel->reading_a } 
299                         $c->readings;
300         } else {
301                 @identical_readings = grep { lc( $_->text ) eq lc( $rel->reading_a ) }
302                         $c->readings;
303         }
304         foreach my $ir ( @identical_readings ) {
305                 my @itarget;
306                 if( $rel->type eq 'orthographic' ) {
307                         @itarget = grep { $_->rank == $ir->rank 
308                                                           && $_->text eq $rel->reading_b } $c->readings;
309                 } else {
310                         @itarget = grep { $_->rank == $ir->rank 
311                                                           && lc( $_->text ) eq lc( $rel->reading_b ) } $c->readings;
312                 }
313                 if( @itarget ) {
314                         # Warn if there is more than one hit with no orth link between them.
315                         my $itmain = shift @itarget;
316                         if( @itarget ) {
317                                 my %all_targets;
318                                 map { $all_targets{$_} = 1 } @itarget;
319                                 map { delete $all_targets{$_} } 
320                                         $self->related_readings( $itmain, 
321                                                 sub { $_[0]->type eq 'orthographic' } );
322                         warn "More than one unrelated reading with text " . $itmain->text
323                                 . " at rank " . $ir->rank . "!" if keys %all_targets;
324                         }
325                         push( @vectors, [ $ir->id, $itmain->id ] );
326                 }
327         }
328         return @vectors;
329 }
330
331 =head2 del_relationship( $source, $target )
332
333 Removes the relationship between the given readings. If the relationship is
334 non-local, removes the relationship everywhere in the graph.
335
336 =cut
337
338 sub del_relationship {
339         my( $self, $source, $target ) = @_;
340         my $rel = $self->get_relationship( $source, $target );
341         return () unless $rel; # Nothing to delete; return an empty set.
342         my @vectors = ( [ $source, $target ] );
343         $self->_remove_relationship( $source, $target );
344         if( $rel->nonlocal ) {
345                 # Remove the relationship wherever it occurs.
346                 # Remove the relationship wherever it occurs.
347                 my @rel_edges = grep { $self->get_relationship( @$_ ) == $rel }
348                         $self->relationships;
349                 foreach my $re ( @rel_edges ) {
350                         $self->_remove_relationship( @$re );
351                         push( @vectors, $re );
352                 }
353                 $self->del_scoped_relationship( $rel->reading_a, $rel->reading_b );
354         }
355         return @vectors;
356 }
357
358 sub _remove_relationship {
359         my( $self, @vector ) = @_;
360         $self->graph->delete_edge( @vector );
361 }
362         
363 =head2 relationship_valid( $source, $target, $type )
364
365 Checks whether a relationship of type $type may exist between the readings given
366 in $source and $target.  Returns a tuple of ( status, message ) where status is
367 a yes/no boolean and, if the answer is no, message gives the reason why.
368
369 =cut
370
371 sub relationship_valid {
372     my( $self, $source, $target, $rel ) = @_;
373     my $c = $self->collation;
374     if ( $rel eq 'transposition' || $rel eq 'repetition' ) {
375                 # Check that the two readings do (for a repetition) or do not (for
376                 # a transposition) appear in the same witness.
377                 # If we haven't made reading paths yet, take it on faith.
378                 return( 1, "no paths yet" ) unless $c->sequence->successors( $c->start );
379                 
380                 # We have some paths, so carry on.
381                 my %seen_wits;
382                 map { $seen_wits{$_} = 1 } $c->reading_witnesses( $source );
383                 foreach my $w ( $c->reading_witnesses( $target ) ) {
384                         if( $seen_wits{$w} ) {
385                                 return ( 0, "Readings both occur in witness $w" ) 
386                                         if $rel eq 'transposition';
387                                 return ( 1, "ok" ) if $rel eq 'repetition';
388                         }
389                 }
390                 # For transpositions, there should also be a path from one reading
391                 # to the other.
392                 if( $rel eq 'transposition' ) {
393                         my( %sourceseq, %targetseq );
394                         map { $sourceseq{$_} = 1 } $c->sequence->all_successors( $source );
395                         map { $targetseq{$_} = 1 } $c->sequence->all_successors( $target );
396                         return( 0, "Readings are parallel" )
397                                 unless $sourceseq{$target} || $targetseq{$source};
398                 }
399                 return $rel eq 'transposition' ? ( 1, "ok" )
400                         : ( 0, "Readings occur only in distinct witnesses" );
401         } 
402         if( $rel ne 'repetition' ) {
403                 # Check that linking the source and target in a relationship won't lead
404                 # to a path loop for any witness.  If they have the same rank then
405                 # they are parallel by definition.
406                 # For transpositions, we want the opposite result: it is only valid if
407                 # the readings cannot be parallel.
408                 my $sourcerank = $c->reading( $source )->has_rank
409                         ? $c->reading( $source )->rank : undef;
410                 my $targetrank = $c->reading( $target )->has_rank
411                         ? $c->reading( $target )->rank : undef;
412                 if( $sourcerank && $targetrank && $sourcerank == $targetrank ) {
413                         return( 0, "Cannot transpose readings of same rank" )
414                                 if $rel eq 'transposition';
415                         return( 1, "ok" );
416                 }
417                 
418                 # Otherwise, first make a lookup table of all the
419                 # readings related to either the source or the target.
420                 my @proposed_related = ( $source, $target );
421                 # Drop the collation links of source and target, unless we want to
422                 # add a collation relationship.
423                 my @dropped;
424                 foreach my $r ( ( $source, $target ) ) {
425                         push( @dropped, $self->_drop_collations( $r ) )
426                                 unless $rel eq 'collated';
427                         push( @proposed_related, $self->related_readings( $r, 'colocated' ) );
428                 }
429                 # Also drop any collation links at intermediate ranks.
430                 foreach my $rank ( $sourcerank+1 .. $targetrank-1 ) {
431                         map { push( @dropped, $self->_drop_collations( $_ ) ) }
432                                 $c->readings_at_rank( $rank );
433                 }
434                 my %pr_ids;
435                 map { $pr_ids{ $_ } = 1 } @proposed_related;
436         
437                 # The cumulative predecessors and successors of the proposed-related readings
438                 # should not overlap.
439                 my %all_pred;
440                 my %all_succ;
441                 foreach my $pr ( keys %pr_ids ) {
442                         map { $all_pred{$_} = 1 } $c->sequence->all_predecessors( $pr );
443                         map { $all_succ{$_} = 1 } $c->sequence->all_successors( $pr );
444                 }
445                 foreach my $k ( keys %all_pred ) {
446                         if( exists $all_succ{$k} ) {
447                                 $self->_restore_collations( @dropped );
448                                 return( 1, "ok" ) if $rel eq 'transposition';
449                                 return( 0, "Relationship would create witness loop" );
450                         }
451                 }
452                 foreach my $k ( keys %pr_ids ) {
453                         if( exists $all_pred{$k} || exists $all_succ{$k} ) {
454                                 $self->_restore_collations( @dropped );
455                                 return( 1, "ok" ) if $rel eq 'transposition';
456                                 return( 0, "Relationship would create witness loop" );
457                         }
458                 }
459                 if( $rel eq 'transposition' ) {
460                         $self->_restore_collations( @dropped );
461                         return ( 0, "Cannot transpose parallel readings" );
462                 }
463                 return ( 1, "ok" );
464         }
465 }
466
467 sub _drop_collations {
468         my( $self, $reading ) = @_;
469         my @deleted;
470         foreach my $n ( $self->graph->neighbors( $reading ) ) {
471                 if( $self->get_relationship( $reading, $n )->type eq 'collated' ) {
472                         $self->del_relationship( $reading, $n );
473                         push( @deleted, [ $reading, $n ] );
474                 }
475         }
476         return @deleted;
477 }
478
479 sub _restore_collations {
480         my( $self, @vectors ) = @_;
481         foreach my $v ( @vectors ) {
482                 try {
483                         $self->add_relationship( @$v, { 'type' => 'collated' } );
484                 } catch ( Text::Tradition::Error $e ) {
485                         warn "Could not restore collation " . join( ' -> ', @$v );
486                 }
487         }
488 }
489
490 =head2 related_readings( $reading, $filter )
491
492 Returns a list of readings that are connected via relationship links to $reading.
493 If $filter is set to a subroutine ref, returns only those related readings where
494 $filter( $relationship ) returns a true value.
495
496 =cut
497
498 sub related_readings {
499         my( $self, $reading, $filter ) = @_;
500         my $return_object;
501         if( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) {
502                 $reading = $reading->id;
503                 $return_object = 1;
504         }
505         my @answer;
506         if( $filter ) {
507                 # Backwards compat
508                 if( $filter eq 'colocated' ) {
509                         $filter = sub { $_[0]->colocated };
510                 }
511                 my %found = ( $reading => 1 );
512                 my $check = [ $reading ];
513                 my $iter = 0;
514                 while( @$check ) {
515                         my $more = [];
516                         foreach my $r ( @$check ) {
517                                 foreach my $nr ( $self->graph->neighbors( $r ) ) {
518                                         if( &$filter( $self->get_relationship( $r, $nr ) ) ) {
519                                                 push( @$more, $nr ) unless exists $found{$nr};
520                                                 $found{$nr} = 1;
521                                         }
522                                 }
523                         }
524                         $check = $more;
525                 }
526                 delete $found{$reading};
527                 @answer = keys %found;
528         } else {
529                 @answer = $self->graph->all_reachable( $reading );
530         }
531         if( $return_object ) {
532                 my $c = $self->collation;
533                 return map { $c->reading( $_ ) } @answer;
534         } else {
535                 return @answer;
536         }
537 }
538
539 =head2 merge_readings( $kept, $deleted );
540
541 Makes a best-effort merge of the relationship links between the given readings, and
542 stops tracking the to-be-deleted reading.
543
544 =cut
545
546 sub merge_readings {
547         my( $self, $kept, $deleted, $combined ) = @_;
548         # Delete any relationship between kept and deleted
549         $self->del_relationship( $kept, $deleted );
550         foreach my $edge ( $self->graph->edges_at( $deleted ) ) {
551                 # Get the pair of kept / rel
552                 my @vector = ( $kept );
553                 push( @vector, $edge->[0] eq $deleted ? $edge->[1] : $edge->[0] );
554                 next if $vector[0] eq $vector[1]; # Don't add a self loop
555                 
556                 # If kept changes its text, drop the relationship.
557                 next if $combined;
558                         
559                 # If kept / rel already has a relationship, just keep the old
560                 my $rel = $self->get_relationship( @vector );
561                 next if $rel;
562                 
563                 # Otherwise, adopt the relationship that would be deleted.
564                 $rel = $self->get_relationship( @$edge );
565                 $self->_set_relationship( $rel, @vector );
566         }
567         $self->delete_reading( $deleted );
568 }
569
570 sub _as_graphml { 
571         my( $self, $graphml_ns, $xmlroot, $node_hash, $nodeid_key, $edge_keys ) = @_;
572         
573     my $rgraph = $xmlroot->addNewChild( $graphml_ns, 'graph' );
574         $rgraph->setAttribute( 'edgedefault', 'directed' );
575     $rgraph->setAttribute( 'id', 'relationships', );
576     $rgraph->setAttribute( 'parse.edgeids', 'canonical' );
577     $rgraph->setAttribute( 'parse.edges', scalar($self->graph->edges) );
578     $rgraph->setAttribute( 'parse.nodeids', 'canonical' );
579     $rgraph->setAttribute( 'parse.nodes', scalar($self->graph->vertices) );
580     $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
581     
582     # Add the vertices according to their XML IDs
583     my %rdg_lookup = ( reverse %$node_hash );
584     foreach my $n ( sort _by_xmlid keys( %rdg_lookup ) ) {
585         my $n_el = $rgraph->addNewChild( $graphml_ns, 'node' );
586         $n_el->setAttribute( 'id', $n );
587         _add_graphml_data( $n_el, $nodeid_key, $rdg_lookup{$n} );
588     }
589     
590     # Add the relationship edges, with their object information
591     my $edge_ctr = 0;
592     foreach my $e ( sort { $a->[0] cmp $b->[0] } $self->graph->edges ) {
593         # Add an edge and fill in its relationship info.
594                 my $edge_el = $rgraph->addNewChild( $graphml_ns, 'edge' );
595                 $edge_el->setAttribute( 'source', $node_hash->{$e->[0]} );
596                 $edge_el->setAttribute( 'target', $node_hash->{$e->[1]} );
597                 $edge_el->setAttribute( 'id', 'e'.$edge_ctr++ );
598
599                 my $rel_obj = $self->get_relationship( @$e );
600                 foreach my $key ( keys %$edge_keys ) {
601                         my $value = $rel_obj->$key;
602                         _add_graphml_data( $edge_el, $edge_keys->{$key}, $value ) 
603                                 if defined $value;
604                 }
605         }
606 }
607
608 sub _by_xmlid {
609         my $tmp_a = $a;
610         my $tmp_b = $b;
611         $tmp_a =~ s/\D//g;
612         $tmp_b =~ s/\D//g;
613         return $tmp_a <=> $tmp_b;
614 }
615
616 sub _add_graphml_data {
617     my( $el, $key, $value ) = @_;
618     return unless defined $value;
619     my $data_el = $el->addNewChild( $el->namespaceURI, 'data' );
620     $data_el->setAttribute( 'key', $key );
621     $data_el->appendText( $value );
622 }
623
624 sub throw {
625         Text::Tradition::Error->throw( 
626                 'ident' => 'Relationship error',
627                 'message' => $_[0],
628                 );
629 }
630
631 no Moose;
632 __PACKAGE__->meta->make_immutable;
633
634 1;