simplify Directory and add exceptions;

[scpubgit/stemmatology.git] / lib / Text / Tradition / Collation.pm
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm

index 0bbcdf3..77a367e 100644 (file)
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -66,12 +66,6 @@ has 'linear' => (
     default => 1,
     );
     
-has 'collapse_punctuation' => (
-       is => 'rw',
-       isa => 'Bool',
-       default => 1,
-       );
-
 has 'ac_label' => (
     is => 'rw',
     isa => 'Str',
@@ -133,8 +127,6 @@ belongs. Required.
 transposed readings should be treated as two linked readings rather than one, 
 and therefore whether the collation graph is acyclic.  Defaults to true.
 
-=item * collapse_punctuation - TODO
-
 =item * baselabel - The default label for the path taken by a base text 
 (if any). Defaults to 'base text'.
 
@@ -154,8 +146,6 @@ the like.  Defaults to ' (a.c.)'.
 
 =head2 linear
 
-=head2 collapse_punctuation
-
 =head2 wit_list_separator
 
 =head2 baselabel
@@ -518,11 +508,26 @@ sub as_dot {
     my $graph_name = $self->tradition->name;
     $graph_name =~ s/[^\w\s]//g;
     $graph_name = join( '_', split( /\s+/, $graph_name ) );
+
+    my %graph_attrs = (
+       'rankdir' => 'LR',
+       'bgcolor' => 'none',
+       );
+    my %node_attrs = (
+       'fontsize' => 11,
+       'fillcolor' => 'white',
+       'style' => 'filled',
+       'shape' => 'ellipse'
+       );
+    my %edge_attrs = ( 
+       'arrowhead' => 'open',
+       'color' => '#000000',
+       'fontcolor' => '#000000',
+       );
+
     my $dot = sprintf( "digraph %s {\n", $graph_name );
-    $dot .= "\tedge [ arrowhead=open ];\n";
-    $dot .= "\tgraph [ rankdir=LR,bgcolor=none ];\n";
-    $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
-                     11, "white", "filled", "ellipse" );
+    $dot .= "\tgraph " . _dot_attr_string( \%graph_attrs ) . ";\n";
+    $dot .= "\tnode " . _dot_attr_string( \%node_attrs ) . ";\n";
 
        # Output substitute start/end readings if necessary
        if( $startrank ) {
@@ -532,60 +537,71 @@ sub as_dot {
                $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n";  
        }
        my %used;  # Keep track of the readings that actually appear in the graph
-       my %subedges;
-       my %subend;
     foreach my $reading ( $self->readings ) {
        # Only output readings within our rank range.
        next if $startrank && $reading->rank < $startrank;
        next if $endrank && $reading->rank > $endrank;
         $used{$reading->id} = 1;
-        $subedges{$reading->id} = '#SUBSTART#' 
-               if $startrank && $startrank == $reading->rank;
-        $subedges{$reading->id} = '#SUBEND#' 
-               if $endrank && $endrank == $reading->rank;
         # Need not output nodes without separate labels
         next if $reading->id eq $reading->text;
-        my $label = $reading->punctuated_form;
+        my $label = $reading->text;
         $label =~ s/\"/\\\"/g;
         $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->id, $label );
     }
     
-    # Add substitute start and end edges if necessary
-    foreach my $node ( keys %subedges ) {
-               my @vector = ( $subedges{$node}, $node );
-               @vector = reverse( @vector ) if $vector[0] =~ /END/;
-       my $witstr = join( ', ', sort $self->reading_witnesses( $self->reading( $node ) ) );
-       my %variables = ( 'color' => '#000000',
-                          'fontcolor' => '#000000',
-                          'label' => $witstr,
-            );
-        my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
-               $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n", @vector, $varopts );
-       }
-       
        # Add the real edges
     my @edges = $self->paths;
+       my( %substart, %subend );
     foreach my $edge ( @edges ) {
        # Do we need to output this edge?
        if( $used{$edge->[0]} && $used{$edge->[1]} ) {;
-                       my %variables = ( 'color' => '#000000',
-                                                         'fontcolor' => '#000000',
-                                                         'label' => join( ', ', $self->path_display_label( $edge ) ),
-                               );
-                       my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
+               my $label = $self->path_display_label( $self->path_witnesses( $edge ) );
+                       my $variables = { %edge_attrs, 'label' => $label };
                        # Account for the rank gap if necessary
-                       my $rankgap = $self->reading( $edge->[1] )->rank 
+                       if( $self->reading( $edge->[1] )->has_rank 
+                               && $self->reading( $edge->[0] )->has_rank
+                               && $self->reading( $edge->[1] )->rank 
+                               - $self->reading( $edge->[0] )->rank > 1 ) {
+                               $variables->{'minlen'} = $self->reading( $edge->[1] )->rank 
                                - $self->reading( $edge->[0] )->rank;
-                       $varopts .= ", minlen=$rankgap" if $rankgap > 1;
-                       $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
-                                                        $edge->[0], $edge->[1], $varopts );
+                       }
+                       my $varopts = _dot_attr_string( $variables );
+                       $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n", 
+                               $edge->[0], $edge->[1], $varopts );
+        } elsif( $used{$edge->[0]} ) {
+               $subend{$edge->[0]} = 1;
+        } elsif( $used{$edge->[1]} ) {
+               $substart{$edge->[1]} = 1;
         }
     }
-    
+    # Add substitute start and end edges if necessary
+    foreach my $node ( keys %substart ) {
+       my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $variables = { %edge_attrs, 'label' => $witstr };
+        my $varopts = _dot_attr_string( $variables );
+        $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;";
+       }
+    foreach my $node ( keys %subend ) {
+       my $witstr = $self->path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) );
+       my $variables = { %edge_attrs, 'label' => $witstr };
+        my $varopts = _dot_attr_string( $variables );
+        $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;";
+       }
+       
     $dot .= "}\n";
     return $dot;
 }
 
+sub _dot_attr_string {
+       my( $hash ) = @_;
+       my @attrs;
+       foreach my $k ( sort keys %$hash ) {
+               my $v = $hash->{$k};
+               push( @attrs, $k.'="'.$v.'"' );
+       }
+       return( '[ ' . join( ', ', @attrs ) . ' ]' );
+}
+
 sub path_witnesses {
        my( $self, @edge ) = @_;
        # If edge is an arrayref, cope.
@@ -598,10 +614,10 @@ sub path_witnesses {
 }
 
 sub path_display_label {
-       my( $self, $edge ) = @_;
-       my @wits = $self->path_witnesses( $edge );
+       my( $self, @wits ) = @_;
        my $maj = scalar( $self->tradition->witnesses ) * 0.6;
        if( scalar @wits > $maj ) {
+               # TODO break out a.c. wits
                return 'majority';
        } else {
                return join( ', ', @wits );
@@ -752,7 +768,6 @@ sub as_graphml {
         $node_el->setAttribute( 'id', $node_xmlid );
         foreach my $d ( keys %node_data ) {
                my $nval = $n->$d;
-               $nval = $n->punctuated_form if $d eq 'text';
                _add_graphml_data( $node_el, $node_data_keys{$d}, $nval )
                        if defined $nval;
         }
@@ -868,7 +883,7 @@ sub make_alignment_table {
                { 'witness' => $wit->sigil, 'tokens' => \@row } );
         if( $wit->is_layered ) {
                my @wit_ac_path = $self->reading_sequence( $self->start, $self->end, 
-                       $wit->sigil.$self->ac_label, $wit->sigil );
+                       $wit->sigil.$self->ac_label );
             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos, $noderefs );
                        push( @{$table->{'alignment'}},
                                { 'witness' => $wit->sigil.$self->ac_label, 'tokens' => \@ac_row } );
@@ -943,9 +958,10 @@ used wherever no path exists for $sigil or $backup.
 =cut
 
 # TODO Think about returning some lazy-eval iterator.
+# TODO Get rid of backup; we should know from what witness is whether we need it.
 
 sub reading_sequence {
-    my( $self, $start, $end, $witness, $backup ) = @_;
+    my( $self, $start, $end, $witness ) = @_;
 
     $witness = $self->baselabel unless $witness;
     my @readings = ( $start );
@@ -958,7 +974,7 @@ sub reading_sequence {
         }
         $seen{$n->id} = 1;
         
-        my $next = $self->next_reading( $n, $witness, $backup );
+        my $next = $self->next_reading( $n, $witness );
         unless( $next ) {
             warn "Did not find any path for $witness from reading " . $n->id;
             last;
@@ -1005,7 +1021,15 @@ sub prior_reading {
 }
 
 sub _find_linked_reading {
-    my( $self, $direction, $node, $path, $alt_path ) = @_;
+    my( $self, $direction, $node, $path ) = @_;
+    
+    # Get a backup if we are dealing with a layered witness
+    my $alt_path;
+    my $aclabel = $self->ac_label;
+    if( $path && $path =~ /^(.*)\Q$aclabel\E$/ ) {
+       $alt_path = $1;
+    }
+    
     my @linked_paths = $direction eq 'next' 
         ? $self->sequence->edges_from( $node ) 
         : $self->sequence->edges_to( $node );
@@ -1061,8 +1085,25 @@ sub _witnesses_of_label {
     my $regex = $self->wit_list_separator;
     my @answer = split( /\Q$regex\E/, $label );
     return @answer;
-}    
+}
+
+=head2 path_text( $sigil, $mainsigil [, $start, $end ] )
 
+Returns the text of a witness (plus its backup, if we are using a layer)
+as stored in the collation.  The text is returned as a string, where the
+individual readings are joined with spaces and the meta-readings (e.g.
+lacunae) are omitted.  Optional specification of $start and $end allows
+the generation of a subset of the witness text.
+
+=cut
+
+sub path_text {
+       my( $self, $wit, $start, $end ) = @_;
+       $start = $self->start unless $start;
+       $end = $self->end unless $end;
+       my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit );
+       return join( ' ', map { $_->text } @path );
+}
 
 =head1 INITIALIZATION METHODS
 
@@ -1235,6 +1276,28 @@ sub flatten_ranks {
     }
 }
 
+=head2 text_from_paths
+
+Calculate the text array for all witnesses from the path, for later consistency
+checking.  Only to be used if there is no non-graph-based way to know the
+original texts.
+
+=cut
+
+sub text_from_paths {
+       my $self = shift;
+    foreach my $wit ( $self->tradition->witnesses ) {
+       my @text = split( /\s+/, 
+               $self->reading_sequence( $self->start, $self->end, $wit->sigil ) );
+       $wit->text( \@text );
+       if( $wit->is_layered ) {
+                       my @uctext = split( /\s+/, 
+                               $self->reading_sequence( $self->start, $self->end, 
+                                       $wit->sigil.$self->ac_label ) );
+                       $wit->text( \@uctext );
+       }
+    }    
+}
 
 =head1 UTILITY FUNCTIONS