From: tla <tla@mit.edu>
Date: Tue, 17 May 2011 14:12:16 +0000 (+0200)
Subject: some more rehoming of functionality
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4a8828f0ac801bd0a198cdaf74a7b6769db0ec6b;p=scpubgit%2Fstemmatology.git

some more rehoming of functionality
---

diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm
index 695f4e2..92673be 100644
--- a/lib/Text/Tradition.pm
+++ b/lib/Text/Tradition.pm
@@ -1,5 +1,6 @@
 package Text::Tradition;
 
+use Module::Load;
 use Moose;
 use Text::Tradition::Collation;
 use Text::Tradition::Witness;
@@ -15,12 +16,10 @@ has 'witnesses' => (
     is => 'rw',
     isa => 'ArrayRef[Text::Tradition::Witness]',
     handles => {
-	all_options    => 'elements',
-	add_option     => 'push',
-	map_options    => 'map',
-	option_count   => 'count',
-	sorted_options => 'sort',
+	all    => 'elements',
+	add    => 'push',
     },
+    default => sub { [] },
     );
 
 sub BUILD {
@@ -50,13 +49,51 @@ sub BUILD {
 	    # TODO Now how to collate these?
 	}
     } else {
-	# Else we got passed args intended for the collator.
-	$init_args->{'tradition'} = $self;
-	$self->_save_collation( Text::Tradition::Collation->new( %$init_args ) );
-	$self->witnesses( $self->collation->create_witnesses() );
+	# Else we need to parse some collation data.  Make a Collation object
+	my $collation = Text::Tradition::Collation->new( %$init_args,
+							'tradition' => $self );
+	$self->_save_collation( $collation );
+
+	# Call the appropriate parser on the given data
+	my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %$init_args );
+	my $format = shift( @formats );
+	unless( $format ) {
+	    warn "No data given to create a collation; will initialize an empty one";
+	}
+	if( $format && $format =~ /^(CSV|CTE)$/ && 
+	    !exists $init_args->{'base'} ) {
+	    warn "Cannot make a collation from $format without a base text";
+	    return;
+	}
+
+	# Starting point for all texts
+	my $last_node = $collation->add_reading( '#START#' );
+
+	# Now do the parsing. 
+	my @sigla;
+	if( $format ) {
+	    my @parseargs;
+	    if( $format =~ /^(CSV|CTE)$/ ) {
+		@parseargs = ( 'base' => $init_args->{'base'},
+			       'data' => $init_args->{$format},
+			       'format' => $format );
+		$format = 'BaseText';
+	    } else {
+		@parseargs = ( $init_args->{ $format } ); 
+	    }
+	    my $mod = "Text::Tradition::Parser::$format";
+	    load( $mod );
+	    $mod->can('parse')->( $self, @parseargs );
+	}
     }
 }
 
+sub add_witness {
+    my $self = shift;
+    my $new_wit = Text::Tradition::Witness->new( @_ );
+    push( @{$self->witnesses}, $new_wit );
+}
+
 # The user will usually be instantiating a Tradition object, and
 # examining its collation.  The information about the tradition can
 # come via several routes:
diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 106ab55..fd5bda5 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -2,7 +2,6 @@ package Text::Tradition::Collation;
 
 use Graph::Easy;
 use IPC::Run qw( run binary );
-use Module::Load;
 use Text::Tradition::Collation::Reading;
 use Moose;
 
@@ -49,6 +48,12 @@ has 'graphml' => (
     predicate => 'has_graphml',
     );
 
+has 'wit_list_separator' => (
+			     is => 'rw',
+			     isa => 'Str',
+			     default => ', ',
+			     );
+
 # The collation can be created two ways:
 # 1. Collate a set of witnesses (with CollateX I guess) and process
 #    the results as in 2.
@@ -66,51 +71,11 @@ has 'graphml' => (
 
 sub BUILD {
     my( $self, $args ) = @_;
-
-    # Call the appropriate parser on the given data
-    my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %$args );
-    my $format = shift( @formats );
-    unless( $format ) {
-	warn "No data given to create a graph; will initialize an empty one";
-    }
-    if( $format && $format =~ /^(CSV|CTE)$/ && !exists $args->{'base'} ) {
-	warn "Cannot make a graph from $format without a base text";
-	return;
-    }
-
-    # Initialize our graph object.
     $self->graph->use_class('node', 'Text::Tradition::Collation::Reading');
-    $self->graph->set_attribute( 'node', 'shape', 'ellipse' );
-    # Starting point for all texts
-    my $last_node = $self->add_reading( '#START#' );
-
-    # Now do the parsing.
-    my @sigla;
-    if( $format ) {
-	my @parseargs;
-	if( $format =~ /^(CSV|CTE)$/ ) {
-	    @parseargs = ( 'base' => $args->{'base'},
-		      'data' => $args->{$format},
-		      'format' => $format );
-	    $format = 'BaseText';
-	} else {
-	    @parseargs = ( $args->{ $format } ); 
-	}
-	my $mod = "Text::Tradition::Parser::$format";
-	load( $mod );
-	# TODO parse needs to return witness IDs
-	@sigla = $mod->can('parse')->( $self, @parseargs );
-    }
 
-    # Do we need to initialize the witnesses?
-    unless( $args->{'have_witnesses'} ) {
-	# initialize Witness objects for all our witnesses
-	my @witnesses;
-	foreach my $sigil ( @sigla ) {
-	    push( @witnesses, Text::Tradition::Witness->new( 'sigil' => $sigil ) );
-	}
-	$self->tradition->witnesses( \@witnesses );
-    }
+    # Pass through any graph-specific options.
+    my $shape = exists( $args->{'shape'} ) ? $args->{'shape'} : 'ellipse';
+    $self->graph->set_attribute( 'node', 'shape', $shape );
 }
 
 # Wrappers around some methods
@@ -133,7 +98,7 @@ sub merge_readings {
 print $graph->as_svg( $recalculate );
 
 Returns an SVG string that represents the graph.  Uses GraphViz to do
-this, because Graph::Easy doesn't cope well with long graphs. Unless
+this, because Graph::Easy doesn\'t cope well with long graphs. Unless
 $recalculate is passed (and is a true value), the method will return a
 cached copy of the SVG after the first call to the method.
 
@@ -268,7 +233,7 @@ Returns the beginning of the collation, a meta-reading with label '#START#'.
 =cut
 
 sub start {
-    # Return the beginning node of the graph.
+    # Return the beginning reading of the graph.
     my $self = shift;
     my( $new_start ) = @_;
     if( $new_start ) {
@@ -278,37 +243,37 @@ sub start {
     return $self->reading('#START#');
 }
 
-=item B<next_word>
+=item B<next_reading>
 
-my $next_node = $graph->next_word( $node, $path );
+my $next_reading = $graph->next_reading( $reading, $witpath );
 
-Returns the node that follows the given node along the given witness
+Returns the reading that follows the given reading along the given witness
 path.  TODO These are badly named.
 
 =cut
 
-sub next_word {
+sub next_reading {
     # Return the successor via the corresponding edge.
     my $self = shift;
-    return $self->_find_linked_word( 'next', @_ );
+    return $self->_find_linked_reading( 'next', @_ );
 }
 
-=item B<prior_word>
+=item B<prior_reading>
 
-my $prior_node = $graph->prior_word( $node, $path );
+my $prior_reading = $graph->prior_reading( $reading, $witpath );
 
-Returns the node that precedes the given node along the given witness
+Returns the reading that precedes the given reading along the given witness
 path.  TODO These are badly named.
 
 =cut
 
-sub prior_word {
+sub prior_reading {
     # Return the predecessor via the corresponding edge.
     my $self = shift;
-    return $self->_find_linked_word( 'prior', @_ );
+    return $self->_find_linked_reading( 'prior', @_ );
 }
 
-sub _find_linked_word {
+sub _find_linked_reading {
     my( $self, $direction, $node, $edge ) = @_;
     $edge = 'base text' unless $edge;
     my @linked_edges = $direction eq 'next' 
@@ -317,9 +282,9 @@ sub _find_linked_word {
     
     # We have to find the linked edge that contains all of the
     # witnesses supplied in $edge.
-    my @edge_wits = split( /, /, $edge );
+    my @edge_wits = $self->witnesses_of_label( $edge );
     foreach my $le ( @linked_edges ) {
-	my @le_wits = split( /, /, $le->name() );
+	my @le_wits = $self->witnesses_of_label( $le->name );
 	if( _is_within( \@edge_wits, \@le_wits ) ) {
 	    # This is the right edge.
 	    return $direction eq 'next' ? $le->to() : $le->from();
@@ -330,11 +295,152 @@ sub _find_linked_word {
     return undef;
 }
 
-sub create_witnesses {
-    # TODO Given a new collation, make a bunch of Witness objects.
+# Some set logic.
+sub _is_within {
+    my( $set1, $set2 ) = @_;
+    my $ret = 1;
+    foreach my $el ( @$set1 ) {
+	$ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
+    }
+    return $ret;
+}
+
+# Walk the paths for each witness in the graph, and return the nodes
+# that the graph has in common.
+
+sub walk_witness_paths {
+    my( $self, $end ) = @_;
+    # For each witness, walk the path through the graph.
+    # Then we need to find the common nodes.  
+    # TODO This method is going to fall down if we have a very gappy 
+    # text in the collation.
+    my $paths = {};
+    my @common_nodes;
+    foreach my $wit ( @{$self->tradition->witnesses} ) {
+	my $curr_reading = $self->start;
+	my @wit_path = ( $curr_reading );
+	my %seen_readings;
+	# TODO Detect loops at some point
+	while( $curr_reading->name ne $end->name ) {
+	    if( $seen_readings{$curr_reading->name} ) {
+		warn "Detected loop walking path for witness " . $wit->sigil
+		    . " at reading " . $curr_reading->name;
+		last;
+	    }
+	    my $next_reading = $self->next_reading( $curr_reading, 
+						    $wit->sigil );
+	    push( @wit_path, $next_reading );
+	    $seen_readings{$curr_reading->name} = 1;
+	    $curr_reading = $next_reading;
+	}
+	$wit->path( \@wit_path );
+	if( @common_nodes ) {
+	    my @cn;
+	    foreach my $n ( @wit_path ) {
+		push( @cn, $n ) if grep { $_ eq $n } @common_nodes;
+	    }
+	    @common_nodes = ();
+	    push( @common_nodes, @cn );
+	} else {
+	    push( @common_nodes, @wit_path );
+	}
+    }
+
+    # Mark all the nodes as either common or not.
+    foreach my $cn ( @common_nodes ) {
+	print STDERR "Setting " . $cn->name . " as common node\n";
+	$cn->make_common;
+    }
+    foreach my $n ( $self->readings() ) {
+	$n->make_variant unless $n->is_common;
+    }
+}
+
+sub common_readings {
+    my $self = shift;
+    my @common = grep { $_->is_common } $self->readings();
+    return @common;
+}
+
+# Calculate the relative positions of nodes in the graph, if they
+# were not given to us.
+sub calculate_positions {
+    my $self = shift;
+
+    # We have to calculate the position identifiers for each word,
+    # keyed on the common nodes.  This will be 'fun'.  The end result
+    # is a hash per witness, whose key is the word node and whose
+    # value is its position in the text.  Common nodes are always N,1
+    # so have identical positions in each text.
+    my @common = $self->common_readings();
+
+    my $node_pos = {};
+    foreach my $wit ( @{$self->tradition->witnesses} ) {
+	# First we walk each path, making a matrix for each witness that
+	# corresponds to its eventual position identifier.  Common nodes
+	# always start a new row, and are thus always in the first column.
+
+	my $wit_matrix = [];
+	my $cn = 0;  # We should hit the common readings in order.
+	my $row = [];
+	foreach my $wn ( @{$wit->path} ) {
+	    if( $wn eq $common[$cn] ) {
+		# Set up to look for the next common node, and
+		# start a new row of words.
+		$cn++;
+		push( @$wit_matrix, $row ) if scalar( @$row );
+		$row = [];
+	    }
+	    push( @$row, $wn );
+	}
+	push( @$wit_matrix, $row );  # Push the last row onto the matrix
+
+	# Now we have a matrix per witness, so that each row in the
+	# matrix begins with a common node, and continues with all the
+	# variant words that appear in the witness.  We turn this into
+	# real positions in row,cell format.  But we need some
+	# trickery in order to make sure that each node gets assigned
+	# to only one position.
+
+	foreach my $li ( 1..scalar(@$wit_matrix) ) {
+	    foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) {
+		my $reading = $wit_matrix->[$li-1]->[$di-1];
+		my $position = "$li,$di";
+		# If we have seen this node before, we need to compare
+		# its position with what went before.
+		unless( $reading->has_position &&
+			_cmp_position( $position, $reading->position ) < 1 ) {
+		    # The new position ID replaces the old one.
+		    $reading->position( $position );
+		} # otherwise, the old position needs to stay.
+	    }
+	}
+    }
+}
+
+sub _cmp_position {
+    my( $a, $b ) = @_;
+    my @pos_a = split(/,/, $a );
+    my @pos_b = split(/,/, $b );
 
-    return [];
+    my $big_cmp = $pos_a[0] <=> $pos_b[0];
+    return $big_cmp if $big_cmp;
+    # else 
+    return $pos_a[1] <=> $pos_b[1];
 }
+ 
+# Return the string that joins together a list of witnesses for
+# display on a single path.
+sub path_label {
+    my $self = shift;
+    return join( $self->wit_list_separator, @_ );
+}
+
+sub witnesses_of_label {
+    my $self = shift;
+    my $regex = $self->wit_list_separator;
+    return split( /^\Q$regex\E$/, @_ );
+}    
 
 no Moose;
 __PACKAGE__->meta->make_immutable;
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index 146eadf..e3d6d6d 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -14,6 +14,7 @@ subtype 'Position'
 has 'position' => (
     is => 'rw',
     isa => 'Position',
+    predicate => 'has_position',
     );
 
 # This contains an array of reading objects; the array is a pool,
@@ -132,6 +133,21 @@ sub set_relationship {
     $self->relationships->{ $rel } = $value;
 }
 
+sub is_common {
+    my( $self ) = shift;
+    return $self->get_attribute( 'class' ) eq 'common';
+}
+
+sub make_common {
+    my( $self ) = shift;
+    $self->set_attribute( 'class', 'common' );
+}
+
+sub make_variant {
+    my( $self ) = shift;
+    $self->set_attribute( 'class', 'variant' );
+}
+
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm
index d731f1d..63e8deb 100644
--- a/lib/Text/Tradition/Parser/GraphML.pm
+++ b/lib/Text/Tradition/Parser/GraphML.pm
@@ -29,29 +29,44 @@ graph.
 
 =cut
 
+use vars qw/ $xpc %nodedata /;
+
 sub parse {
-    my( $collation, $graphml_str ) = @_;
+    my( $tradition, $graphml_str ) = @_;
 
+    my $collation = $tradition->collation;
     my $parser = XML::LibXML->new();
     my $doc = $parser->parse_string( $graphml_str );
     my $graphml = $doc->documentElement();
-    my $xpc = XML::LibXML::XPathContext->new( $graphml );
+    $xpc = XML::LibXML::XPathContext->new( $graphml );
     $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' );
     
     # First get the ID keys, for witnesses and for collation data
-    my %nodedata;
     my %witnesses;
     foreach my $k ( $xpc->findnodes( '//g:key' ) ) {
 	# Each key has a 'for' attribute; the edge keys are witnesses, and
 	# the node keys contain an ID and string for each node.
 
 	if( $k->getAttribute( 'for' ) eq 'node' ) {
+	    # The node data keys we expect are:
+	    # 'number' -> unique node identifier
+	    # 'token' -> reading for the node
+	    # 'identical' -> the node of which this node is 
+	    #                a transposed version
+	    # 'position' -> a calculated position for the node
 	    $nodedata{ $k->getAttribute( 'attr.name' ) } = $k->getAttribute( 'id' );
 	} else {
 	    $witnesses{ $k->getAttribute( 'id' ) } = $k->getAttribute( 'attr.name' );
 	}
     }
 
+    my $has_explicit_positions = defined $nodedata{'position'};
+
+    # Add the witnesses that we have found
+    foreach my $wit ( values %witnesses ) {
+	$tradition->add_witness( 'sigil' => $wit );
+    }
+
     my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0];
 
     # Add the nodes to the graph.  First delete the start node, because
@@ -63,19 +78,17 @@ sub parse {
     my $extra_data = {};
     my @nodes = $xpc->findnodes( '//g:node' );
     foreach my $n ( @nodes ) {
-	my $lookup_xpath = './g:data[@key="%s"]/child::text()';
-	my $id = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'number'} ), $n );
-	my $label = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'token'} ), $n );
+	my $id = _lookup_node_data( $n, 'number' );
+	my $label = _lookup_node_data( $n, 'token' );
 	my $gnode = $collation->add_reading( $id );
 	$node_name{ $n->getAttribute('id') } = $id;
 	$gnode->set_attribute( 'label', $label );
 
-	# Now get the rest of the data
+	# Now get the rest of the data, i.e. not the ID or label
 	my $extra = {};
-	my @keys = grep { $_ !~ /^(number|token)$/ } keys( %nodedata );
-	foreach my $k ( @keys ) {
-	    my $data = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{ $k } ), $n );
-	    $extra->{ $k } = $data;
+	foreach my $k ( keys %nodedata ) {
+	    next if $k =~ /^(number|token)$/;
+	    $extra->{ $k } = _lookup_node_data( $n, $k );
 	}
 	$extra_data->{ $id } = $extra;
     }
@@ -88,8 +101,7 @@ sub parse {
 	# Label according to the witnesses present.
 	my @wit_ids = $xpc->findnodes( './g:data/attribute::key', $e );
 	my @wit_names = map { $witnesses{ $_->getValue() } } @wit_ids;
-	my $label = join( ', ', @wit_names );
-	    
+	my $label = $collation->path_label( @wit_names );
 	$collation->add_path( $from, $to, $label );
     }
 
@@ -107,85 +119,51 @@ sub parse {
 			       $node_name{ $xpc->findvalue( $id_xpath, $tn ) } ) );
     }
 
-
     # Find the beginning and end nodes of the graph.  The beginning node
     # has no incoming edges; the end node has no outgoing edges.
     my( $begin_node, $end_node );
     foreach my $gnode ( $collation->readings() ) {
-	print STDERR "Checking node " . $gnode->name . "\n";
+	# print STDERR "Checking node " . $gnode->name . "\n";
 	my @outgoing = $gnode->outgoing();
 	my @incoming = $gnode->incoming();
 
 	unless( scalar @incoming ) {
 	    warn "Already have a beginning node" if $begin_node;
-	    my $node_xml_id = $node_id{ $gnode->name() };
-	    my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' );
-	    warn "XPath did not find a node for id $node_xml_id"
-		unless scalar @bn;
-	    $begin_node = $bn[0];
+	    $begin_node = $gnode;
 	    $collation->start( $gnode );
-	    $node_name{ $begin_node->getAttribute( 'id' ) } = '#START#';
-	    $node_id{'#START#'} = $begin_node->getAttribute( 'id' );
 	}
 	unless( scalar @outgoing ) {
 	    warn "Already have an ending node" if $end_node;
-	    my $node_xml_id = $node_id{ $gnode->name() };
-	    my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' );
-	    warn "XPath did not find a node for id $node_xml_id"
-		unless scalar @bn;
-	    $end_node = $bn[0];
+	    $end_node = $gnode;
 	}
     }
 
-    # Now for each witness, walk the path through the graph.
-    # Then we need to find the common nodes.  
-    # TODO This method is going to fall down if we have a very gappy 
-    # text in the collation.
-    # TODO think about whether it makes more sense to do this in the
-    # XML or in the graph. Right now it's the XML.
-    my $paths = {};
-    my @common_nodes;
-    foreach my $wit ( keys %witnesses ) {
-	my $node_id = $begin_node->getAttribute('id');
-	my @wit_path = ( $node_name{ $node_id } );
-	# TODO Detect loops at some point
-	while( $node_id ne $end_node->getAttribute('id') ) {
-	    # Find the node which is the target of the edge whose
-	    # source is $node_id and applies to this witness.
-	    my $xpath_expr = '//g:edge[child::g:data[@key="' 
-		. $wit . '"] and attribute::source="'
-		. $node_id . '"]';
-	    my $next_edge = $xpc->find( $xpath_expr, $graph_el )->[0];
-	    print STDERR " - at $wit / $node_id\n";
-	    $node_id = $next_edge->getAttribute('target');
-	    push( @wit_path, $node_name{ $node_id } );
-	}
-	$paths->{ $witnesses{ $wit }} = \@wit_path;
-	if( @common_nodes ) {
-	    my @cn;
-	    foreach my $n ( @wit_path) {
-		push( @cn, $n ) if grep { $_ eq $n } @common_nodes;
-	    }
-	    @common_nodes = ();
-	    push( @common_nodes, @cn );
-	} else {
-	    push( @common_nodes, @wit_path );
+    $collation->walk_witness_paths( $end_node );
+    # Now we have added the witnesses and their paths, so have also
+    # implicitly marked the common nodes. Now we can calculate their
+    # explicit permissions.  This is separate because it won't always
+    # be necessary with the GraphML parsing.
+    $collation->calculate_positions() unless $has_explicit_positions;
+    if( $has_explicit_positions ) {
+	# Record the positions that came with each graph node.
+	# TODO we really need to translate these into our own style of
+	# position identifier.  That's why we defer this until now.
+	foreach my $node_id ( keys %$extra_data ) {
+	    my $pos = $extra_data->{$node_id}->{'position'};
+	    $collation->reading( $node_name{$node_id} )->position( $pos );
 	}
+    } else {
+	# Calculate a position for each graph node.
+	$collation->calculate_positions();
     }
+}
 
-    # Mark all the nodes as either common or not.
-    foreach my $cn ( @common_nodes ) {
-	print STDERR "Setting $cn as common node\n";
-	$collation->reading( $cn )->set_attribute( 'class', 'common' );
-    }
-    foreach my $n ( $collation->readings() ) {
-	$n->set_attribute( 'class', 'variant' )
-	    unless $n->get_attribute( 'class' ) eq 'common';
-    }
-
-    # Now calculate graph positions.
-    # $collation->make_positions( \@common_nodes, $paths );
-
+sub _lookup_node_data {
+    my( $xmlnode, $key ) = @_;
+    my $lookup_xpath = './g:data[@key="%s"]/child::text()';
+    my $data = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{$key} ), 
+				$xmlnode );
+    return $data;
 }
     
 =back
diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index 1eccbb0..8480e71 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -24,6 +24,11 @@ has 'source' => (
     predicate => 'has_source',
     );
 
+has 'path' => (
+    is => 'rw',
+    isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
+    );	       
+
 sub BUILD {
     my $self = shift;
     if( $self->has_source ) {