From: Tara L Andrews <tla\@mit.edu>
Date: Mon, 16 Jan 2012 21:02:36 +0000 (+0100)
Subject: parse our new GraphML format
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2626f7099a2e4f00962f6b33dd2ff66a7b41c5a1;hp=c84275ff42c4d3e6f7fbc13140101975c990101a;p=scpubgit%2Fstemmatology.git

parse our new GraphML format
---

diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 3cc85b8..00b48b7 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -610,7 +610,8 @@ sub as_graphml {
 	}
 	
 	# Add the relationship graph to the XML
-	$self->relations->as_graphml( $root, $graphml_ns, \%node_hash, \%edge_data_keys );
+	$self->relations->as_graphml( $graphml_ns, $root, \%node_hash, 
+		$node_data_keys{'id'}, \%edge_data_keys );
 
     # Save and return the thing
     my $result = decode_utf8( $graphml->toString(1) );
diff --git a/lib/Text/Tradition/Collation/RelationshipStore.pm b/lib/Text/Tradition/Collation/RelationshipStore.pm
index ba8b2de..69e5ccb 100644
--- a/lib/Text/Tradition/Collation/RelationshipStore.pm
+++ b/lib/Text/Tradition/Collation/RelationshipStore.pm
@@ -8,7 +8,8 @@ use Moose;
 
 =head1 NAME
 
-Text::Tradition::Collation::Reading - represents a reading (usually a word) in a collation.
+Text::Tradition::Collation::RelationshipStore - Keeps track of the relationships
+between readings in a given collation
     
 =head1 DESCRIPTION
 
@@ -65,7 +66,7 @@ sub create {
 	if( $self->graph->has_edge( $source, $target ) ) {
 		$rel = $self->graph->get_edge_attribute( $source, $target, 'object' );
 		if( $rel->type ne $options->type ) {
-			warn "Relationship of type " . $rel->type 
+			warn "Another relationship of type " . $rel->type 
 				. "already exists between $source and $target";
 			return;
 		} else {
@@ -325,7 +326,7 @@ sub merge_readings {
 }
 
 sub as_graphml { 
-	my( $self, $graphml_ns, $xmlroot, $node_hash, $edge_keys ) = @_;
+	my( $self, $graphml_ns, $xmlroot, $node_hash, $nodeid_key, $edge_keys ) = @_;
 	
     my $rgraph = $xmlroot->addNewChild( $graphml_ns, 'graph' );
 	$rgraph->setAttribute( 'edgedefault', 'directed' );
@@ -337,9 +338,11 @@ sub as_graphml {
     $rgraph->setAttribute( 'parse.order', 'nodesfirst' );
     
     # Add the vertices according to their XML IDs
-    foreach my $n ( sort _by_xmlid values( %$node_hash ) ) {
+    my %rdg_lookup = ( reverse %$node_hash );
+    foreach my $n ( sort _by_xmlid keys( %rdg_lookup ) ) {
     	my $n_el = $rgraph->addNewChild( $graphml_ns, 'node' );
     	$n_el->setAttribute( 'id', $n );
+    	_add_graphml_data( $n_el, $nodeid_key, $rdg_lookup{$n} );
     }
     
     # Add the relationship edges, with their object information
@@ -362,9 +365,11 @@ sub as_graphml {
 }
 
 sub _by_xmlid {
-	$a =~ s/\D//g;
-	$b =~ s/\D//g;
-	return $a <=> $b;
+	my $tmp_a = $a;
+	my $tmp_b = $b;
+	$tmp_a =~ s/\D//g;
+	$tmp_b =~ s/\D//g;
+	return $tmp_a <=> $tmp_b;
 }
 
 sub _add_graphml_data {
diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm
index 6618a73..1ee448b 100644
--- a/lib/Text/Tradition/Parser/CollateX.pm
+++ b/lib/Text/Tradition/Parser/CollateX.pm
@@ -79,7 +79,7 @@ my $TRANSKEY = 'identical';
 
 sub parse {
     my( $tradition, $opts ) = @_;
-    my $graph_data = graphml_parse( $opts );
+    my( $graph_data ) = graphml_parse( $opts );
     my $collation = $tradition->collation;
 
 	# First add the readings to the graph.
diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm
index e6fbddc..5cd33bb 100644
--- a/lib/Text/Tradition/Parser/GraphML.pm
+++ b/lib/Text/Tradition/Parser/GraphML.pm
@@ -40,9 +40,6 @@ and their associated data.
 sub graphml_parse {
     my( $opts ) = @_;
 
-    my $graph_hash = { 'nodes' => [],
-                       'edges' => [] };
-                       
     my $parser = XML::LibXML->new();
     my $doc;
     if( exists $opts->{'string'} ) {
@@ -54,15 +51,15 @@ sub graphml_parse {
         return;
     }
     
-    my( $graphattr, $nodedata, $witnesses ) = ( {}, {}, {} );
+    my( $graphattr, $nodedata, $edgedata ) = ( {}, {}, {} );
     my $graphml = $doc->documentElement();
     $xpc = XML::LibXML::XPathContext->new( $graphml );
     $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' );
     
-    # First get the ID keys, for witnesses and for collation data
+    # First get the ID keys, for node/edge data and for collation data
     foreach my $k ( $xpc->findnodes( '//g:key' ) ) {
-        # Each key has a 'for' attribute; the edge keys are witnesses, and
-        # the node keys contain an ID and string for each node.
+        # Each key has a 'for' attribute to say whether it is for graph,
+        # node, or edge.
         my $keyid = $k->getAttribute( 'id' );
         my $keyname = $k->getAttribute( 'attr.name' );
 
@@ -74,60 +71,65 @@ sub graphml_parse {
         } elsif( $dtype eq 'node' ) {
             $nodedata->{$keyid} = $keyname;
         } else {
-            $witnesses->{$keyid} = $keyname;
+            $edgedata->{$keyid} = $keyname;
         }
     }
 
-    my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0];
-    $graph_hash->{'name'} = $graph_el->getAttribute( 'id' );
-
-    my $node_reg = {};
-    
-    # Read in graph globals (if any).
-    # print STDERR "Reading graphml global data\n";
-    foreach my $dkey ( keys %$graphattr ) {
-    	my $keyname = $graphattr->{$dkey};
-    	my $keyvalue = _lookup_node_data( $graph_el, $dkey );
-    	$graph_hash->{'global'}->{$keyname} = $keyvalue;
-    }
-
-    # Add the nodes to the graph hash.
-    # print STDERR "Reading graphml nodes\n"; 
-    my @nodes = $xpc->findnodes( '//g:node' );
-    foreach my $n ( @nodes ) {
-        # Could use a better way of registering these
-        my $node_hash = {};
-        foreach my $dkey ( keys %$nodedata ) {
-            my $keyname = $nodedata->{$dkey};
-            my $keyvalue = _lookup_node_data( $n, $dkey );
-            $node_hash->{$keyname} = $keyvalue if defined $keyvalue;
-        }
-        $node_reg->{$n->getAttribute( 'id' )} = $node_hash;
-        push( @{$graph_hash->{'nodes'}}, $node_hash );
-    }
-        
-    # Now add the edges, and cross-ref with the node objects.
-    # print STDERR "Reading graphml edges\n";
-    my @edges = $xpc->findnodes( '//g:edge' );
-    foreach my $e ( @edges ) {
-        my $from = $e->getAttribute('source');
-        my $to = $e->getAttribute('target');
-
-        # We don't know whether the edge data is one per witness
-        # or one per witness type, or something else.  So we just
-        # save it and let our calling parser decide.
-        my $edge_hash = {
-            'source' => $node_reg->{$from},
-            'target' => $node_reg->{$to},
-        };
-        foreach my $wkey( keys %$witnesses ) {
-            my $wname = $witnesses->{$wkey};
-            my $wlabel = _lookup_node_data( $e, $wkey );
-            $edge_hash->{$wname} = $wlabel if $wlabel;
-        }
-        push( @{$graph_hash->{'edges'}}, $edge_hash );
+    my @returned_graphs;
+    foreach my $graph_el ( $xpc->findnodes( '/g:graphml/g:graph' ) ) {
+        my $graph_hash = { 'nodes' => [],
+						   'edges' => [],
+						   'name'  => $graph_el->getAttribute( 'id' ) };
+                       	
+		my $node_reg = {};
+		
+		# Read in graph globals (if any).
+		# print STDERR "Reading graphml global data\n";
+		foreach my $dkey ( keys %$graphattr ) {
+			my $keyname = $graphattr->{$dkey};
+			my $keyvalue = _lookup_node_data( $graph_el, $dkey );
+			$graph_hash->{'global'}->{$keyname} = $keyvalue if defined $keyvalue;
+		}
+	
+		# Add the nodes to the graph hash.
+		# print STDERR "Reading graphml nodes\n"; 
+		my @nodes = $xpc->findnodes( './/g:node', $graph_el );
+		foreach my $n ( @nodes ) {
+			# Could use a better way of registering these
+			my $node_hash = {};
+			foreach my $dkey ( keys %$nodedata ) {
+				my $keyname = $nodedata->{$dkey};
+				my $keyvalue = _lookup_node_data( $n, $dkey );
+				$node_hash->{$keyname} = $keyvalue if defined $keyvalue;
+			}
+			$node_reg->{$n->getAttribute( 'id' )} = $node_hash;
+			push( @{$graph_hash->{'nodes'}}, $node_hash );
+		}
+			
+		# Now add the edges, and cross-ref with the node objects.
+		# print STDERR "Reading graphml edges\n";
+		my @edges = $xpc->findnodes( './/g:edge', $graph_el );
+		foreach my $e ( @edges ) {
+			my $from = $e->getAttribute('source');
+			my $to = $e->getAttribute('target');
+	
+			# We don't know whether the edge data is one per witness
+			# or one per witness type, or something else.  So we just
+			# save it and let our calling parser decide.
+			my $edge_hash = {
+				'source' => $node_reg->{$from},
+				'target' => $node_reg->{$to},
+			};
+			foreach my $wkey( keys %$edgedata ) {
+				my $wname = $edgedata->{$wkey};
+				my $wlabel = _lookup_node_data( $e, $wkey );
+				$edge_hash->{$wname} = $wlabel if $wlabel;
+			}
+			push( @{$graph_hash->{'edges'}}, $edge_hash );
+		}
+    	push( @returned_graphs, $graph_hash );
     }
-    return $graph_hash;
+    return @returned_graphs;
 }
 
 
diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm
index f574d37..dee5969 100644
--- a/lib/Text/Tradition/Parser/Self.pm
+++ b/lib/Text/Tradition/Parser/Self.pm
@@ -117,27 +117,28 @@ if( $t ) {
 
 =cut
 
-my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
+my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY,
 	$START_KEY, $END_KEY, $LACUNA_KEY,
 	$SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY,
-	$COLO_KEY, $CORRECT_KEY, $INDEP_KEY )
-    = qw/ name reading identical rank class
+	$SCOPE_KEY, $CORRECT_KEY, $INDEP_KEY )
+    = qw/ id text identical rank 
     	  is_start is_end is_lacuna 
     	  source target witness extra relationship
-    	  equal_rank non_correctable non_independent /;
+    	  scope non_correctable non_independent /;
 
 sub parse {
     my( $tradition, $opts ) = @_;
-    my $graph_data = graphml_parse( $opts );
+    
+    # Collation data is in the first graph; relationship-specific stuff 
+    # is in the second.
+    my( $graph_data, $rel_data ) = graphml_parse( $opts );
     
     my $collation = $tradition->collation;
     my %witnesses;
     
-    # Set up the graph-global attributes.  They will appear in the
-    # hash under their accessor names.
-    my $use_version;
     # print STDERR "Setting graph globals\n";
     $tradition->name( $graph_data->{'name'} );
+    my $use_version;
     foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
 		my $val = $graph_data->{'global'}->{$gkey};
 		if( $gkey eq 'version' ) {
@@ -146,25 +147,11 @@ sub parse {
 			$collation->$gkey( $val );
 		}
 	}
-	if( $use_version ) {
-		# Many of our tags changed.
-		$IDKEY = 'id';
-		$TOKENKEY = 'text';
-		$COLO_KEY = 'colocated';
-	}
 		
     # Add the nodes to the graph. 
 
-    my $extra_data = {}; # Keep track of data that needs to be processed
-                         # after the nodes & edges are created.
     # print STDERR "Adding graph nodes\n";
-    foreach my $n ( @{$graph_data->{'nodes'}} ) {
-    	unless( $use_version ) {
-    		# Backwards compat!
-    		$n->{$START_KEY} = 1 if $n->{$IDKEY} eq '#START#';
-    		$n->{$END_KEY} = 1 if $n->{$IDKEY} eq '#END#';
-    	}
-    	
+    foreach my $n ( @{$graph_data->{'nodes'}} ) {    	
     	# If it is the start or end node, we already have one, so
     	# grab the rank and go.
     	next if( defined $n->{$START_KEY} );
@@ -175,32 +162,18 @@ sub parse {
     	
     	# First extract the data that we can use without reference to
     	# anything else.
-    	my %node_data = %$n; # Need $n itself untouched for edge processing
         
         # Create the node.  
         my $reading_options = { 
-        	'id' => delete $node_data{$IDKEY},
-        	'is_lacuna' => delete $node_data{$LACUNA_KEY},
+        	'id' => $n->{$IDKEY},
+        	'is_lacuna' => $n->{$LACUNA_KEY},
         	};
-        my $rank = delete $node_data{$RANK_KEY};
+        my $rank = $n->{$RANK_KEY};
 		$reading_options->{'rank'} = $rank if $rank;
-		my $text = delete $node_data{$TOKENKEY};
+		my $text = $n->{$TOKENKEY};
 		$reading_options->{'text'} = $text if $text;
 
-        # This is a horrible hack for backwards compatibility.
-        unless( $use_version ) {
-			$reading_options->{'is_lacuna'} = 1 
-				if $reading_options->{'text'} =~ /^\#LACUNA/;
-		}
-		
-		delete $node_data{$CLASS_KEY}; # Not actually used
 		my $gnode = $collation->add_reading( $reading_options );
-
-        # Now save the data that we need for post-processing,
-        # if it exists. TODO this is unneeded after conversion
-        if ( keys %node_data ) {
-            $extra_data->{$gnode->id} = \%node_data
-        }
     }
         
     # Now add the edges.
@@ -208,58 +181,39 @@ sub parse {
     foreach my $e ( @{$graph_data->{'edges'}} ) {
         my $from = $e->{$SOURCE_KEY};
         my $to = $e->{$TARGET_KEY};
-        my $class = $e->{$CLASS_KEY} || 'path';
-
-        # We may have more information depending on the class.
-        if( $class eq 'path' ) {
-        	# We need the witness, and whether it is an 'extra' reading path.
-        	my $wit = $e->{$WITNESS_KEY};
-        	warn "No witness label on path edge!" unless $wit;
-        	my $extra = $e->{$EXTRA_KEY};
-        	my $label = $wit . ( $extra ? $collation->ac_label : '' );
-        	$collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
-        	# Add the witness if we don't have it already.
-			unless( $witnesses{$wit} ) {
-				$tradition->add_witness( sigil => $wit );
-				$witnesses{$wit} = 1;
-			}
-			$tradition->witness( $wit )->is_layered( 1 ) if $extra;
-        } elsif( $class eq 'relationship' ) {
-        	# We need the metadata about the relationship.
-        	my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
-        	$opts->{$COLO_KEY} = $e->{$COLO_KEY} 
-        		if exists $e->{$COLO_KEY};
-        	$opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY} 
-        		if exists $e->{$CORRECT_KEY};
-        	$opts->{$INDEP_KEY} = $e->{$INDEP_KEY} 
-        		if exists $e->{$INDEP_KEY};
-        	warn "No relationship type for relationship edge!" unless $opts->{'type'};
-        	my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
-        	unless( $ok ) {
-        		my $relinfo = $opts->{'type'} . ' ' 
-        			. join( ' -> ', $from->{$IDKEY}, $to->{$IDKEY} );
-        		warn "Did not add relationship $relinfo: @result";
-        	}
-        } 
-    }
 
-    ## Deal with node information (transposition, relationships, etc.) that
-    ## needs to be processed after all the nodes are created.
-    ## TODO unneeded after conversion
-    unless( $use_version ) {
-		# print STDERR "Adding second-pass node data\n";
-		foreach my $nkey ( keys %$extra_data ) {
-			foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
-				my $this_reading = $collation->reading( $nkey );
-				if( $edkey eq $TRANSPOS_KEY ) {
-					my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
-					$this_reading->set_identical( $other_reading );
-				} else {
-					warn "Unfamiliar reading node data $edkey for $nkey";
-				}
-			}
+		# We need the witness, and whether it is an 'extra' reading path.
+		my $wit = $e->{$WITNESS_KEY};
+		warn "No witness label on path edge!" unless $wit;
+		my $extra = $e->{$EXTRA_KEY};
+		my $label = $wit . ( $extra ? $collation->ac_label : '' );
+		$collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
+		# Add the witness if we don't have it already.
+		unless( $witnesses{$wit} ) {
+			$tradition->add_witness( sigil => $wit );
+			$witnesses{$wit} = 1;
 		}
+		$tradition->witness( $wit )->is_layered( 1 ) if $extra;
     }
+    
+    ## Done with the main graph, now look at the relationships.
+	# Nodes are added via the call to add_reading above.  We only need
+	# add the relationships themselves.
+	# TODO check that scoping does trt
+	foreach my $e ( @{$rel_data->{'edges'}} ) {
+		my $from = $e->{$SOURCE_KEY};
+		my $to = $e->{$TARGET_KEY};
+		my $relationship_opts = {
+			'type' => $e->{$RELATIONSHIP_KEY},
+			'scope' => $e->{$SCOPE_KEY},
+			};
+		$relationship_opts->{'non_correctable'} = $e->{$CORRECT_KEY}
+			if exists $e->{$CORRECT_KEY};
+		$relationship_opts->{'non_independent'} = $e->{$INDEP_KEY}
+			if exists $e->{$INDEP_KEY};
+		$collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, 
+			$relationship_opts );
+	}
 }
 
 1;