From: Tara L Andrews Date: Mon, 16 Jan 2012 21:02:36 +0000 (+0100) Subject: parse our new GraphML format X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=2626f7099a2e4f00962f6b33dd2ff66a7b41c5a1;hp=c84275ff42c4d3e6f7fbc13140101975c990101a;p=scpubgit%2Fstemmatology.git parse our new GraphML format --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 3cc85b8..00b48b7 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -610,7 +610,8 @@ sub as_graphml { } # Add the relationship graph to the XML - $self->relations->as_graphml( $root, $graphml_ns, \%node_hash, \%edge_data_keys ); + $self->relations->as_graphml( $graphml_ns, $root, \%node_hash, + $node_data_keys{'id'}, \%edge_data_keys ); # Save and return the thing my $result = decode_utf8( $graphml->toString(1) ); diff --git a/lib/Text/Tradition/Collation/RelationshipStore.pm b/lib/Text/Tradition/Collation/RelationshipStore.pm index ba8b2de..69e5ccb 100644 --- a/lib/Text/Tradition/Collation/RelationshipStore.pm +++ b/lib/Text/Tradition/Collation/RelationshipStore.pm @@ -8,7 +8,8 @@ use Moose; =head1 NAME -Text::Tradition::Collation::Reading - represents a reading (usually a word) in a collation. +Text::Tradition::Collation::RelationshipStore - Keeps track of the relationships +between readings in a given collation =head1 DESCRIPTION @@ -65,7 +66,7 @@ sub create { if( $self->graph->has_edge( $source, $target ) ) { $rel = $self->graph->get_edge_attribute( $source, $target, 'object' ); if( $rel->type ne $options->type ) { - warn "Relationship of type " . $rel->type + warn "Another relationship of type " . $rel->type . "already exists between $source and $target"; return; } else { @@ -325,7 +326,7 @@ sub merge_readings { } sub as_graphml { - my( $self, $graphml_ns, $xmlroot, $node_hash, $edge_keys ) = @_; + my( $self, $graphml_ns, $xmlroot, $node_hash, $nodeid_key, $edge_keys ) = @_; my $rgraph = $xmlroot->addNewChild( $graphml_ns, 'graph' ); $rgraph->setAttribute( 'edgedefault', 'directed' ); @@ -337,9 +338,11 @@ sub as_graphml { $rgraph->setAttribute( 'parse.order', 'nodesfirst' ); # Add the vertices according to their XML IDs - foreach my $n ( sort _by_xmlid values( %$node_hash ) ) { + my %rdg_lookup = ( reverse %$node_hash ); + foreach my $n ( sort _by_xmlid keys( %rdg_lookup ) ) { my $n_el = $rgraph->addNewChild( $graphml_ns, 'node' ); $n_el->setAttribute( 'id', $n ); + _add_graphml_data( $n_el, $nodeid_key, $rdg_lookup{$n} ); } # Add the relationship edges, with their object information @@ -362,9 +365,11 @@ sub as_graphml { } sub _by_xmlid { - $a =~ s/\D//g; - $b =~ s/\D//g; - return $a <=> $b; + my $tmp_a = $a; + my $tmp_b = $b; + $tmp_a =~ s/\D//g; + $tmp_b =~ s/\D//g; + return $tmp_a <=> $tmp_b; } sub _add_graphml_data { diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 6618a73..1ee448b 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -79,7 +79,7 @@ my $TRANSKEY = 'identical'; sub parse { my( $tradition, $opts ) = @_; - my $graph_data = graphml_parse( $opts ); + my( $graph_data ) = graphml_parse( $opts ); my $collation = $tradition->collation; # First add the readings to the graph. diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index e6fbddc..5cd33bb 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -40,9 +40,6 @@ and their associated data. sub graphml_parse { my( $opts ) = @_; - my $graph_hash = { 'nodes' => [], - 'edges' => [] }; - my $parser = XML::LibXML->new(); my $doc; if( exists $opts->{'string'} ) { @@ -54,15 +51,15 @@ sub graphml_parse { return; } - my( $graphattr, $nodedata, $witnesses ) = ( {}, {}, {} ); + my( $graphattr, $nodedata, $edgedata ) = ( {}, {}, {} ); my $graphml = $doc->documentElement(); $xpc = XML::LibXML::XPathContext->new( $graphml ); $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' ); - # First get the ID keys, for witnesses and for collation data + # First get the ID keys, for node/edge data and for collation data foreach my $k ( $xpc->findnodes( '//g:key' ) ) { - # Each key has a 'for' attribute; the edge keys are witnesses, and - # the node keys contain an ID and string for each node. + # Each key has a 'for' attribute to say whether it is for graph, + # node, or edge. my $keyid = $k->getAttribute( 'id' ); my $keyname = $k->getAttribute( 'attr.name' ); @@ -74,60 +71,65 @@ sub graphml_parse { } elsif( $dtype eq 'node' ) { $nodedata->{$keyid} = $keyname; } else { - $witnesses->{$keyid} = $keyname; + $edgedata->{$keyid} = $keyname; } } - my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0]; - $graph_hash->{'name'} = $graph_el->getAttribute( 'id' ); - - my $node_reg = {}; - - # Read in graph globals (if any). - # print STDERR "Reading graphml global data\n"; - foreach my $dkey ( keys %$graphattr ) { - my $keyname = $graphattr->{$dkey}; - my $keyvalue = _lookup_node_data( $graph_el, $dkey ); - $graph_hash->{'global'}->{$keyname} = $keyvalue; - } - - # Add the nodes to the graph hash. - # print STDERR "Reading graphml nodes\n"; - my @nodes = $xpc->findnodes( '//g:node' ); - foreach my $n ( @nodes ) { - # Could use a better way of registering these - my $node_hash = {}; - foreach my $dkey ( keys %$nodedata ) { - my $keyname = $nodedata->{$dkey}; - my $keyvalue = _lookup_node_data( $n, $dkey ); - $node_hash->{$keyname} = $keyvalue if defined $keyvalue; - } - $node_reg->{$n->getAttribute( 'id' )} = $node_hash; - push( @{$graph_hash->{'nodes'}}, $node_hash ); - } - - # Now add the edges, and cross-ref with the node objects. - # print STDERR "Reading graphml edges\n"; - my @edges = $xpc->findnodes( '//g:edge' ); - foreach my $e ( @edges ) { - my $from = $e->getAttribute('source'); - my $to = $e->getAttribute('target'); - - # We don't know whether the edge data is one per witness - # or one per witness type, or something else. So we just - # save it and let our calling parser decide. - my $edge_hash = { - 'source' => $node_reg->{$from}, - 'target' => $node_reg->{$to}, - }; - foreach my $wkey( keys %$witnesses ) { - my $wname = $witnesses->{$wkey}; - my $wlabel = _lookup_node_data( $e, $wkey ); - $edge_hash->{$wname} = $wlabel if $wlabel; - } - push( @{$graph_hash->{'edges'}}, $edge_hash ); + my @returned_graphs; + foreach my $graph_el ( $xpc->findnodes( '/g:graphml/g:graph' ) ) { + my $graph_hash = { 'nodes' => [], + 'edges' => [], + 'name' => $graph_el->getAttribute( 'id' ) }; + + my $node_reg = {}; + + # Read in graph globals (if any). + # print STDERR "Reading graphml global data\n"; + foreach my $dkey ( keys %$graphattr ) { + my $keyname = $graphattr->{$dkey}; + my $keyvalue = _lookup_node_data( $graph_el, $dkey ); + $graph_hash->{'global'}->{$keyname} = $keyvalue if defined $keyvalue; + } + + # Add the nodes to the graph hash. + # print STDERR "Reading graphml nodes\n"; + my @nodes = $xpc->findnodes( './/g:node', $graph_el ); + foreach my $n ( @nodes ) { + # Could use a better way of registering these + my $node_hash = {}; + foreach my $dkey ( keys %$nodedata ) { + my $keyname = $nodedata->{$dkey}; + my $keyvalue = _lookup_node_data( $n, $dkey ); + $node_hash->{$keyname} = $keyvalue if defined $keyvalue; + } + $node_reg->{$n->getAttribute( 'id' )} = $node_hash; + push( @{$graph_hash->{'nodes'}}, $node_hash ); + } + + # Now add the edges, and cross-ref with the node objects. + # print STDERR "Reading graphml edges\n"; + my @edges = $xpc->findnodes( './/g:edge', $graph_el ); + foreach my $e ( @edges ) { + my $from = $e->getAttribute('source'); + my $to = $e->getAttribute('target'); + + # We don't know whether the edge data is one per witness + # or one per witness type, or something else. So we just + # save it and let our calling parser decide. + my $edge_hash = { + 'source' => $node_reg->{$from}, + 'target' => $node_reg->{$to}, + }; + foreach my $wkey( keys %$edgedata ) { + my $wname = $edgedata->{$wkey}; + my $wlabel = _lookup_node_data( $e, $wkey ); + $edge_hash->{$wname} = $wlabel if $wlabel; + } + push( @{$graph_hash->{'edges'}}, $edge_hash ); + } + push( @returned_graphs, $graph_hash ); } - return $graph_hash; + return @returned_graphs; } diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index f574d37..dee5969 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -117,27 +117,28 @@ if( $t ) { =cut -my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY, +my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $START_KEY, $END_KEY, $LACUNA_KEY, $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY, - $COLO_KEY, $CORRECT_KEY, $INDEP_KEY ) - = qw/ name reading identical rank class + $SCOPE_KEY, $CORRECT_KEY, $INDEP_KEY ) + = qw/ id text identical rank is_start is_end is_lacuna source target witness extra relationship - equal_rank non_correctable non_independent /; + scope non_correctable non_independent /; sub parse { my( $tradition, $opts ) = @_; - my $graph_data = graphml_parse( $opts ); + + # Collation data is in the first graph; relationship-specific stuff + # is in the second. + my( $graph_data, $rel_data ) = graphml_parse( $opts ); my $collation = $tradition->collation; my %witnesses; - # Set up the graph-global attributes. They will appear in the - # hash under their accessor names. - my $use_version; # print STDERR "Setting graph globals\n"; $tradition->name( $graph_data->{'name'} ); + my $use_version; foreach my $gkey ( keys %{$graph_data->{'global'}} ) { my $val = $graph_data->{'global'}->{$gkey}; if( $gkey eq 'version' ) { @@ -146,25 +147,11 @@ sub parse { $collation->$gkey( $val ); } } - if( $use_version ) { - # Many of our tags changed. - $IDKEY = 'id'; - $TOKENKEY = 'text'; - $COLO_KEY = 'colocated'; - } # Add the nodes to the graph. - my $extra_data = {}; # Keep track of data that needs to be processed - # after the nodes & edges are created. # print STDERR "Adding graph nodes\n"; - foreach my $n ( @{$graph_data->{'nodes'}} ) { - unless( $use_version ) { - # Backwards compat! - $n->{$START_KEY} = 1 if $n->{$IDKEY} eq '#START#'; - $n->{$END_KEY} = 1 if $n->{$IDKEY} eq '#END#'; - } - + foreach my $n ( @{$graph_data->{'nodes'}} ) { # If it is the start or end node, we already have one, so # grab the rank and go. next if( defined $n->{$START_KEY} ); @@ -175,32 +162,18 @@ sub parse { # First extract the data that we can use without reference to # anything else. - my %node_data = %$n; # Need $n itself untouched for edge processing # Create the node. my $reading_options = { - 'id' => delete $node_data{$IDKEY}, - 'is_lacuna' => delete $node_data{$LACUNA_KEY}, + 'id' => $n->{$IDKEY}, + 'is_lacuna' => $n->{$LACUNA_KEY}, }; - my $rank = delete $node_data{$RANK_KEY}; + my $rank = $n->{$RANK_KEY}; $reading_options->{'rank'} = $rank if $rank; - my $text = delete $node_data{$TOKENKEY}; + my $text = $n->{$TOKENKEY}; $reading_options->{'text'} = $text if $text; - # This is a horrible hack for backwards compatibility. - unless( $use_version ) { - $reading_options->{'is_lacuna'} = 1 - if $reading_options->{'text'} =~ /^\#LACUNA/; - } - - delete $node_data{$CLASS_KEY}; # Not actually used my $gnode = $collation->add_reading( $reading_options ); - - # Now save the data that we need for post-processing, - # if it exists. TODO this is unneeded after conversion - if ( keys %node_data ) { - $extra_data->{$gnode->id} = \%node_data - } } # Now add the edges. @@ -208,58 +181,39 @@ sub parse { foreach my $e ( @{$graph_data->{'edges'}} ) { my $from = $e->{$SOURCE_KEY}; my $to = $e->{$TARGET_KEY}; - my $class = $e->{$CLASS_KEY} || 'path'; - - # We may have more information depending on the class. - if( $class eq 'path' ) { - # We need the witness, and whether it is an 'extra' reading path. - my $wit = $e->{$WITNESS_KEY}; - warn "No witness label on path edge!" unless $wit; - my $extra = $e->{$EXTRA_KEY}; - my $label = $wit . ( $extra ? $collation->ac_label : '' ); - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); - # Add the witness if we don't have it already. - unless( $witnesses{$wit} ) { - $tradition->add_witness( sigil => $wit ); - $witnesses{$wit} = 1; - } - $tradition->witness( $wit )->is_layered( 1 ) if $extra; - } elsif( $class eq 'relationship' ) { - # We need the metadata about the relationship. - my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} }; - $opts->{$COLO_KEY} = $e->{$COLO_KEY} - if exists $e->{$COLO_KEY}; - $opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY} - if exists $e->{$CORRECT_KEY}; - $opts->{$INDEP_KEY} = $e->{$INDEP_KEY} - if exists $e->{$INDEP_KEY}; - warn "No relationship type for relationship edge!" unless $opts->{'type'}; - my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts ); - unless( $ok ) { - my $relinfo = $opts->{'type'} . ' ' - . join( ' -> ', $from->{$IDKEY}, $to->{$IDKEY} ); - warn "Did not add relationship $relinfo: @result"; - } - } - } - ## Deal with node information (transposition, relationships, etc.) that - ## needs to be processed after all the nodes are created. - ## TODO unneeded after conversion - unless( $use_version ) { - # print STDERR "Adding second-pass node data\n"; - foreach my $nkey ( keys %$extra_data ) { - foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { - my $this_reading = $collation->reading( $nkey ); - if( $edkey eq $TRANSPOS_KEY ) { - my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); - $this_reading->set_identical( $other_reading ); - } else { - warn "Unfamiliar reading node data $edkey for $nkey"; - } - } + # We need the witness, and whether it is an 'extra' reading path. + my $wit = $e->{$WITNESS_KEY}; + warn "No witness label on path edge!" unless $wit; + my $extra = $e->{$EXTRA_KEY}; + my $label = $wit . ( $extra ? $collation->ac_label : '' ); + $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); + # Add the witness if we don't have it already. + unless( $witnesses{$wit} ) { + $tradition->add_witness( sigil => $wit ); + $witnesses{$wit} = 1; } + $tradition->witness( $wit )->is_layered( 1 ) if $extra; } + + ## Done with the main graph, now look at the relationships. + # Nodes are added via the call to add_reading above. We only need + # add the relationships themselves. + # TODO check that scoping does trt + foreach my $e ( @{$rel_data->{'edges'}} ) { + my $from = $e->{$SOURCE_KEY}; + my $to = $e->{$TARGET_KEY}; + my $relationship_opts = { + 'type' => $e->{$RELATIONSHIP_KEY}, + 'scope' => $e->{$SCOPE_KEY}, + }; + $relationship_opts->{'non_correctable'} = $e->{$CORRECT_KEY} + if exists $e->{$CORRECT_KEY}; + $relationship_opts->{'non_independent'} = $e->{$INDEP_KEY} + if exists $e->{$INDEP_KEY}; + $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, + $relationship_opts ); + } } 1;