Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::GraphML; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
b49c4318 |
5 | use XML::LibXML; |
6 | use XML::LibXML::XPathContext; |
7 | |
8 | |
9 | # Takes a GraphML string; returns a Graph::Easy object. |
10 | |
11 | sub parse { |
12 | my( $graph, $graphml_str ) = @_; |
13 | |
14 | my $parser = XML::LibXML->new(); |
15 | my $doc = $parser->parse_string( $graphml_str ); |
16 | my $collation = $doc->documentElement(); |
17 | my $xpc = XML::LibXML::XPathContext->new( $collation ); |
18 | $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' ); |
19 | |
20 | # First get the ID keys, for witnesses and for collation data |
21 | my %nodedata; |
22 | my %witnesses; |
23 | foreach my $k ( $xpc->findnodes( '//g:key' ) ) { |
24 | # Each key has a 'for' attribute; the edge keys are witnesses, and |
25 | # the node keys contain an ID and string for each node. |
26 | |
27 | if( $k->getAttribute( 'for' ) eq 'node' ) { |
28 | $nodedata{ $k->getAttribute( 'attr.name' ) } = $k->getAttribute( 'id' ); |
29 | } else { |
30 | $witnesses{ $k->getAttribute( 'id' ) } = $k->getAttribute( 'attr.name' ); |
31 | } |
32 | } |
33 | |
34 | my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0]; |
35 | |
36 | # Add the nodes to the graph. First delete the start node, because |
37 | # GraphML graphs will have their own start nodes. |
38 | $graph->del_node( $graph->start() ); |
39 | # Map from XML IDs to node name/identity |
40 | my %node_name; |
41 | # Keep track of whatever extra info we're passed |
42 | my $extra_data = {}; |
43 | my @nodes = $xpc->findnodes( '//g:node' ); |
44 | foreach my $n ( @nodes ) { |
45 | my $lookup_xpath = './g:data[@key="%s"]/child::text()'; |
46 | my $id = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'number'} ), $n ); |
47 | my $label = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'token'} ), $n ); |
48 | my $gnode = $graph->add_node( $id ); |
49 | $node_name{ $n->getAttribute('id') } = $id; |
50 | $gnode->set_attribute( 'label', $label ); |
51 | |
52 | # Now get the rest of the data |
53 | my $extra = {}; |
54 | my @keys = grep { $_ !~ /^(number|token)$/ } keys( %nodedata ); |
55 | foreach my $k ( @keys ) { |
56 | my $data = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{ $k } ), $n ); |
57 | $extra->{ $k } = $data; |
58 | } |
59 | $extra_data->{ $id } = $extra; |
60 | } |
61 | |
62 | # Now add the edges. |
63 | my @edges = $xpc->findnodes( '//g:edge' ); |
64 | foreach my $e ( @edges ) { |
65 | my $from = $node_name{ $e->getAttribute('source') }; |
66 | my $to = $node_name{ $e->getAttribute('target') }; |
67 | # Label according to the witnesses present. |
68 | my @wit_ids = $xpc->findnodes( './g:data/attribute::key', $e ); |
69 | my @wit_names = map { $witnesses{ $_->getValue() } } @wit_ids; |
70 | my $label = join( ', ', @wit_names ); |
71 | |
72 | $graph->add_edge( $from, $to, $label ); |
73 | } |
74 | |
75 | ## Reverse the node_name hash so that we have two-way lookup. |
76 | my %node_id = reverse %node_name; |
a25d4374 |
77 | |
78 | ## Record the nodes that are marked as transposed. |
c2d16875 |
79 | my $tr_xpath = '//g:node[g:data[@key="' . $nodedata{'identity'} . '"]]'; |
80 | my $transposition_nodes = $xpc->find( $tr_xpath ); |
81 | foreach my $tn ( @$transposition_nodes ) { |
82 | my $id_xpath = sprintf( './g:data[@key="%s"]/text()', |
83 | $nodedata{'identity'} ); |
84 | $graph->set_identical_node( $node_name{ $tn->getAttribute( 'id' ) }, |
85 | $node_name{ $xpc->findvalue( $id_xpath, |
86 | $tn ) } ); |
a25d4374 |
87 | } |
a25d4374 |
88 | |
b49c4318 |
89 | |
90 | # Find the beginning and end nodes of the graph. The beginning node |
91 | # has no incoming edges; the end node has no outgoing edges. |
92 | my( $begin_node, $end_node ); |
93 | foreach my $gnode ( $graph->nodes() ) { |
94 | print STDERR "Checking node " . $gnode->name . "\n"; |
95 | my @outgoing = $gnode->outgoing(); |
96 | my @incoming = $gnode->incoming(); |
97 | |
98 | unless( scalar @incoming ) { |
99 | warn "Already have a beginning node" if $begin_node; |
100 | my $node_xml_id = $node_id{ $gnode->name() }; |
101 | my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' ); |
102 | warn "XPath did not find a node for id $node_xml_id" |
103 | unless scalar @bn; |
104 | $begin_node = $bn[0]; |
105 | $graph->start( $gnode ); |
106 | $node_name{ 0 } = '#START#'; |
107 | $node_id{'#START#'} = 0; |
108 | } |
109 | unless( scalar @outgoing ) { |
110 | warn "Already have an ending node" if $end_node; |
111 | my $node_xml_id = $node_id{ $gnode->name() }; |
112 | my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' ); |
113 | warn "XPath did not find a node for id $node_xml_id" |
114 | unless scalar @bn; |
115 | $end_node = $bn[0]; |
116 | } |
117 | } |
118 | |
119 | # Now for each witness, walk the path through the graph. |
120 | # Then we need to find the common nodes. |
121 | # TODO This method is going to fall down if we have a very gappy |
122 | # text in the collation. |
123 | # TODO think about whether it makes more sense to do this in the |
124 | # XML or in the graph. Right now it's the XML. |
125 | my $paths = {}; |
126 | my @common_nodes; |
127 | foreach my $wit ( keys %witnesses ) { |
128 | my $node_id = $begin_node->getAttribute('id'); |
129 | my @wit_path = ( $node_name{ $node_id } ); |
130 | # TODO Detect loops at some point |
131 | while( $node_id != $end_node->getAttribute('id') ) { |
132 | # Find the node which is the target of the edge whose |
133 | # source is $node_id and applies to this witness. |
134 | my $xpath_expr = '//g:edge[child::g:data[@key="' |
135 | . $wit . '"] and attribute::source="' |
136 | . $node_id . '"]'; |
137 | my $next_edge = $xpc->find( $xpath_expr, $graph_el )->[0]; |
138 | $node_id = $next_edge->getAttribute('target'); |
139 | push( @wit_path, $node_name{ $node_id } ); |
140 | } |
141 | $paths->{ $witnesses{ $wit }} = \@wit_path; |
142 | if( @common_nodes ) { |
143 | my @cn; |
144 | foreach my $n ( @wit_path) { |
145 | push( @cn, $n ) if grep { $_ eq $n } @common_nodes; |
146 | } |
147 | @common_nodes = (); |
148 | push( @common_nodes, @cn ); |
149 | } else { |
150 | push( @common_nodes, @wit_path ); |
151 | } |
152 | } |
153 | |
154 | # Mark all the nodes as either common or not. |
155 | foreach my $cn ( @common_nodes ) { |
156 | print STDERR "Setting $cn as common node\n"; |
157 | $graph->node( $cn )->set_attribute( 'class', 'common' ); |
158 | } |
159 | foreach my $n ( $graph->nodes() ) { |
160 | $n->set_attribute( 'class', 'variant' ) |
161 | unless $n->get_attribute( 'class' ) eq 'common'; |
162 | } |
163 | |
a25d4374 |
164 | # Now calculate graph positions. |
165 | $graph->make_positions( \@common_nodes, $paths ); |
166 | |
b49c4318 |
167 | } |
168 | |
169 | 1; |