Commit | Line | Data |
e58153d6 |
1 | package Text::Tradition::Parser::GraphML; |
b49c4318 |
2 | |
3 | use strict; |
4 | use warnings; |
b49c4318 |
5 | use XML::LibXML; |
6 | use XML::LibXML::XPathContext; |
7 | |
2ceca8c3 |
8 | =head1 NAME |
b49c4318 |
9 | |
2ceca8c3 |
10 | Text::Tradition::Parser::GraphML |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a GraphML file that describes |
15 | a collation graph. For further information on the GraphML format for |
16 | text collation, see http://gregor.middell.net/collatex/ |
17 | |
18 | =head1 METHODS |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $graph, $graphml_string ); |
25 | |
26 | Takes an initialized Text::Tradition::Graph object and a string |
27 | containing the GraphML; creates the appropriate nodes and edges on the |
28 | graph. |
29 | |
30 | =cut |
b49c4318 |
31 | |
32 | sub parse { |
33 | my( $graph, $graphml_str ) = @_; |
34 | |
35 | my $parser = XML::LibXML->new(); |
36 | my $doc = $parser->parse_string( $graphml_str ); |
37 | my $collation = $doc->documentElement(); |
38 | my $xpc = XML::LibXML::XPathContext->new( $collation ); |
39 | $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' ); |
40 | |
41 | # First get the ID keys, for witnesses and for collation data |
42 | my %nodedata; |
43 | my %witnesses; |
44 | foreach my $k ( $xpc->findnodes( '//g:key' ) ) { |
45 | # Each key has a 'for' attribute; the edge keys are witnesses, and |
46 | # the node keys contain an ID and string for each node. |
47 | |
48 | if( $k->getAttribute( 'for' ) eq 'node' ) { |
49 | $nodedata{ $k->getAttribute( 'attr.name' ) } = $k->getAttribute( 'id' ); |
50 | } else { |
51 | $witnesses{ $k->getAttribute( 'id' ) } = $k->getAttribute( 'attr.name' ); |
52 | } |
53 | } |
54 | |
55 | my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0]; |
56 | |
57 | # Add the nodes to the graph. First delete the start node, because |
58 | # GraphML graphs will have their own start nodes. |
59 | $graph->del_node( $graph->start() ); |
60 | # Map from XML IDs to node name/identity |
61 | my %node_name; |
62 | # Keep track of whatever extra info we're passed |
63 | my $extra_data = {}; |
64 | my @nodes = $xpc->findnodes( '//g:node' ); |
65 | foreach my $n ( @nodes ) { |
66 | my $lookup_xpath = './g:data[@key="%s"]/child::text()'; |
67 | my $id = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'number'} ), $n ); |
68 | my $label = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'token'} ), $n ); |
69 | my $gnode = $graph->add_node( $id ); |
70 | $node_name{ $n->getAttribute('id') } = $id; |
71 | $gnode->set_attribute( 'label', $label ); |
72 | |
73 | # Now get the rest of the data |
74 | my $extra = {}; |
75 | my @keys = grep { $_ !~ /^(number|token)$/ } keys( %nodedata ); |
76 | foreach my $k ( @keys ) { |
77 | my $data = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{ $k } ), $n ); |
78 | $extra->{ $k } = $data; |
79 | } |
80 | $extra_data->{ $id } = $extra; |
81 | } |
82 | |
83 | # Now add the edges. |
84 | my @edges = $xpc->findnodes( '//g:edge' ); |
85 | foreach my $e ( @edges ) { |
86 | my $from = $node_name{ $e->getAttribute('source') }; |
87 | my $to = $node_name{ $e->getAttribute('target') }; |
88 | # Label according to the witnesses present. |
89 | my @wit_ids = $xpc->findnodes( './g:data/attribute::key', $e ); |
90 | my @wit_names = map { $witnesses{ $_->getValue() } } @wit_ids; |
91 | my $label = join( ', ', @wit_names ); |
92 | |
93 | $graph->add_edge( $from, $to, $label ); |
94 | } |
95 | |
96 | ## Reverse the node_name hash so that we have two-way lookup. |
97 | my %node_id = reverse %node_name; |
a25d4374 |
98 | |
99 | ## Record the nodes that are marked as transposed. |
c557b209 |
100 | my $tr_xpath = '//g:node[g:data[@key="' . $nodedata{'identical'} . '"]]'; |
c2d16875 |
101 | my $transposition_nodes = $xpc->find( $tr_xpath ); |
102 | foreach my $tn ( @$transposition_nodes ) { |
103 | my $id_xpath = sprintf( './g:data[@key="%s"]/text()', |
c557b209 |
104 | $nodedata{'identical'} ); |
c2d16875 |
105 | $graph->set_identical_node( $node_name{ $tn->getAttribute( 'id' ) }, |
106 | $node_name{ $xpc->findvalue( $id_xpath, |
107 | $tn ) } ); |
a25d4374 |
108 | } |
a25d4374 |
109 | |
b49c4318 |
110 | |
111 | # Find the beginning and end nodes of the graph. The beginning node |
112 | # has no incoming edges; the end node has no outgoing edges. |
113 | my( $begin_node, $end_node ); |
114 | foreach my $gnode ( $graph->nodes() ) { |
115 | print STDERR "Checking node " . $gnode->name . "\n"; |
116 | my @outgoing = $gnode->outgoing(); |
117 | my @incoming = $gnode->incoming(); |
118 | |
119 | unless( scalar @incoming ) { |
120 | warn "Already have a beginning node" if $begin_node; |
121 | my $node_xml_id = $node_id{ $gnode->name() }; |
122 | my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' ); |
123 | warn "XPath did not find a node for id $node_xml_id" |
124 | unless scalar @bn; |
125 | $begin_node = $bn[0]; |
126 | $graph->start( $gnode ); |
c557b209 |
127 | $node_name{ $begin_node->getAttribute( 'id' ) } = '#START#'; |
128 | $node_id{'#START#'} = $begin_node->getAttribute( 'id' ); |
b49c4318 |
129 | } |
130 | unless( scalar @outgoing ) { |
131 | warn "Already have an ending node" if $end_node; |
132 | my $node_xml_id = $node_id{ $gnode->name() }; |
133 | my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' ); |
134 | warn "XPath did not find a node for id $node_xml_id" |
135 | unless scalar @bn; |
136 | $end_node = $bn[0]; |
137 | } |
138 | } |
139 | |
140 | # Now for each witness, walk the path through the graph. |
141 | # Then we need to find the common nodes. |
142 | # TODO This method is going to fall down if we have a very gappy |
143 | # text in the collation. |
144 | # TODO think about whether it makes more sense to do this in the |
145 | # XML or in the graph. Right now it's the XML. |
146 | my $paths = {}; |
147 | my @common_nodes; |
148 | foreach my $wit ( keys %witnesses ) { |
149 | my $node_id = $begin_node->getAttribute('id'); |
150 | my @wit_path = ( $node_name{ $node_id } ); |
151 | # TODO Detect loops at some point |
c557b209 |
152 | while( $node_id ne $end_node->getAttribute('id') ) { |
b49c4318 |
153 | # Find the node which is the target of the edge whose |
154 | # source is $node_id and applies to this witness. |
155 | my $xpath_expr = '//g:edge[child::g:data[@key="' |
156 | . $wit . '"] and attribute::source="' |
157 | . $node_id . '"]'; |
158 | my $next_edge = $xpc->find( $xpath_expr, $graph_el )->[0]; |
c557b209 |
159 | print STDERR " - at $wit / $node_id\n"; |
b49c4318 |
160 | $node_id = $next_edge->getAttribute('target'); |
161 | push( @wit_path, $node_name{ $node_id } ); |
162 | } |
163 | $paths->{ $witnesses{ $wit }} = \@wit_path; |
164 | if( @common_nodes ) { |
165 | my @cn; |
166 | foreach my $n ( @wit_path) { |
167 | push( @cn, $n ) if grep { $_ eq $n } @common_nodes; |
168 | } |
169 | @common_nodes = (); |
170 | push( @common_nodes, @cn ); |
171 | } else { |
172 | push( @common_nodes, @wit_path ); |
173 | } |
174 | } |
175 | |
176 | # Mark all the nodes as either common or not. |
177 | foreach my $cn ( @common_nodes ) { |
178 | print STDERR "Setting $cn as common node\n"; |
179 | $graph->node( $cn )->set_attribute( 'class', 'common' ); |
180 | } |
181 | foreach my $n ( $graph->nodes() ) { |
182 | $n->set_attribute( 'class', 'variant' ) |
183 | unless $n->get_attribute( 'class' ) eq 'common'; |
184 | } |
185 | |
a25d4374 |
186 | # Now calculate graph positions. |
187 | $graph->make_positions( \@common_nodes, $paths ); |
188 | |
b49c4318 |
189 | } |
190 | |
2ceca8c3 |
191 | =back |
192 | |
193 | =head1 LICENSE |
194 | |
195 | This package is free software and is provided "as is" without express |
196 | or implied warranty. You can redistribute it and/or modify it under |
197 | the same terms as Perl itself. |
198 | |
199 | =head1 AUTHOR |
200 | |
201 | Tara L Andrews, aurum@cpan.org |
202 | |
203 | =cut |
204 | |
b49c4318 |
205 | 1; |