CollateX version of GraphML parsing
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateX.pm
CommitLineData
cda6a45b 1package Text::Tradition::Parser::CollateX;
2
3use strict;
4use warnings;
5use Text::Tradition::Parser::GraphML;
6
7=head1 NAME
8
9Text::Tradition::Parser::CollateX
10
11=head1 DESCRIPTION
12
13Parser module for Text::Tradition, given a GraphML file from the
14CollateX program that describes a collation graph. For further
15information on the GraphML format for text collation, see
16http://gregor.middell.net/collatex/
17
18=head1 METHODS
19
20=over
21
22=item B<parse>
23
24parse( $graph, $graphml_string );
25
26Takes an initialized Text::Tradition::Graph object and a string
27containing the GraphML; creates the appropriate nodes and edges on the
28graph.
29
30=cut
31
32my $IDKEY = 'number';
33my $CONTENTKEY = 'token';
34my $TRANSKEY = 'identical';
35
36sub parse {
37 my( $tradition, $graphml_str ) = @_;
38 my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
39 my $collation = $tradition->collation;
40 my %witnesses; # Keep track of the witnesses we encounter as we
41 # run through the graph data.
42
43 # Add the nodes to the graph. First delete the start node, because
44 # GraphML graphs will have their own start nodes.
45 $collation->del_reading( $collation->start() );
46
47 my $extra_data = {}; # Keep track of info to be processed after all
48 # nodes have been created
49 foreach my $n ( @{$graph_data->{'nodes'}} ) {
50 my %node_data = %$n;
51 my $nodeid = delete $node_data{$IDKEY};
52 my $token = delete $node_data{$CONTENTKEY};
53 unless( $nodeid && $token ) {
54 warn "Did not find an ID or token for graph node, can't add it";
55 next;
56 }
57 my $gnode = $collation->add_reading( $nodeid );
58 $gnode->text( $token );
59
60 # Whatever is left is extra info to be processed later.
61 if( keys %node_data ) {
62 $extra_data->{$nodeid} = \%node_data;
63 }
64 }
65
66 # Now add the edges.
67 foreach my $e ( @{$graph_data->{'edges'}} ) {
68 my %edge_data = %$e;
69 my $from = delete $edge_data{'source'};
70 my $to = delete $edge_data{'target'};
71
72 # In CollateX, we have a distinct witness data ID per witness,
73 # so that we can have multiple witnesses per edge. We want to
74 # translate this to one witness per edge in our own
75 # representation.
76 foreach my $ekey ( keys %edge_data ) {
77 my $wit = $edge_data{$ekey};
78 # Create the witness object if it does not yet exist.
79 unless( $witnesses{$wit} ) {
80 $tradition->add_witness( 'sigil' => $wit );
81 $witnesses{$wit} = 1;
82 }
83 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
84 }
85 }
86
87 # Process the extra node data if it exists.
88 foreach my $nodeid ( keys %$extra_data ) {
89 my $ed = $extra_data->{$nodeid};
90 if( exists $ed->{$TRANSKEY} ) {
91
92 my $tn_reading = $collation->reading( $nodeid );
93 my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
94 if( $collation->linear ) {
95 $tn_reading->set_identical( $main_reading );
96 } else {
97 $collation->merge_readings( $main_reading, $tn_reading );
98 }
99 } # else we don't have any other tags to process yet.
100 }
101
102 # Find the beginning and end nodes of the graph. The beginning node
103 # has no incoming edges; the end node has no outgoing edges.
104 my( $begin_node, $end_node );
105 foreach my $gnode ( $collation->readings() ) {
106 # print STDERR "Checking node " . $gnode->name . "\n";
107 my @outgoing = $gnode->outgoing();
108 my @incoming = $gnode->incoming();
109
110 unless( scalar @incoming ) {
111 warn "Already have a beginning node" if $begin_node;
112 $begin_node = $gnode;
113 $collation->start( $gnode );
114 }
115 unless( scalar @outgoing ) {
116 warn "Already have an ending node" if $end_node;
117 $end_node = $gnode;
118 }
119 }
120
121 # Record for each witness its sequence of readings, and determine
122 # the common nodes.
123 my @common_nodes = $collation->walk_witness_paths( $end_node );
124
125 # Now we have added the witnesses and their paths, so have also
126 # implicitly marked the common nodes. Now we can calculate their
127 # explicit positions.
128 $collation->calculate_positions( @common_nodes );
129}
130
131=back
132
133=head1 LICENSE
134
135This package is free software and is provided "as is" without express
136or implied warranty. You can redistribute it and/or modify it under
137the same terms as Perl itself.
138
139=head1 AUTHOR
140
141Tara L Andrews, aurum@cpan.org
142
143=cut
144
1451;