continued doc and testing drive; rationalize GraphML a little
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateX.pm
CommitLineData
cda6a45b 1package Text::Tradition::Parser::CollateX;
2
3use strict;
4use warnings;
e867486f 5use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /;
cda6a45b 6
7=head1 NAME
8
9Text::Tradition::Parser::CollateX
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'CollateX',
18 'file' => '/path/to/collation.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'CollateX',
24 'string' => $collation_xml,
25 );
26
cda6a45b 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition, given a GraphML file from the
30CollateX program that describes a collation graph. For further
31information on the GraphML format for text collation, see
32http://gregor.middell.net/collatex/
33
34=head1 METHODS
35
e867486f 36=head2 B<parse>
cda6a45b 37
dfc37e38 38parse( $tradition, $init_options );
cda6a45b 39
e867486f 40Takes an initialized Text::Tradition object and a set of options; creates
41the appropriate nodes and edges on the graph. The options hash should
42include either a 'file' argument or a 'string' argument, depending on the
43source of the XML to be parsed.
44
45=begin testing
46
47use Text::Tradition;
48binmode STDOUT, ":utf8";
49binmode STDERR, ":utf8";
50eval { no warnings; binmode $DB::OUT, ":utf8"; };
51
52my $cxfile = 't/data/Collatex-16.xml';
53my $t = Text::Tradition->new(
54 'name' => 'inline',
55 'input' => 'CollateX',
56 'file' => $cxfile,
57 );
58
59is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
60if( $t ) {
61 is( scalar $t->collation->readings, 26, "Collation has all readings" );
62 is( scalar $t->collation->paths, 49, "Collation has all paths" );
63 is( scalar $t->witnesses, 3, "Collation has all witnesses" );
64
65 # Check an 'identical' node
66 my $transposed = $t->collation->reading( 'n15' );
67 ok( $transposed->has_primary, "Reading links to transposed primary" );
68 is( $transposed->primary->name, 'n17', "Correct transposition link" );
69}
70
71=end testing
cda6a45b 72
73=cut
74
75my $IDKEY = 'number';
76my $CONTENTKEY = 'token';
77my $TRANSKEY = 'identical';
78
79sub parse {
dfc37e38 80 my( $tradition, $opts ) = @_;
e867486f 81 my $graph_data = graphml_parse( $opts );
cda6a45b 82 my $collation = $tradition->collation;
83 my %witnesses; # Keep track of the witnesses we encounter as we
84 # run through the graph data.
85
86 # Add the nodes to the graph. First delete the start node, because
87 # GraphML graphs will have their own start nodes.
88 $collation->del_reading( $collation->start() );
910a0a6d 89 $collation->del_reading( $collation->end() );
cda6a45b 90
91 my $extra_data = {}; # Keep track of info to be processed after all
92 # nodes have been created
93 foreach my $n ( @{$graph_data->{'nodes'}} ) {
910a0a6d 94 my %node_data = %$n;
95 my $nodeid = delete $node_data{$IDKEY};
96 my $token = delete $node_data{$CONTENTKEY};
97 unless( defined $nodeid && defined $token ) {
98 warn "Did not find an ID or token for graph node, can't add it";
99 next;
100 }
101 my $gnode = $collation->add_reading( $nodeid );
102 $gnode->text( $token );
103
104 # Whatever is left is extra info to be processed later.
105 if( keys %node_data ) {
106 $extra_data->{$nodeid} = \%node_data;
107 }
cda6a45b 108 }
910a0a6d 109
cda6a45b 110 # Now add the edges.
111 foreach my $e ( @{$graph_data->{'edges'}} ) {
910a0a6d 112 my %edge_data = %$e;
113 my $from = delete $edge_data{'source'};
114 my $to = delete $edge_data{'target'};
115
116 # In CollateX, we have a distinct witness data ID per witness,
117 # so that we can have multiple witnesses per edge. We want to
118 # translate this to one witness per edge in our own
119 # representation.
120 foreach my $ekey ( keys %edge_data ) {
121 my $wit = $edge_data{$ekey};
122 # Create the witness object if it does not yet exist.
123 unless( $witnesses{$wit} ) {
124 $tradition->add_witness( 'sigil' => $wit );
125 $witnesses{$wit} = 1;
126 }
127 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
128 }
cda6a45b 129 }
130
131 # Process the extra node data if it exists.
132 foreach my $nodeid ( keys %$extra_data ) {
910a0a6d 133 my $ed = $extra_data->{$nodeid};
134 if( exists $ed->{$TRANSKEY} ) {
135
136 my $tn_reading = $collation->reading( $nodeid );
137 my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
138 if( $collation->linear ) {
139 $tn_reading->set_identical( $main_reading );
140 } else {
141 $collation->merge_readings( $main_reading, $tn_reading );
142 }
143 } # else we don't have any other tags to process yet.
cda6a45b 144 }
145
146 # Find the beginning and end nodes of the graph. The beginning node
147 # has no incoming edges; the end node has no outgoing edges.
148 my( $begin_node, $end_node );
149 foreach my $gnode ( $collation->readings() ) {
910a0a6d 150 # print STDERR "Checking node " . $gnode->name . "\n";
151 my @outgoing = $gnode->outgoing();
152 my @incoming = $gnode->incoming();
153
154 unless( scalar @incoming ) {
155 warn "Already have a beginning node" if $begin_node;
156 $begin_node = $gnode;
157 $collation->start( $gnode );
158 }
159 unless( scalar @outgoing ) {
160 warn "Already have an ending node" if $end_node;
161 $end_node = $gnode;
162 $collation->end( $gnode );
163 }
cda6a45b 164 }
d9e873d0 165
e867486f 166 # Set the $witness->path arrays for each wit.
167 populate_witness_path( $tradition );
cda6a45b 168
e867486f 169 # Rank the readings.
d9e873d0 170 $collation->calculate_ranks();
cda6a45b 171}
172
e867486f 173=head1 BUGS / TODO
174
175=over
176
177=item * Make this into a stream parser with GraphML
178
179=item * Use CollateX-calculated ranks instead of recalculating our own
180
cda6a45b 181=back
182
183=head1 LICENSE
184
185This package is free software and is provided "as is" without express
186or implied warranty. You can redistribute it and/or modify it under
187the same terms as Perl itself.
188
189=head1 AUTHOR
190
191Tara L Andrews, aurum@cpan.org
192
193=cut
194
1951;