simplify Directory and add exceptions;
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateX.pm
CommitLineData
cda6a45b 1package Text::Tradition::Parser::CollateX;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
cda6a45b 6
7=head1 NAME
8
9Text::Tradition::Parser::CollateX
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'CollateX',
18 'file' => '/path/to/collation.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'CollateX',
24 'string' => $collation_xml,
25 );
26
cda6a45b 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition, given a GraphML file from the
30CollateX program that describes a collation graph. For further
31information on the GraphML format for text collation, see
32http://gregor.middell.net/collatex/
33
34=head1 METHODS
35
e867486f 36=head2 B<parse>
cda6a45b 37
dfc37e38 38parse( $tradition, $init_options );
cda6a45b 39
e867486f 40Takes an initialized Text::Tradition object and a set of options; creates
41the appropriate nodes and edges on the graph. The options hash should
42include either a 'file' argument or a 'string' argument, depending on the
43source of the XML to be parsed.
44
45=begin testing
46
47use Text::Tradition;
48binmode STDOUT, ":utf8";
49binmode STDERR, ":utf8";
50eval { no warnings; binmode $DB::OUT, ":utf8"; };
51
52my $cxfile = 't/data/Collatex-16.xml';
53my $t = Text::Tradition->new(
54 'name' => 'inline',
55 'input' => 'CollateX',
56 'file' => $cxfile,
57 );
58
59is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
60if( $t ) {
61 is( scalar $t->collation->readings, 26, "Collation has all readings" );
a753cc84 62 is( scalar $t->collation->paths, 32, "Collation has all paths" );
e867486f 63 is( scalar $t->witnesses, 3, "Collation has all witnesses" );
64
65 # Check an 'identical' node
66 my $transposed = $t->collation->reading( 'n15' );
a753cc84 67 my @related = $transposed->related_readings;
68 is( scalar @related, 1, "Reading links to transposed version" );
69 is( $related[0]->id, 'n17', "Correct transposition link" );
e867486f 70}
71
72=end testing
cda6a45b 73
74=cut
75
76my $IDKEY = 'number';
77my $CONTENTKEY = 'token';
78my $TRANSKEY = 'identical';
79
80sub parse {
dfc37e38 81 my( $tradition, $opts ) = @_;
2626f709 82 my( $graph_data ) = graphml_parse( $opts );
cda6a45b 83 my $collation = $tradition->collation;
cda6a45b 84
3a2ebbf4 85 # First add the readings to the graph.
cda6a45b 86 my $extra_data = {}; # Keep track of info to be processed after all
87 # nodes have been created
88 foreach my $n ( @{$graph_data->{'nodes'}} ) {
3a2ebbf4 89 unless( defined $n->{$IDKEY} && defined $n->{$CONTENTKEY} ) {
910a0a6d 90 warn "Did not find an ID or token for graph node, can't add it";
91 next;
92 }
3a2ebbf4 93 my %node_data = %$n;
94 my $gnode_args = {
3a2ebbf4 95 'id' => delete $node_data{$IDKEY},
96 'text' => delete $node_data{$CONTENTKEY},
97 };
98 my $gnode = $collation->add_reading( $gnode_args );
99
100 # Whatever is left is extra info to be processed later,
101 # e.g. a transposition link.
910a0a6d 102 if( keys %node_data ) {
3a2ebbf4 103 $extra_data->{$gnode->id} = \%node_data;
910a0a6d 104 }
cda6a45b 105 }
910a0a6d 106
3a2ebbf4 107 # Now add the path edges.
cda6a45b 108 foreach my $e ( @{$graph_data->{'edges'}} ) {
910a0a6d 109 my %edge_data = %$e;
110 my $from = delete $edge_data{'source'};
111 my $to = delete $edge_data{'target'};
112
113 # In CollateX, we have a distinct witness data ID per witness,
114 # so that we can have multiple witnesses per edge. We want to
115 # translate this to one witness per edge in our own
116 # representation.
117 foreach my $ekey ( keys %edge_data ) {
118 my $wit = $edge_data{$ekey};
119 # Create the witness object if it does not yet exist.
3a2ebbf4 120 unless( $tradition->witness( $wit ) ) {
910a0a6d 121 $tradition->add_witness( 'sigil' => $wit );
910a0a6d 122 }
123 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
124 }
cda6a45b 125 }
126
127 # Process the extra node data if it exists.
128 foreach my $nodeid ( keys %$extra_data ) {
910a0a6d 129 my $ed = $extra_data->{$nodeid};
130 if( exists $ed->{$TRANSKEY} ) {
910a0a6d 131 my $tn_reading = $collation->reading( $nodeid );
132 my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
133 if( $collation->linear ) {
3a2ebbf4 134 $collation->add_relationship( $tn_reading, $main_reading,
135 { type => 'transposition' } );
910a0a6d 136 } else {
137 $collation->merge_readings( $main_reading, $tn_reading );
138 }
139 } # else we don't have any other tags to process yet.
cda6a45b 140 }
141
142 # Find the beginning and end nodes of the graph. The beginning node
143 # has no incoming edges; the end node has no outgoing edges.
144 my( $begin_node, $end_node );
3a2ebbf4 145 my @starts = $collation->sequence->source_vertices();
146 my @ends = $collation->sequence->sink_vertices();
147 if( @starts != 1 ) {
148 warn "Found more or less than one start vertex: @starts";
149 } else {
150 $collation->merge_readings( $collation->start, @starts );
151 }
152 if( @ends != 1 ) {
153 warn "Found more or less than one end vertex: @ends";
154 } else {
155 $collation->merge_readings( $collation->end, @ends );
cda6a45b 156 }
d9e873d0 157
e867486f 158 # Rank the readings.
94a077d6 159 $collation->calculate_ranks() if $collation->linear;
861c3e27 160
161 # Save the text for each witness so that we can ensure consistency
162 # later on
163 $tradition->collation->text_from_paths();
cda6a45b 164}
165
e867486f 166=head1 BUGS / TODO
167
168=over
169
170=item * Make this into a stream parser with GraphML
171
172=item * Use CollateX-calculated ranks instead of recalculating our own
173
cda6a45b 174=back
175
176=head1 LICENSE
177
178This package is free software and is provided "as is" without express
179or implied warranty. You can redistribute it and/or modify it under
180the same terms as Perl itself.
181
182=head1 AUTHOR
183
184Tara L Andrews, aurum@cpan.org
185
186=cut
187
1881;