XML parsers should accept already-parsed XML object too
[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / CollateX.pm
CommitLineData
cda6a45b 1package Text::Tradition::Parser::CollateX;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
ca9208b6 6use TryCatch;
cda6a45b 7
8=head1 NAME
9
10Text::Tradition::Parser::CollateX
11
e867486f 12=head1 SYNOPSIS
13
14 use Text::Tradition;
15
16 my $t_from_file = Text::Tradition->new(
17 'name' => 'my text',
18 'input' => 'CollateX',
19 'file' => '/path/to/collation.xml'
20 );
21
22 my $t_from_string = Text::Tradition->new(
23 'name' => 'my text',
24 'input' => 'CollateX',
25 'string' => $collation_xml,
26 );
27
cda6a45b 28=head1 DESCRIPTION
29
30Parser module for Text::Tradition, given a GraphML file from the
31CollateX program that describes a collation graph. For further
32information on the GraphML format for text collation, see
33http://gregor.middell.net/collatex/
34
35=head1 METHODS
36
e867486f 37=head2 B<parse>
cda6a45b 38
dfc37e38 39parse( $tradition, $init_options );
cda6a45b 40
e867486f 41Takes an initialized Text::Tradition object and a set of options; creates
42the appropriate nodes and edges on the graph. The options hash should
43include either a 'file' argument or a 'string' argument, depending on the
44source of the XML to be parsed.
45
46=begin testing
47
48use Text::Tradition;
49binmode STDOUT, ":utf8";
50binmode STDERR, ":utf8";
51eval { no warnings; binmode $DB::OUT, ":utf8"; };
52
53my $cxfile = 't/data/Collatex-16.xml';
54my $t = Text::Tradition->new(
55 'name' => 'inline',
56 'input' => 'CollateX',
57 'file' => $cxfile,
58 );
59
679f17e1 60is( ref( $t ), 'Text::Tradition', "Parsed a CollateX input" );
e867486f 61if( $t ) {
62 is( scalar $t->collation->readings, 26, "Collation has all readings" );
a753cc84 63 is( scalar $t->collation->paths, 32, "Collation has all paths" );
e867486f 64 is( scalar $t->witnesses, 3, "Collation has all witnesses" );
65
66 # Check an 'identical' node
67 my $transposed = $t->collation->reading( 'n15' );
a753cc84 68 my @related = $transposed->related_readings;
69 is( scalar @related, 1, "Reading links to transposed version" );
679f17e1 70 is( $related[0]->id, 'n18', "Correct transposition link" );
e867486f 71}
72
73=end testing
cda6a45b 74
75=cut
76
77my $IDKEY = 'number';
679f17e1 78my $CONTENTKEY = 'tokens';
79my $EDGETYPEKEY = 'type';
80my $WITKEY = 'witnesses';
cda6a45b 81
82sub parse {
dfc37e38 83 my( $tradition, $opts ) = @_;
2626f709 84 my( $graph_data ) = graphml_parse( $opts );
cda6a45b 85 my $collation = $tradition->collation;
cda6a45b 86
3a2ebbf4 87 # First add the readings to the graph.
679f17e1 88 ## Assume the start node has no text and id 0, and the end node has
89 ## no text and ID [number of nodes] - 1.
90 my $endnode = scalar @{$graph_data->{'nodes'}} - 1;
cda6a45b 91 foreach my $n ( @{$graph_data->{'nodes'}} ) {
3a2ebbf4 92 unless( defined $n->{$IDKEY} && defined $n->{$CONTENTKEY} ) {
679f17e1 93 if( defined $n->{$IDKEY} && $n->{$IDKEY} == 0 ) {
94 # It's the start node.
95 $n->{$IDKEY} = $collation->start->id;
96 } elsif ( defined $n->{$IDKEY} && $n->{$IDKEY} == $endnode ) {
97 # It's the end node.
98 $n->{$IDKEY} = $collation->end->id;
99 } else {
100 # Something is probably wrong.
101 warn "Did not find an ID or token for graph node, can't add it";
102 }
910a0a6d 103 next;
104 }
679f17e1 105 # Node ID should be an XML name, so prepend an 'n' if necessary.
106 if( $n->{$IDKEY} =~ /^\d/ ) {
107 $n->{$IDKEY} = 'n' . $n->{$IDKEY};
108 }
109 # Create the reading.
3a2ebbf4 110 my $gnode_args = {
679f17e1 111 'id' => $n->{$IDKEY},
112 'text' => $n->{$CONTENTKEY},
3a2ebbf4 113 };
114 my $gnode = $collation->add_reading( $gnode_args );
cda6a45b 115 }
910a0a6d 116
3a2ebbf4 117 # Now add the path edges.
ca9208b6 118 my %transpositions;
cda6a45b 119 foreach my $e ( @{$graph_data->{'edges'}} ) {
679f17e1 120 my $from = $e->{'source'};
121 my $to = $e->{'target'};
122
123 ## Edge data keys are ID (which we don't need), witnesses, and type.
124 ## Type can be 'path' or 'relationship';
125 ## witnesses is a comma-separated list.
126 if( $e->{$EDGETYPEKEY} eq 'path' ) {
127 ## Add the path for each witness listesd.
128 # Create the witness objects if they does not yet exist.
129 foreach my $wit ( split( /, /, $e->{$WITKEY} ) ) {
4889be4f 130 if( $tradition->witness( $wit ) ) {
131 $tradition->witness( $wit )->is_collated( 1 );
132 } else {
fae52efd 133 $tradition->add_witness(
134 'sigil' => $wit, 'sourcetype' => 'collation' );
679f17e1 135 }
136 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
137 }
ca9208b6 138 } else { # CollateX-marked transpositions
139 # Save the transposition links so that we can apply them
140 # once they are all collected.
141 $transpositions{ $from->{$IDKEY} } = $to->{$IDKEY};
910a0a6d 142 }
cda6a45b 143 }
ca9208b6 144
145 # TODO Split readings by word unless we're asked not to
146
147 # Mark initialization as done so that relationship validation turns on
148 $tradition->_init_done( 1 );
149 # Now apply transpositions as appropriate.
150 if( $collation->linear ) {
151 # Sort the transpositions by reading length, then try first to merge them
152 # and then to transpose them. Warn if the text isn't identical.
153 foreach my $k ( sort {
154 my $t1 = $collation->reading( $a )->text;
155 my $t2 = $collation->reading( $b )->text;
156 return length( $t2 ) <=> length( $t1 );
157 } keys %transpositions ) {
158 my $v = $transpositions{$k};
159 my $merged;
160 try {
161 $collation->add_relationship( $k, $v, { type => 'collated' } );
162 $merged = 1;
163 } catch ( Text::Tradition::Error $e ) {
164 1;
165 }
166 unless( $merged ) {
167 my $transpopts = { type => 'transposition' };
168 unless( $collation->reading( $k )->text eq $collation->reading( $v )->text ) {
169 $transpopts->{annotation} = 'CollateX fuzzy match';
170 }
171 try {
172 $collation->add_relationship( $k, $v, $transpopts );
173 } catch ( Text::Tradition::Error $e ) {
174 warn "Could neither merge nor transpose $k and $v; DROPPING transposition";
175 }
176 }
177 }
178
179 # Rank the readings and find the commonalities
82a45078 180 unless( $opts->{'nocalc'} ) {
181 $collation->calculate_ranks();
182 $collation->flatten_ranks();
183 $collation->calculate_common_readings();
184 }
ca9208b6 185 } else {
186 my %merged;
187 foreach my $k ( keys %transpositions ) {
188 my $v = $transpositions{$k};
189 $k = $merged{$k} if exists $merged{$k};
190 $v = $merged{$v} if exists $merged{$v};
191 next if $k eq $v;
192 if( $collation->reading( $k )->text eq $collation->reading( $v )->text ) {
193 $collation->merge_readings( $k, $v );
194 $merged{$v} = $k;
195 } else {
196 warn "DROPPING transposition link for non-identical readings $k and $v";
197 }
198 }
199 }
861c3e27 200
201 # Save the text for each witness so that we can ensure consistency
202 # later on
203 $tradition->collation->text_from_paths();
cda6a45b 204}
ca9208b6 205
cda6a45b 206
e867486f 207=head1 BUGS / TODO
208
209=over
210
211=item * Make this into a stream parser with GraphML
212
213=item * Use CollateX-calculated ranks instead of recalculating our own
214
cda6a45b 215=back
216
217=head1 LICENSE
218
219This package is free software and is provided "as is" without express
220or implied warranty. You can redistribute it and/or modify it under
221the same terms as Perl itself.
222
223=head1 AUTHOR
224
225Tara L Andrews, aurum@cpan.org
226
227=cut
228
2291;