Commit | Line | Data |
cda6a45b |
1 | package Text::Tradition::Parser::CollateX; |
2 | |
3 | use strict; |
4 | use warnings; |
1f7aa795 |
5 | use Text::Tradition::Parser::GraphML qw/ graphml_parse /; |
ca9208b6 |
6 | use TryCatch; |
cda6a45b |
7 | |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::CollateX |
11 | |
e867486f |
12 | =head1 SYNOPSIS |
13 | |
14 | use Text::Tradition; |
15 | |
16 | my $t_from_file = Text::Tradition->new( |
17 | 'name' => 'my text', |
18 | 'input' => 'CollateX', |
19 | 'file' => '/path/to/collation.xml' |
20 | ); |
21 | |
22 | my $t_from_string = Text::Tradition->new( |
23 | 'name' => 'my text', |
24 | 'input' => 'CollateX', |
25 | 'string' => $collation_xml, |
26 | ); |
27 | |
cda6a45b |
28 | =head1 DESCRIPTION |
29 | |
30 | Parser module for Text::Tradition, given a GraphML file from the |
31 | CollateX program that describes a collation graph. For further |
32 | information on the GraphML format for text collation, see |
33 | http://gregor.middell.net/collatex/ |
34 | |
35 | =head1 METHODS |
36 | |
e867486f |
37 | =head2 B<parse> |
cda6a45b |
38 | |
dfc37e38 |
39 | parse( $tradition, $init_options ); |
cda6a45b |
40 | |
e867486f |
41 | Takes an initialized Text::Tradition object and a set of options; creates |
42 | the appropriate nodes and edges on the graph. The options hash should |
43 | include either a 'file' argument or a 'string' argument, depending on the |
44 | source of the XML to be parsed. |
45 | |
46 | =begin testing |
47 | |
48 | use Text::Tradition; |
49 | binmode STDOUT, ":utf8"; |
50 | binmode STDERR, ":utf8"; |
51 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
52 | |
53 | my $cxfile = 't/data/Collatex-16.xml'; |
54 | my $t = Text::Tradition->new( |
55 | 'name' => 'inline', |
56 | 'input' => 'CollateX', |
57 | 'file' => $cxfile, |
58 | ); |
59 | |
679f17e1 |
60 | is( ref( $t ), 'Text::Tradition', "Parsed a CollateX input" ); |
e867486f |
61 | if( $t ) { |
62 | is( scalar $t->collation->readings, 26, "Collation has all readings" ); |
a753cc84 |
63 | is( scalar $t->collation->paths, 32, "Collation has all paths" ); |
e867486f |
64 | is( scalar $t->witnesses, 3, "Collation has all witnesses" ); |
65 | |
66 | # Check an 'identical' node |
67 | my $transposed = $t->collation->reading( 'n15' ); |
a753cc84 |
68 | my @related = $transposed->related_readings; |
69 | is( scalar @related, 1, "Reading links to transposed version" ); |
679f17e1 |
70 | is( $related[0]->id, 'n18', "Correct transposition link" ); |
e867486f |
71 | } |
72 | |
73 | =end testing |
cda6a45b |
74 | |
75 | =cut |
76 | |
77 | my $IDKEY = 'number'; |
679f17e1 |
78 | my $CONTENTKEY = 'tokens'; |
79 | my $EDGETYPEKEY = 'type'; |
80 | my $WITKEY = 'witnesses'; |
cda6a45b |
81 | |
82 | sub parse { |
dfc37e38 |
83 | my( $tradition, $opts ) = @_; |
2626f709 |
84 | my( $graph_data ) = graphml_parse( $opts ); |
cda6a45b |
85 | my $collation = $tradition->collation; |
cda6a45b |
86 | |
3a2ebbf4 |
87 | # First add the readings to the graph. |
679f17e1 |
88 | ## Assume the start node has no text and id 0, and the end node has |
89 | ## no text and ID [number of nodes] - 1. |
90 | my $endnode = scalar @{$graph_data->{'nodes'}} - 1; |
cda6a45b |
91 | foreach my $n ( @{$graph_data->{'nodes'}} ) { |
3a2ebbf4 |
92 | unless( defined $n->{$IDKEY} && defined $n->{$CONTENTKEY} ) { |
679f17e1 |
93 | if( defined $n->{$IDKEY} && $n->{$IDKEY} == 0 ) { |
94 | # It's the start node. |
95 | $n->{$IDKEY} = $collation->start->id; |
96 | } elsif ( defined $n->{$IDKEY} && $n->{$IDKEY} == $endnode ) { |
97 | # It's the end node. |
98 | $n->{$IDKEY} = $collation->end->id; |
99 | } else { |
100 | # Something is probably wrong. |
101 | warn "Did not find an ID or token for graph node, can't add it"; |
102 | } |
910a0a6d |
103 | next; |
104 | } |
679f17e1 |
105 | # Node ID should be an XML name, so prepend an 'n' if necessary. |
106 | if( $n->{$IDKEY} =~ /^\d/ ) { |
107 | $n->{$IDKEY} = 'n' . $n->{$IDKEY}; |
108 | } |
109 | # Create the reading. |
3a2ebbf4 |
110 | my $gnode_args = { |
679f17e1 |
111 | 'id' => $n->{$IDKEY}, |
112 | 'text' => $n->{$CONTENTKEY}, |
3a2ebbf4 |
113 | }; |
114 | my $gnode = $collation->add_reading( $gnode_args ); |
cda6a45b |
115 | } |
910a0a6d |
116 | |
3a2ebbf4 |
117 | # Now add the path edges. |
ca9208b6 |
118 | my %transpositions; |
cda6a45b |
119 | foreach my $e ( @{$graph_data->{'edges'}} ) { |
679f17e1 |
120 | my $from = $e->{'source'}; |
121 | my $to = $e->{'target'}; |
122 | |
123 | ## Edge data keys are ID (which we don't need), witnesses, and type. |
124 | ## Type can be 'path' or 'relationship'; |
125 | ## witnesses is a comma-separated list. |
126 | if( $e->{$EDGETYPEKEY} eq 'path' ) { |
127 | ## Add the path for each witness listesd. |
128 | # Create the witness objects if they does not yet exist. |
129 | foreach my $wit ( split( /, /, $e->{$WITKEY} ) ) { |
4889be4f |
130 | if( $tradition->witness( $wit ) ) { |
131 | $tradition->witness( $wit )->is_collated( 1 ); |
132 | } else { |
fae52efd |
133 | $tradition->add_witness( |
134 | 'sigil' => $wit, 'sourcetype' => 'collation' ); |
679f17e1 |
135 | } |
136 | $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); |
137 | } |
ca9208b6 |
138 | } else { # CollateX-marked transpositions |
139 | # Save the transposition links so that we can apply them |
140 | # once they are all collected. |
141 | $transpositions{ $from->{$IDKEY} } = $to->{$IDKEY}; |
910a0a6d |
142 | } |
cda6a45b |
143 | } |
ca9208b6 |
144 | |
145 | # TODO Split readings by word unless we're asked not to |
146 | |
147 | # Mark initialization as done so that relationship validation turns on |
148 | $tradition->_init_done( 1 ); |
149 | # Now apply transpositions as appropriate. |
150 | if( $collation->linear ) { |
151 | # Sort the transpositions by reading length, then try first to merge them |
152 | # and then to transpose them. Warn if the text isn't identical. |
153 | foreach my $k ( sort { |
154 | my $t1 = $collation->reading( $a )->text; |
155 | my $t2 = $collation->reading( $b )->text; |
156 | return length( $t2 ) <=> length( $t1 ); |
157 | } keys %transpositions ) { |
158 | my $v = $transpositions{$k}; |
159 | my $merged; |
160 | try { |
161 | $collation->add_relationship( $k, $v, { type => 'collated' } ); |
162 | $merged = 1; |
163 | } catch ( Text::Tradition::Error $e ) { |
164 | 1; |
165 | } |
166 | unless( $merged ) { |
167 | my $transpopts = { type => 'transposition' }; |
168 | unless( $collation->reading( $k )->text eq $collation->reading( $v )->text ) { |
169 | $transpopts->{annotation} = 'CollateX fuzzy match'; |
170 | } |
171 | try { |
172 | $collation->add_relationship( $k, $v, $transpopts ); |
173 | } catch ( Text::Tradition::Error $e ) { |
174 | warn "Could neither merge nor transpose $k and $v; DROPPING transposition"; |
175 | } |
176 | } |
177 | } |
178 | |
179 | # Rank the readings and find the commonalities |
82a45078 |
180 | unless( $opts->{'nocalc'} ) { |
181 | $collation->calculate_ranks(); |
182 | $collation->flatten_ranks(); |
183 | $collation->calculate_common_readings(); |
184 | } |
ca9208b6 |
185 | } else { |
186 | my %merged; |
187 | foreach my $k ( keys %transpositions ) { |
188 | my $v = $transpositions{$k}; |
189 | $k = $merged{$k} if exists $merged{$k}; |
190 | $v = $merged{$v} if exists $merged{$v}; |
191 | next if $k eq $v; |
192 | if( $collation->reading( $k )->text eq $collation->reading( $v )->text ) { |
193 | $collation->merge_readings( $k, $v ); |
194 | $merged{$v} = $k; |
195 | } else { |
196 | warn "DROPPING transposition link for non-identical readings $k and $v"; |
197 | } |
198 | } |
199 | } |
861c3e27 |
200 | |
201 | # Save the text for each witness so that we can ensure consistency |
202 | # later on |
203 | $tradition->collation->text_from_paths(); |
cda6a45b |
204 | } |
ca9208b6 |
205 | |
cda6a45b |
206 | |
e867486f |
207 | =head1 BUGS / TODO |
208 | |
209 | =over |
210 | |
211 | =item * Make this into a stream parser with GraphML |
212 | |
213 | =item * Use CollateX-calculated ranks instead of recalculating our own |
214 | |
cda6a45b |
215 | =back |
216 | |
217 | =head1 LICENSE |
218 | |
219 | This package is free software and is provided "as is" without express |
220 | or implied warranty. You can redistribute it and/or modify it under |
221 | the same terms as Perl itself. |
222 | |
223 | =head1 AUTHOR |
224 | |
225 | Tara L Andrews, aurum@cpan.org |
226 | |
227 | =cut |
228 | |
229 | 1; |