1 package Text::Tradition::Parser::CollateX;
5 use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
10 Text::Tradition::Parser::CollateX
16 my $t_from_file = Text::Tradition->new(
18 'input' => 'CollateX',
19 'file' => '/path/to/collation.xml'
22 my $t_from_string = Text::Tradition->new(
24 'input' => 'CollateX',
25 'string' => $collation_xml,
30 Parser module for Text::Tradition, given a GraphML file from the
31 CollateX program that describes a collation graph. For further
32 information on the GraphML format for text collation, see
33 http://gregor.middell.net/collatex/
39 parse( $tradition, $init_options );
41 Takes an initialized Text::Tradition object and a set of options; creates
42 the appropriate nodes and edges on the graph. The options hash should
43 include either a 'file' argument or a 'string' argument, depending on the
44 source of the XML to be parsed.
49 binmode STDOUT, ":utf8";
50 binmode STDERR, ":utf8";
51 eval { no warnings; binmode $DB::OUT, ":utf8"; };
53 my $cxfile = 't/data/Collatex-16.xml';
54 my $t = Text::Tradition->new(
56 'input' => 'CollateX',
60 is( ref( $t ), 'Text::Tradition', "Parsed a CollateX input" );
62 is( scalar $t->collation->readings, 26, "Collation has all readings" );
63 is( scalar $t->collation->paths, 32, "Collation has all paths" );
64 is( scalar $t->witnesses, 3, "Collation has all witnesses" );
66 # Check an 'identical' node
67 my $transposed = $t->collation->reading( 'n15' );
68 my @related = $transposed->related_readings;
69 is( scalar @related, 1, "Reading links to transposed version" );
70 is( $related[0]->id, 'n18', "Correct transposition link" );
78 my $CONTENTKEY = 'tokens';
79 my $EDGETYPEKEY = 'type';
80 my $WITKEY = 'witnesses';
83 my( $tradition, $opts ) = @_;
84 my( $graph_data ) = graphml_parse( $opts );
85 my $collation = $tradition->collation;
87 # First add the readings to the graph.
88 ## Assume the start node has no text and id 0, and the end node has
89 ## no text and ID [number of nodes] - 1.
90 my $endnode = scalar @{$graph_data->{'nodes'}} - 1;
91 foreach my $n ( @{$graph_data->{'nodes'}} ) {
92 unless( defined $n->{$IDKEY} && defined $n->{$CONTENTKEY} ) {
93 if( defined $n->{$IDKEY} && $n->{$IDKEY} == 0 ) {
94 # It's the start node.
95 $n->{$IDKEY} = $collation->start->id;
96 } elsif ( defined $n->{$IDKEY} && $n->{$IDKEY} == $endnode ) {
98 $n->{$IDKEY} = $collation->end->id;
100 # Something is probably wrong.
101 warn "Did not find an ID or token for graph node, can't add it";
105 # Node ID should be an XML name, so prepend an 'n' if necessary.
106 if( $n->{$IDKEY} =~ /^\d/ ) {
107 $n->{$IDKEY} = 'n' . $n->{$IDKEY};
109 # Create the reading.
111 'id' => $n->{$IDKEY},
112 'text' => $n->{$CONTENTKEY},
114 my $gnode = $collation->add_reading( $gnode_args );
117 # Now add the path edges.
119 foreach my $e ( @{$graph_data->{'edges'}} ) {
120 my $from = $e->{'source'};
121 my $to = $e->{'target'};
123 ## Edge data keys are ID (which we don't need), witnesses, and type.
124 ## Type can be 'path' or 'relationship';
125 ## witnesses is a comma-separated list.
126 if( $e->{$EDGETYPEKEY} eq 'path' ) {
127 ## Add the path for each witness listesd.
128 # Create the witness objects if they does not yet exist.
129 foreach my $wit ( split( /, /, $e->{$WITKEY} ) ) {
130 if( $tradition->witness( $wit ) ) {
131 $tradition->witness( $wit )->is_collated( 1 );
133 $tradition->add_witness(
134 'sigil' => $wit, 'sourcetype' => 'collation' );
136 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
138 } else { # CollateX-marked transpositions
139 # Save the transposition links so that we can apply them
140 # once they are all collected.
141 $transpositions{ $from->{$IDKEY} } = $to->{$IDKEY};
145 # TODO Split readings by word unless we're asked not to
147 # Mark initialization as done so that relationship validation turns on
148 $tradition->_init_done( 1 );
149 # Now apply transpositions as appropriate.
150 if( $collation->linear ) {
151 # Sort the transpositions by reading length, then try first to merge them
152 # and then to transpose them. Warn if the text isn't identical.
153 foreach my $k ( sort {
154 my $t1 = $collation->reading( $a )->text;
155 my $t2 = $collation->reading( $b )->text;
156 return length( $t2 ) <=> length( $t1 );
157 } keys %transpositions ) {
158 my $v = $transpositions{$k};
161 $collation->add_relationship( $k, $v, { type => 'collated' } );
163 } catch ( Text::Tradition::Error $e ) {
167 my $transpopts = { type => 'transposition' };
168 unless( $collation->reading( $k )->text eq $collation->reading( $v )->text ) {
169 $transpopts->{annotation} = 'CollateX fuzzy match';
172 $collation->add_relationship( $k, $v, $transpopts );
173 } catch ( Text::Tradition::Error $e ) {
174 warn "Could neither merge nor transpose $k and $v; DROPPING transposition";
179 # Rank the readings and find the commonalities
180 unless( $opts->{'nocalc'} ) {
181 $collation->calculate_ranks();
182 $collation->flatten_ranks();
183 $collation->calculate_common_readings();
187 foreach my $k ( keys %transpositions ) {
188 my $v = $transpositions{$k};
189 $k = $merged{$k} if exists $merged{$k};
190 $v = $merged{$v} if exists $merged{$v};
192 if( $collation->reading( $k )->text eq $collation->reading( $v )->text ) {
193 $collation->merge_readings( $k, $v );
196 warn "DROPPING transposition link for non-identical readings $k and $v";
201 # Save the text for each witness so that we can ensure consistency
203 $tradition->collation->text_from_paths();
211 =item * Make this into a stream parser with GraphML
213 =item * Use CollateX-calculated ranks instead of recalculating our own
219 This package is free software and is provided "as is" without express
220 or implied warranty. You can redistribute it and/or modify it under
221 the same terms as Perl itself.
225 Tara L Andrews, aurum@cpan.org