make witness plaintext parsing work
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateX.pm
CommitLineData
cda6a45b 1package Text::Tradition::Parser::CollateX;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
cda6a45b 6
7=head1 NAME
8
9Text::Tradition::Parser::CollateX
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'CollateX',
18 'file' => '/path/to/collation.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'CollateX',
24 'string' => $collation_xml,
25 );
26
cda6a45b 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition, given a GraphML file from the
30CollateX program that describes a collation graph. For further
31information on the GraphML format for text collation, see
32http://gregor.middell.net/collatex/
33
34=head1 METHODS
35
e867486f 36=head2 B<parse>
cda6a45b 37
dfc37e38 38parse( $tradition, $init_options );
cda6a45b 39
e867486f 40Takes an initialized Text::Tradition object and a set of options; creates
41the appropriate nodes and edges on the graph. The options hash should
42include either a 'file' argument or a 'string' argument, depending on the
43source of the XML to be parsed.
44
45=begin testing
46
47use Text::Tradition;
48binmode STDOUT, ":utf8";
49binmode STDERR, ":utf8";
50eval { no warnings; binmode $DB::OUT, ":utf8"; };
51
52my $cxfile = 't/data/Collatex-16.xml';
53my $t = Text::Tradition->new(
54 'name' => 'inline',
55 'input' => 'CollateX',
56 'file' => $cxfile,
57 );
58
679f17e1 59is( ref( $t ), 'Text::Tradition', "Parsed a CollateX input" );
e867486f 60if( $t ) {
61 is( scalar $t->collation->readings, 26, "Collation has all readings" );
a753cc84 62 is( scalar $t->collation->paths, 32, "Collation has all paths" );
e867486f 63 is( scalar $t->witnesses, 3, "Collation has all witnesses" );
64
65 # Check an 'identical' node
66 my $transposed = $t->collation->reading( 'n15' );
a753cc84 67 my @related = $transposed->related_readings;
68 is( scalar @related, 1, "Reading links to transposed version" );
679f17e1 69 is( $related[0]->id, 'n18', "Correct transposition link" );
e867486f 70}
71
72=end testing
cda6a45b 73
74=cut
75
76my $IDKEY = 'number';
679f17e1 77my $CONTENTKEY = 'tokens';
78my $EDGETYPEKEY = 'type';
79my $WITKEY = 'witnesses';
cda6a45b 80
81sub parse {
dfc37e38 82 my( $tradition, $opts ) = @_;
2626f709 83 my( $graph_data ) = graphml_parse( $opts );
cda6a45b 84 my $collation = $tradition->collation;
cda6a45b 85
3a2ebbf4 86 # First add the readings to the graph.
679f17e1 87 ## Assume the start node has no text and id 0, and the end node has
88 ## no text and ID [number of nodes] - 1.
89 my $endnode = scalar @{$graph_data->{'nodes'}} - 1;
cda6a45b 90 foreach my $n ( @{$graph_data->{'nodes'}} ) {
3a2ebbf4 91 unless( defined $n->{$IDKEY} && defined $n->{$CONTENTKEY} ) {
679f17e1 92 if( defined $n->{$IDKEY} && $n->{$IDKEY} == 0 ) {
93 # It's the start node.
94 $n->{$IDKEY} = $collation->start->id;
95 } elsif ( defined $n->{$IDKEY} && $n->{$IDKEY} == $endnode ) {
96 # It's the end node.
97 $n->{$IDKEY} = $collation->end->id;
98 } else {
99 # Something is probably wrong.
100 warn "Did not find an ID or token for graph node, can't add it";
101 }
910a0a6d 102 next;
103 }
679f17e1 104 # Node ID should be an XML name, so prepend an 'n' if necessary.
105 if( $n->{$IDKEY} =~ /^\d/ ) {
106 $n->{$IDKEY} = 'n' . $n->{$IDKEY};
107 }
108 # Create the reading.
3a2ebbf4 109 my $gnode_args = {
679f17e1 110 'id' => $n->{$IDKEY},
111 'text' => $n->{$CONTENTKEY},
3a2ebbf4 112 };
113 my $gnode = $collation->add_reading( $gnode_args );
cda6a45b 114 }
910a0a6d 115
3a2ebbf4 116 # Now add the path edges.
cda6a45b 117 foreach my $e ( @{$graph_data->{'edges'}} ) {
679f17e1 118 my $from = $e->{'source'};
119 my $to = $e->{'target'};
120
121 ## Edge data keys are ID (which we don't need), witnesses, and type.
122 ## Type can be 'path' or 'relationship';
123 ## witnesses is a comma-separated list.
124 if( $e->{$EDGETYPEKEY} eq 'path' ) {
125 ## Add the path for each witness listesd.
126 # Create the witness objects if they does not yet exist.
127 foreach my $wit ( split( /, /, $e->{$WITKEY} ) ) {
128 unless( $tradition->witness( $wit ) ) {
fae52efd 129 $tradition->add_witness(
130 'sigil' => $wit, 'sourcetype' => 'collation' );
679f17e1 131 }
132 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
133 }
134 } else { # type 'relationship'
135 $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY},
136 { 'type' => 'transposition' } );
910a0a6d 137 }
cda6a45b 138 }
139
e867486f 140 # Rank the readings.
15db7774 141 $collation->calculate_common_readings(); # will implicitly rank
861c3e27 142
143 # Save the text for each witness so that we can ensure consistency
144 # later on
145 $tradition->collation->text_from_paths();
cda6a45b 146}
147
e867486f 148=head1 BUGS / TODO
149
150=over
151
152=item * Make this into a stream parser with GraphML
153
154=item * Use CollateX-calculated ranks instead of recalculating our own
155
cda6a45b 156=back
157
158=head1 LICENSE
159
160This package is free software and is provided "as is" without express
161or implied warranty. You can redistribute it and/or modify it under
162the same terms as Perl itself.
163
164=head1 AUTHOR
165
166Tara L Andrews, aurum@cpan.org
167
168=cut
169
1701;