various bugfixes, getting real traditions to parse
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 112 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
94c00c71 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
255875b8 121 $START_KEY, $END_KEY, $LACUNA_KEY,
122 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY,
123 $COLO_KEY, $CORRECT_KEY, $INDEP_KEY )
124 = qw/ name reading identical rank class
125 is_start is_end is_lacuna
126 source target witness extra relationship
127 equal_rank non_correctable non_independent /;
32014ec9 128
129sub parse {
dfc37e38 130 my( $tradition, $opts ) = @_;
e867486f 131 my $graph_data = graphml_parse( $opts );
94c00c71 132
32014ec9 133 my $collation = $tradition->collation;
134 my %witnesses;
e309421a 135
136 # Set up the graph-global attributes. They will appear in the
137 # hash under their accessor names.
255875b8 138 my $use_version;
e309421a 139 print STDERR "Setting graph globals\n";
e3196b2a 140 $tradition->name( $graph_data->{'name'} );
255875b8 141 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
142 my $val = $graph_data->{'global'}->{$gkey};
143 if( $gkey eq 'version' ) {
144 $use_version = $val;
145 } else {
146 $collation->$gkey( $val );
147 }
148 }
149 if( $use_version ) {
150 # Many of our tags changed.
151 $IDKEY = 'id';
152 $TOKENKEY = 'text';
153 $COLO_KEY = 'colocated';
e309421a 154 }
155
32014ec9 156 # Add the nodes to the graph.
157
158 my $extra_data = {}; # Keep track of data that needs to be processed
159 # after the nodes & edges are created.
f6066bac 160 print STDERR "Adding graph nodes\n";
32014ec9 161 foreach my $n ( @{$graph_data->{'nodes'}} ) {
255875b8 162 # If it is the start or end node, we already have one, so skip it.
163 next if defined $n->{$START_KEY} || defined $n->{$END_KEY};
164
94c00c71 165 # First extract the data that we can use without reference to
166 # anything else.
167 my %node_data = %$n; # Need $n itself untouched for edge processing
94c00c71 168
255875b8 169 # Create the node.
170 my $reading_options = {
171 'id' => delete $node_data{$IDKEY},
172 'is_lacuna' => delete $node_data{$LACUNA_KEY},
173 };
174 my $rank = delete $node_data{$RANK_KEY};
175 $reading_options->{'rank'} = $rank if $rank;
176 my $text = delete $node_data{$TOKENKEY};
177 $reading_options->{'text'} = $text if $text;
178
179 # This is a horrible hack for backwards compatibility.
180 unless( $use_version ) {
181 $reading_options->{'is_lacuna'} = 1
182 if $reading_options->{'text'} =~ /^\#LACUNA/;
183 }
184
185 delete $node_data{$CLASS_KEY}; # Not actually used
186 my $gnode = $collation->add_reading( $reading_options );
94c00c71 187
188 # Now save the data that we need for post-processing,
255875b8 189 # if it exists. TODO this is unneeded after conversion
910a0a6d 190 if ( keys %node_data ) {
255875b8 191 $extra_data->{$gnode->id} = \%node_data
910a0a6d 192 }
32014ec9 193 }
910a0a6d 194
32014ec9 195 # Now add the edges.
f6066bac 196 print STDERR "Adding graph edges\n";
32014ec9 197 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 198 my $from = $e->{$SOURCE_KEY};
199 my $to = $e->{$TARGET_KEY};
200 my $class = $e->{$CLASS_KEY};
201
202 # We may have more information depending on the class.
203 if( $class eq 'path' ) {
204 # We need the witness, and whether it is an 'extra' reading path.
205 my $wit = $e->{$WITNESS_KEY};
206 warn "No witness label on path edge!" unless $wit;
207 my $extra = $e->{$EXTRA_KEY};
208 my $label = $wit . ( $extra ? $collation->ac_label : '' );
209 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
210 # Add the witness if we don't have it already.
211 unless( $witnesses{$wit} ) {
212 $tradition->add_witness( sigil => $wit );
213 $witnesses{$wit} = 1;
214 }
1f7aa795 215 $tradition->witness( $wit )->is_layered( 1 ) if $extra;
94c00c71 216 } elsif( $class eq 'relationship' ) {
c9bf3dbf 217 # We need the metadata about the relationship.
218 my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
255875b8 219 $opts->{$COLO_KEY} = $e->{$COLO_KEY}
220 if exists $e->{$COLO_KEY};
221 $opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY}
222 if exists $e->{$CORRECT_KEY};
223 $opts->{$INDEP_KEY} = $e->{$INDEP_KEY}
224 if exists $e->{$INDEP_KEY};
c9bf3dbf 225 warn "No relationship type for relationship edge!" unless $opts->{'type'};
decc2a20 226 my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
227 unless( $ok ) {
228 warn "Did not add relationship: @result";
229 }
94c00c71 230 }
32014ec9 231 }
232
233 ## Deal with node information (transposition, relationships, etc.) that
234 ## needs to be processed after all the nodes are created.
255875b8 235 ## TODO unneeded after conversion
236 unless( $use_version ) {
237 print STDERR "Adding second-pass node data\n";
238 foreach my $nkey ( keys %$extra_data ) {
239 foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
240 my $this_reading = $collation->reading( $nkey );
241 if( $edkey eq $TRANSPOS_KEY ) {
242 my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
243 $this_reading->set_identical( $other_reading );
244 } else {
245 warn "Unfamiliar reading node data $edkey for $nkey";
246 }
247 }
248 }
32014ec9 249 }
32014ec9 250}
251
e867486f 2521;
253
254=head1 BUGS / TODO
255
256=over
257
258=item * Make this into a stream parser with GraphML
259
260=item * Simply field -> attribute correspondence for nodes and edges
261
262=item * Share key name constants with Collation.pm
263
32014ec9 264=back
265
266=head1 LICENSE
267
268This package is free software and is provided "as is" without express
269or implied warranty. You can redistribute it and/or modify it under
270the same terms as Perl itself.
271
272=head1 AUTHOR
273
e867486f 274Tara L Andrews E<lt>aurum@cpan.orgE<gt>