make the stemma a property of the tradition
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 112 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
94c00c71 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
255875b8 121 $START_KEY, $END_KEY, $LACUNA_KEY,
122 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY,
123 $COLO_KEY, $CORRECT_KEY, $INDEP_KEY )
124 = qw/ name reading identical rank class
125 is_start is_end is_lacuna
126 source target witness extra relationship
127 equal_rank non_correctable non_independent /;
32014ec9 128
129sub parse {
dfc37e38 130 my( $tradition, $opts ) = @_;
e867486f 131 my $graph_data = graphml_parse( $opts );
94c00c71 132
32014ec9 133 my $collation = $tradition->collation;
134 my %witnesses;
e309421a 135
136 # Set up the graph-global attributes. They will appear in the
137 # hash under their accessor names.
255875b8 138 my $use_version;
e309421a 139 print STDERR "Setting graph globals\n";
e3196b2a 140 $tradition->name( $graph_data->{'name'} );
255875b8 141 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
142 my $val = $graph_data->{'global'}->{$gkey};
143 if( $gkey eq 'version' ) {
144 $use_version = $val;
145 } else {
146 $collation->$gkey( $val );
147 }
148 }
149 if( $use_version ) {
150 # Many of our tags changed.
151 $IDKEY = 'id';
152 $TOKENKEY = 'text';
153 $COLO_KEY = 'colocated';
e309421a 154 }
155
32014ec9 156 # Add the nodes to the graph.
157
158 my $extra_data = {}; # Keep track of data that needs to be processed
159 # after the nodes & edges are created.
f6066bac 160 print STDERR "Adding graph nodes\n";
32014ec9 161 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 162 # If it is the start or end node, we already have one, so
163 # grab the rank and go.
164 if( defined $n->{$START_KEY} ) {
165 $collation->start->rank( $n->{$RANK_KEY} );
166 next;
167 }
168 if( defined $n->{$END_KEY} ) {
169 $collation->end->rank( $n->{$RANK_KEY} );
170 next;
171 }
255875b8 172
94c00c71 173 # First extract the data that we can use without reference to
174 # anything else.
175 my %node_data = %$n; # Need $n itself untouched for edge processing
94c00c71 176
255875b8 177 # Create the node.
178 my $reading_options = {
179 'id' => delete $node_data{$IDKEY},
180 'is_lacuna' => delete $node_data{$LACUNA_KEY},
181 };
182 my $rank = delete $node_data{$RANK_KEY};
183 $reading_options->{'rank'} = $rank if $rank;
184 my $text = delete $node_data{$TOKENKEY};
185 $reading_options->{'text'} = $text if $text;
186
187 # This is a horrible hack for backwards compatibility.
188 unless( $use_version ) {
189 $reading_options->{'is_lacuna'} = 1
190 if $reading_options->{'text'} =~ /^\#LACUNA/;
191 }
192
193 delete $node_data{$CLASS_KEY}; # Not actually used
194 my $gnode = $collation->add_reading( $reading_options );
94c00c71 195
196 # Now save the data that we need for post-processing,
255875b8 197 # if it exists. TODO this is unneeded after conversion
910a0a6d 198 if ( keys %node_data ) {
255875b8 199 $extra_data->{$gnode->id} = \%node_data
910a0a6d 200 }
32014ec9 201 }
910a0a6d 202
32014ec9 203 # Now add the edges.
f6066bac 204 print STDERR "Adding graph edges\n";
32014ec9 205 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 206 my $from = $e->{$SOURCE_KEY};
207 my $to = $e->{$TARGET_KEY};
208 my $class = $e->{$CLASS_KEY};
209
210 # We may have more information depending on the class.
211 if( $class eq 'path' ) {
212 # We need the witness, and whether it is an 'extra' reading path.
213 my $wit = $e->{$WITNESS_KEY};
214 warn "No witness label on path edge!" unless $wit;
215 my $extra = $e->{$EXTRA_KEY};
216 my $label = $wit . ( $extra ? $collation->ac_label : '' );
217 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
218 # Add the witness if we don't have it already.
219 unless( $witnesses{$wit} ) {
220 $tradition->add_witness( sigil => $wit );
221 $witnesses{$wit} = 1;
222 }
1f7aa795 223 $tradition->witness( $wit )->is_layered( 1 ) if $extra;
94c00c71 224 } elsif( $class eq 'relationship' ) {
c9bf3dbf 225 # We need the metadata about the relationship.
226 my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
255875b8 227 $opts->{$COLO_KEY} = $e->{$COLO_KEY}
228 if exists $e->{$COLO_KEY};
229 $opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY}
230 if exists $e->{$CORRECT_KEY};
231 $opts->{$INDEP_KEY} = $e->{$INDEP_KEY}
232 if exists $e->{$INDEP_KEY};
c9bf3dbf 233 warn "No relationship type for relationship edge!" unless $opts->{'type'};
decc2a20 234 my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
235 unless( $ok ) {
0174d6a9 236 my $relinfo = $opts->{'type'} . ' '
237 . join( ' -> ', $from->{$IDKEY}, $to->{$IDKEY} );
238 warn "Did not add relationship $relinfo: @result";
decc2a20 239 }
94c00c71 240 }
32014ec9 241 }
242
243 ## Deal with node information (transposition, relationships, etc.) that
244 ## needs to be processed after all the nodes are created.
255875b8 245 ## TODO unneeded after conversion
246 unless( $use_version ) {
247 print STDERR "Adding second-pass node data\n";
248 foreach my $nkey ( keys %$extra_data ) {
249 foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
250 my $this_reading = $collation->reading( $nkey );
251 if( $edkey eq $TRANSPOS_KEY ) {
252 my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
253 $this_reading->set_identical( $other_reading );
254 } else {
255 warn "Unfamiliar reading node data $edkey for $nkey";
256 }
257 }
258 }
32014ec9 259 }
32014ec9 260}
261
e867486f 2621;
263
264=head1 BUGS / TODO
265
266=over
267
268=item * Make this into a stream parser with GraphML
269
270=item * Simply field -> attribute correspondence for nodes and edges
271
272=item * Share key name constants with Collation.pm
273
32014ec9 274=back
275
276=head1 LICENSE
277
278This package is free software and is provided "as is" without express
279or implied warranty. You can redistribute it and/or modify it under
280the same terms as Perl itself.
281
282=head1 AUTHOR
283
e867486f 284Tara L Andrews E<lt>aurum@cpan.orgE<gt>