make Self module parse old and new graphml
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 112 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
94c00c71 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
255875b8 121 $START_KEY, $END_KEY, $LACUNA_KEY,
122 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY,
123 $COLO_KEY, $CORRECT_KEY, $INDEP_KEY )
124 = qw/ name reading identical rank class
125 is_start is_end is_lacuna
126 source target witness extra relationship
127 equal_rank non_correctable non_independent /;
32014ec9 128
129sub parse {
dfc37e38 130 my( $tradition, $opts ) = @_;
e867486f 131 my $graph_data = graphml_parse( $opts );
94c00c71 132
32014ec9 133 my $collation = $tradition->collation;
134 my %witnesses;
e309421a 135
136 # Set up the graph-global attributes. They will appear in the
137 # hash under their accessor names.
255875b8 138 my $use_version;
e309421a 139 print STDERR "Setting graph globals\n";
e3196b2a 140 $tradition->name( $graph_data->{'name'} );
255875b8 141 $DB::single = 1;
142 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
143 my $val = $graph_data->{'global'}->{$gkey};
144 if( $gkey eq 'version' ) {
145 $use_version = $val;
146 } else {
147 $collation->$gkey( $val );
148 }
149 }
150 if( $use_version ) {
151 # Many of our tags changed.
152 $IDKEY = 'id';
153 $TOKENKEY = 'text';
154 $COLO_KEY = 'colocated';
e309421a 155 }
156
32014ec9 157 # Add the nodes to the graph.
158
159 my $extra_data = {}; # Keep track of data that needs to be processed
160 # after the nodes & edges are created.
f6066bac 161 print STDERR "Adding graph nodes\n";
32014ec9 162 foreach my $n ( @{$graph_data->{'nodes'}} ) {
255875b8 163 # If it is the start or end node, we already have one, so skip it.
164 next if defined $n->{$START_KEY} || defined $n->{$END_KEY};
165
94c00c71 166 # First extract the data that we can use without reference to
167 # anything else.
168 my %node_data = %$n; # Need $n itself untouched for edge processing
94c00c71 169
255875b8 170 # Create the node.
171 my $reading_options = {
172 'id' => delete $node_data{$IDKEY},
173 'is_lacuna' => delete $node_data{$LACUNA_KEY},
174 };
175 my $rank = delete $node_data{$RANK_KEY};
176 $reading_options->{'rank'} = $rank if $rank;
177 my $text = delete $node_data{$TOKENKEY};
178 $reading_options->{'text'} = $text if $text;
179
180 # This is a horrible hack for backwards compatibility.
181 unless( $use_version ) {
182 $reading_options->{'is_lacuna'} = 1
183 if $reading_options->{'text'} =~ /^\#LACUNA/;
184 }
185
186 delete $node_data{$CLASS_KEY}; # Not actually used
187 my $gnode = $collation->add_reading( $reading_options );
94c00c71 188
189 # Now save the data that we need for post-processing,
255875b8 190 # if it exists. TODO this is unneeded after conversion
910a0a6d 191 if ( keys %node_data ) {
255875b8 192 $extra_data->{$gnode->id} = \%node_data
910a0a6d 193 }
32014ec9 194 }
910a0a6d 195
32014ec9 196 # Now add the edges.
f6066bac 197 print STDERR "Adding graph edges\n";
32014ec9 198 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 199 my $from = $e->{$SOURCE_KEY};
200 my $to = $e->{$TARGET_KEY};
201 my $class = $e->{$CLASS_KEY};
202
203 # We may have more information depending on the class.
204 if( $class eq 'path' ) {
205 # We need the witness, and whether it is an 'extra' reading path.
206 my $wit = $e->{$WITNESS_KEY};
207 warn "No witness label on path edge!" unless $wit;
208 my $extra = $e->{$EXTRA_KEY};
209 my $label = $wit . ( $extra ? $collation->ac_label : '' );
210 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
211 # Add the witness if we don't have it already.
212 unless( $witnesses{$wit} ) {
213 $tradition->add_witness( sigil => $wit );
214 $witnesses{$wit} = 1;
215 }
1f7aa795 216 $tradition->witness( $wit )->is_layered( 1 ) if $extra;
94c00c71 217 } elsif( $class eq 'relationship' ) {
c9bf3dbf 218 # We need the metadata about the relationship.
219 my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
255875b8 220 $opts->{$COLO_KEY} = $e->{$COLO_KEY}
221 if exists $e->{$COLO_KEY};
222 $opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY}
223 if exists $e->{$CORRECT_KEY};
224 $opts->{$INDEP_KEY} = $e->{$INDEP_KEY}
225 if exists $e->{$INDEP_KEY};
c9bf3dbf 226 warn "No relationship type for relationship edge!" unless $opts->{'type'};
decc2a20 227 my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
228 unless( $ok ) {
229 warn "Did not add relationship: @result";
230 }
94c00c71 231 }
32014ec9 232 }
233
234 ## Deal with node information (transposition, relationships, etc.) that
235 ## needs to be processed after all the nodes are created.
255875b8 236 ## TODO unneeded after conversion
237 unless( $use_version ) {
238 print STDERR "Adding second-pass node data\n";
239 foreach my $nkey ( keys %$extra_data ) {
240 foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
241 my $this_reading = $collation->reading( $nkey );
242 if( $edkey eq $TRANSPOS_KEY ) {
243 my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
244 $this_reading->set_identical( $other_reading );
245 } else {
246 warn "Unfamiliar reading node data $edkey for $nkey";
247 }
248 }
249 }
32014ec9 250 }
32014ec9 251}
252
e867486f 2531;
254
255=head1 BUGS / TODO
256
257=over
258
259=item * Make this into a stream parser with GraphML
260
261=item * Simply field -> attribute correspondence for nodes and edges
262
263=item * Share key name constants with Collation.pm
264
32014ec9 265=back
266
267=head1 LICENSE
268
269This package is free software and is provided "as is" without express
270or implied warranty. You can redistribute it and/or modify it under
271the same terms as Perl itself.
272
273=head1 AUTHOR
274
e867486f 275Tara L Andrews E<lt>aurum@cpan.orgE<gt>