fix up CTE parser, including an ugly hack I need, with new graph
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 112 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
94c00c71 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
255875b8 121 $START_KEY, $END_KEY, $LACUNA_KEY,
122 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY,
123 $COLO_KEY, $CORRECT_KEY, $INDEP_KEY )
124 = qw/ name reading identical rank class
125 is_start is_end is_lacuna
126 source target witness extra relationship
127 equal_rank non_correctable non_independent /;
32014ec9 128
129sub parse {
dfc37e38 130 my( $tradition, $opts ) = @_;
e867486f 131 my $graph_data = graphml_parse( $opts );
94c00c71 132
32014ec9 133 my $collation = $tradition->collation;
134 my %witnesses;
e309421a 135
136 # Set up the graph-global attributes. They will appear in the
137 # hash under their accessor names.
255875b8 138 my $use_version;
0068967c 139 # print STDERR "Setting graph globals\n";
e3196b2a 140 $tradition->name( $graph_data->{'name'} );
255875b8 141 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
142 my $val = $graph_data->{'global'}->{$gkey};
143 if( $gkey eq 'version' ) {
144 $use_version = $val;
145 } else {
146 $collation->$gkey( $val );
147 }
148 }
149 if( $use_version ) {
150 # Many of our tags changed.
151 $IDKEY = 'id';
152 $TOKENKEY = 'text';
153 $COLO_KEY = 'colocated';
e309421a 154 }
155
32014ec9 156 # Add the nodes to the graph.
157
158 my $extra_data = {}; # Keep track of data that needs to be processed
159 # after the nodes & edges are created.
0068967c 160 # print STDERR "Adding graph nodes\n";
32014ec9 161 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0068967c 162 unless( $use_version ) {
163 # Backwards compat!
164 $n->{$START_KEY} = 1 if $n->{$IDKEY} eq '#START#';
165 $n->{$END_KEY} = 1 if $n->{$IDKEY} eq '#END#';
166 }
167
0174d6a9 168 # If it is the start or end node, we already have one, so
169 # grab the rank and go.
0068967c 170 next if( defined $n->{$START_KEY} );
0174d6a9 171 if( defined $n->{$END_KEY} ) {
172 $collation->end->rank( $n->{$RANK_KEY} );
173 next;
174 }
255875b8 175
94c00c71 176 # First extract the data that we can use without reference to
177 # anything else.
178 my %node_data = %$n; # Need $n itself untouched for edge processing
94c00c71 179
255875b8 180 # Create the node.
181 my $reading_options = {
182 'id' => delete $node_data{$IDKEY},
183 'is_lacuna' => delete $node_data{$LACUNA_KEY},
184 };
185 my $rank = delete $node_data{$RANK_KEY};
186 $reading_options->{'rank'} = $rank if $rank;
187 my $text = delete $node_data{$TOKENKEY};
188 $reading_options->{'text'} = $text if $text;
189
190 # This is a horrible hack for backwards compatibility.
191 unless( $use_version ) {
192 $reading_options->{'is_lacuna'} = 1
193 if $reading_options->{'text'} =~ /^\#LACUNA/;
194 }
195
196 delete $node_data{$CLASS_KEY}; # Not actually used
197 my $gnode = $collation->add_reading( $reading_options );
94c00c71 198
199 # Now save the data that we need for post-processing,
255875b8 200 # if it exists. TODO this is unneeded after conversion
910a0a6d 201 if ( keys %node_data ) {
255875b8 202 $extra_data->{$gnode->id} = \%node_data
910a0a6d 203 }
32014ec9 204 }
910a0a6d 205
32014ec9 206 # Now add the edges.
0068967c 207 # print STDERR "Adding graph edges\n";
32014ec9 208 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 209 my $from = $e->{$SOURCE_KEY};
210 my $to = $e->{$TARGET_KEY};
211 my $class = $e->{$CLASS_KEY};
212
213 # We may have more information depending on the class.
214 if( $class eq 'path' ) {
215 # We need the witness, and whether it is an 'extra' reading path.
216 my $wit = $e->{$WITNESS_KEY};
217 warn "No witness label on path edge!" unless $wit;
218 my $extra = $e->{$EXTRA_KEY};
219 my $label = $wit . ( $extra ? $collation->ac_label : '' );
220 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
221 # Add the witness if we don't have it already.
222 unless( $witnesses{$wit} ) {
223 $tradition->add_witness( sigil => $wit );
224 $witnesses{$wit} = 1;
225 }
1f7aa795 226 $tradition->witness( $wit )->is_layered( 1 ) if $extra;
94c00c71 227 } elsif( $class eq 'relationship' ) {
c9bf3dbf 228 # We need the metadata about the relationship.
229 my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
255875b8 230 $opts->{$COLO_KEY} = $e->{$COLO_KEY}
231 if exists $e->{$COLO_KEY};
232 $opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY}
233 if exists $e->{$CORRECT_KEY};
234 $opts->{$INDEP_KEY} = $e->{$INDEP_KEY}
235 if exists $e->{$INDEP_KEY};
c9bf3dbf 236 warn "No relationship type for relationship edge!" unless $opts->{'type'};
decc2a20 237 my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
238 unless( $ok ) {
0174d6a9 239 my $relinfo = $opts->{'type'} . ' '
240 . join( ' -> ', $from->{$IDKEY}, $to->{$IDKEY} );
241 warn "Did not add relationship $relinfo: @result";
decc2a20 242 }
94c00c71 243 }
32014ec9 244 }
245
246 ## Deal with node information (transposition, relationships, etc.) that
247 ## needs to be processed after all the nodes are created.
255875b8 248 ## TODO unneeded after conversion
249 unless( $use_version ) {
0068967c 250 # print STDERR "Adding second-pass node data\n";
255875b8 251 foreach my $nkey ( keys %$extra_data ) {
252 foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
253 my $this_reading = $collation->reading( $nkey );
254 if( $edkey eq $TRANSPOS_KEY ) {
255 my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
256 $this_reading->set_identical( $other_reading );
257 } else {
258 warn "Unfamiliar reading node data $edkey for $nkey";
259 }
260 }
261 }
32014ec9 262 }
32014ec9 263}
264
e867486f 2651;
266
267=head1 BUGS / TODO
268
269=over
270
271=item * Make this into a stream parser with GraphML
272
273=item * Simply field -> attribute correspondence for nodes and edges
274
275=item * Share key name constants with Collation.pm
276
32014ec9 277=back
278
279=head1 LICENSE
280
281This package is free software and is provided "as is" without express
282or implied warranty. You can redistribute it and/or modify it under
283the same terms as Perl itself.
284
285=head1 AUTHOR
286
e867486f 287Tara L Andrews E<lt>aurum@cpan.orgE<gt>