stop saving duplicate path arrays in witnesses; get rid of relationship
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
b74d89f9 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
112 is( scalar $t->collation->paths, 2854, "Collation has all paths" );
113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
94c00c71 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
121 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY )
122 = qw/ name reading identical rank class
123 source target witness extra relationship/;
32014ec9 124
125sub parse {
dfc37e38 126 my( $tradition, $opts ) = @_;
e867486f 127 my $graph_data = graphml_parse( $opts );
94c00c71 128
32014ec9 129 my $collation = $tradition->collation;
130 my %witnesses;
e309421a 131
132 # Set up the graph-global attributes. They will appear in the
133 # hash under their accessor names.
e309421a 134 print STDERR "Setting graph globals\n";
e3196b2a 135 $tradition->name( $graph_data->{'name'} );
e309421a 136 foreach my $gkey ( keys %{$graph_data->{'attr'}} ) {
137 my $val = $graph_data->{'attr'}->{$gkey};
138 $collation->$gkey( $val );
139 }
140
32014ec9 141 # Add the nodes to the graph.
142
143 my $extra_data = {}; # Keep track of data that needs to be processed
144 # after the nodes & edges are created.
f6066bac 145 print STDERR "Adding graph nodes\n";
32014ec9 146 foreach my $n ( @{$graph_data->{'nodes'}} ) {
94c00c71 147 # First extract the data that we can use without reference to
148 # anything else.
149 my %node_data = %$n; # Need $n itself untouched for edge processing
910a0a6d 150 my $nodeid = delete $node_data{$IDKEY};
151 my $reading = delete $node_data{$TOKENKEY};
94c00c71 152 my $class = delete $node_data{$CLASS_KEY} || '';
153 my $rank = delete $node_data{$RANK_KEY};
154
155 # Create the node. Current valid classes are common and meta.
0106ea2e 156 # Everything else is a normal reading.
94c00c71 157 my $gnode = $collation->add_reading( $nodeid );
158 $gnode->text( $reading );
159 $gnode->make_common if $class eq 'common';
160 $gnode->is_meta( 1 ) if $class eq 'meta';
0106ea2e 161 # This is a horrible hack.
162 $gnode->is_lacuna( $reading =~ /^\#LACUNA/ );
94c00c71 163 $gnode->rank( $rank ) if defined $rank;
164
165 # Now save the data that we need for post-processing,
910a0a6d 166 # if it exists.
167 if ( keys %node_data ) {
94c00c71 168 $extra_data->{$nodeid} = \%node_data
910a0a6d 169 }
32014ec9 170 }
910a0a6d 171
32014ec9 172 # Now add the edges.
f6066bac 173 print STDERR "Adding graph edges\n";
32014ec9 174 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 175 my $from = $e->{$SOURCE_KEY};
176 my $to = $e->{$TARGET_KEY};
177 my $class = $e->{$CLASS_KEY};
178
179 # We may have more information depending on the class.
180 if( $class eq 'path' ) {
181 # We need the witness, and whether it is an 'extra' reading path.
182 my $wit = $e->{$WITNESS_KEY};
183 warn "No witness label on path edge!" unless $wit;
184 my $extra = $e->{$EXTRA_KEY};
185 my $label = $wit . ( $extra ? $collation->ac_label : '' );
186 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
187 # Add the witness if we don't have it already.
188 unless( $witnesses{$wit} ) {
189 $tradition->add_witness( sigil => $wit );
190 $witnesses{$wit} = 1;
191 }
b74d89f9 192 $tradition->witness( $wit )->is_layered( 1 ) if $extra;
94c00c71 193 } elsif( $class eq 'relationship' ) {
c9bf3dbf 194 # We need the metadata about the relationship.
195 my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
196 $opts->{'equal_rank'} = $e->{'equal_rank'}
197 if exists $e->{'equal_rank'};
198 $opts->{'non_correctable'} = $e->{'non_correctable'}
199 if exists $e->{'non_correctable'};
200 $opts->{'non_independent'} = $e->{'non_independent'}
201 if exists $e->{'non_independent'};
202 warn "No relationship type for relationship edge!" unless $opts->{'type'};
decc2a20 203 my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
204 unless( $ok ) {
205 warn "Did not add relationship: @result";
206 }
94c00c71 207 }
32014ec9 208 }
209
210 ## Deal with node information (transposition, relationships, etc.) that
211 ## needs to be processed after all the nodes are created.
94c00c71 212 print STDERR "Adding second-pass node data\n";
32014ec9 213 foreach my $nkey ( keys %$extra_data ) {
910a0a6d 214 foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
215 my $this_reading = $collation->reading( $nkey );
216 if( $edkey eq $TRANSPOS_KEY ) {
217 my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
910a0a6d 218 $this_reading->set_identical( $other_reading );
910a0a6d 219 } else {
220 warn "Unfamiliar reading node data $edkey for $nkey";
221 }
222 }
32014ec9 223 }
32014ec9 224}
225
e867486f 2261;
227
228=head1 BUGS / TODO
229
230=over
231
232=item * Make this into a stream parser with GraphML
233
234=item * Simply field -> attribute correspondence for nodes and edges
235
236=item * Share key name constants with Collation.pm
237
32014ec9 238=back
239
240=head1 LICENSE
241
242This package is free software and is provided "as is" without express
243or implied warranty. You can redistribute it and/or modify it under
244the same terms as Perl itself.
245
246=head1 AUTHOR
247
e867486f 248Tara L Andrews E<lt>aurum@cpan.orgE<gt>