save the name of the text too
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
e867486f 5use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
112 is( scalar $t->collation->paths, 2854, "Collation has all paths" );
113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
94c00c71 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY,
121 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY )
122 = qw/ name reading identical rank class
123 source target witness extra relationship/;
32014ec9 124
125sub parse {
dfc37e38 126 my( $tradition, $opts ) = @_;
e867486f 127 my $graph_data = graphml_parse( $opts );
94c00c71 128
32014ec9 129 my $collation = $tradition->collation;
130 my %witnesses;
e309421a 131
132 # Set up the graph-global attributes. They will appear in the
133 # hash under their accessor names.
e309421a 134 print STDERR "Setting graph globals\n";
e3196b2a 135 $tradition->name( $graph_data->{'name'} );
e309421a 136 foreach my $gkey ( keys %{$graph_data->{'attr'}} ) {
137 my $val = $graph_data->{'attr'}->{$gkey};
138 $collation->$gkey( $val );
139 }
140
32014ec9 141 # Add the nodes to the graph.
142
143 my $extra_data = {}; # Keep track of data that needs to be processed
144 # after the nodes & edges are created.
f6066bac 145 print STDERR "Adding graph nodes\n";
32014ec9 146 foreach my $n ( @{$graph_data->{'nodes'}} ) {
94c00c71 147 # First extract the data that we can use without reference to
148 # anything else.
149 my %node_data = %$n; # Need $n itself untouched for edge processing
910a0a6d 150 my $nodeid = delete $node_data{$IDKEY};
151 my $reading = delete $node_data{$TOKENKEY};
94c00c71 152 my $class = delete $node_data{$CLASS_KEY} || '';
153 my $rank = delete $node_data{$RANK_KEY};
154
155 # Create the node. Current valid classes are common and meta.
0106ea2e 156 # Everything else is a normal reading.
94c00c71 157 my $gnode = $collation->add_reading( $nodeid );
158 $gnode->text( $reading );
159 $gnode->make_common if $class eq 'common';
160 $gnode->is_meta( 1 ) if $class eq 'meta';
0106ea2e 161 # This is a horrible hack.
162 $gnode->is_lacuna( $reading =~ /^\#LACUNA/ );
94c00c71 163 $gnode->rank( $rank ) if defined $rank;
164
165 # Now save the data that we need for post-processing,
910a0a6d 166 # if it exists.
167 if ( keys %node_data ) {
94c00c71 168 $extra_data->{$nodeid} = \%node_data
910a0a6d 169 }
32014ec9 170 }
910a0a6d 171
32014ec9 172 # Now add the edges.
f6066bac 173 print STDERR "Adding graph edges\n";
e867486f 174 my $has_ante_corr = {};
32014ec9 175 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 176 my $from = $e->{$SOURCE_KEY};
177 my $to = $e->{$TARGET_KEY};
178 my $class = $e->{$CLASS_KEY};
179
180 # We may have more information depending on the class.
181 if( $class eq 'path' ) {
182 # We need the witness, and whether it is an 'extra' reading path.
183 my $wit = $e->{$WITNESS_KEY};
184 warn "No witness label on path edge!" unless $wit;
185 my $extra = $e->{$EXTRA_KEY};
186 my $label = $wit . ( $extra ? $collation->ac_label : '' );
187 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
188 # Add the witness if we don't have it already.
189 unless( $witnesses{$wit} ) {
190 $tradition->add_witness( sigil => $wit );
191 $witnesses{$wit} = 1;
192 }
e867486f 193 $has_ante_corr->{$wit} = 1 if $extra;
94c00c71 194 } elsif( $class eq 'relationship' ) {
c9bf3dbf 195 # We need the metadata about the relationship.
196 my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} };
197 $opts->{'equal_rank'} = $e->{'equal_rank'}
198 if exists $e->{'equal_rank'};
199 $opts->{'non_correctable'} = $e->{'non_correctable'}
200 if exists $e->{'non_correctable'};
201 $opts->{'non_independent'} = $e->{'non_independent'}
202 if exists $e->{'non_independent'};
203 warn "No relationship type for relationship edge!" unless $opts->{'type'};
204 $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts );
94c00c71 205 }
32014ec9 206 }
207
208 ## Deal with node information (transposition, relationships, etc.) that
209 ## needs to be processed after all the nodes are created.
94c00c71 210 print STDERR "Adding second-pass node data\n";
32014ec9 211 foreach my $nkey ( keys %$extra_data ) {
910a0a6d 212 foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
213 my $this_reading = $collation->reading( $nkey );
214 if( $edkey eq $TRANSPOS_KEY ) {
215 my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
910a0a6d 216 $this_reading->set_identical( $other_reading );
910a0a6d 217 } else {
218 warn "Unfamiliar reading node data $edkey for $nkey";
219 }
220 }
32014ec9 221 }
e309421a 222
223 # Set the $witness->path arrays for each wit.
e867486f 224 populate_witness_path( $tradition, $has_ante_corr );
32014ec9 225}
226
e867486f 2271;
228
229=head1 BUGS / TODO
230
231=over
232
233=item * Make this into a stream parser with GraphML
234
235=item * Simply field -> attribute correspondence for nodes and edges
236
237=item * Share key name constants with Collation.pm
238
32014ec9 239=back
240
241=head1 LICENSE
242
243This package is free software and is provided "as is" without express
244or implied warranty. You can redistribute it and/or modify it under
245the same terms as Perl itself.
246
247=head1 AUTHOR
248
e867486f 249Tara L Andrews E<lt>aurum@cpan.orgE<gt>