parse our new GraphML format
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Self.pm
CommitLineData
32014ec9 1package Text::Tradition::Parser::Self;
2
3use strict;
4use warnings;
1f7aa795 5use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
32014ec9 6
7=head1 NAME
8
9Text::Tradition::Parser::GraphML
10
e867486f 11=head1 SYNOPSIS
12
13 use Text::Tradition;
14
15 my $t_from_file = Text::Tradition->new(
16 'name' => 'my text',
17 'input' => 'Self',
18 'file' => '/path/to/tradition.xml'
19 );
20
21 my $t_from_string = Text::Tradition->new(
22 'name' => 'my text',
23 'input' => 'Self',
24 'string' => $tradition_xml,
25 );
26
32014ec9 27=head1 DESCRIPTION
28
29Parser module for Text::Tradition to read in its own GraphML output format.
e867486f 30GraphML is a relatively simple graph description language; a 'graph' element
31can have 'node' and 'edge' elements, and each of these can have simple 'data'
32elements for attributes to be saved.
32014ec9 33
e867486f 34The graph itself has attributes as in the Collation object:
35
36=over
37
38=item * linear
39
40=item * ac_label
41
42=item * baselabel
43
44=item * wit_list_separator
45
46=back
47
48The node objects have the following attributes:
32014ec9 49
50=over
51
e867486f 52=item * name
53
54=item * reading
55
56=item * identical
57
58=item * rank
59
60=item * class
61
62=back
63
64The edge objects have the following attributes:
65
66=over
67
68=item * class
69
70=item * witness (for 'path' class edges)
71
72=item * extra (for 'path' class edges)
73
74=item * relationship (for 'relationship' class edges)
75
76=item * equal_rank (for 'relationship' class edges)
32014ec9 77
e867486f 78=item * non_correctable (for 'relationship' class edges)
32014ec9 79
e867486f 80=item * non_independent (for 'relationship' class edges)
81
82=back
83
84=head1 METHODS
85
86=head2 B<parse>
87
88parse( $graph, $opts );
89
90Takes an initialized Text::Tradition object and a set of options; creates
91the appropriate nodes and edges on the graph. The options hash should
92include either a 'file' argument or a 'string' argument, depending on the
93source of the XML to be parsed.
94
95=begin testing
96
97use Text::Tradition;
98binmode STDOUT, ":utf8";
99binmode STDERR, ":utf8";
100eval { no warnings; binmode $DB::OUT, ":utf8"; };
101
102my $tradition = 't/data/florilegium_graphml.xml';
103my $t = Text::Tradition->new(
104 'name' => 'inline',
105 'input' => 'Self',
106 'file' => $tradition,
107 );
108
109is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
110if( $t ) {
111 is( scalar $t->collation->readings, 319, "Collation has all readings" );
255875b8 112 is( scalar $t->collation->paths, 376, "Collation has all paths" );
e867486f 113 is( scalar $t->witnesses, 13, "Collation has all witnesses" );
114}
115
116=end testing
32014ec9 117
118=cut
119
2626f709 120my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY,
255875b8 121 $START_KEY, $END_KEY, $LACUNA_KEY,
122 $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY,
2626f709 123 $SCOPE_KEY, $CORRECT_KEY, $INDEP_KEY )
124 = qw/ id text identical rank
255875b8 125 is_start is_end is_lacuna
126 source target witness extra relationship
2626f709 127 scope non_correctable non_independent /;
32014ec9 128
129sub parse {
dfc37e38 130 my( $tradition, $opts ) = @_;
2626f709 131
132 # Collation data is in the first graph; relationship-specific stuff
133 # is in the second.
134 my( $graph_data, $rel_data ) = graphml_parse( $opts );
94c00c71 135
32014ec9 136 my $collation = $tradition->collation;
137 my %witnesses;
e309421a 138
0068967c 139 # print STDERR "Setting graph globals\n";
e3196b2a 140 $tradition->name( $graph_data->{'name'} );
2626f709 141 my $use_version;
255875b8 142 foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
143 my $val = $graph_data->{'global'}->{$gkey};
144 if( $gkey eq 'version' ) {
145 $use_version = $val;
146 } else {
147 $collation->$gkey( $val );
148 }
149 }
e309421a 150
32014ec9 151 # Add the nodes to the graph.
152
0068967c 153 # print STDERR "Adding graph nodes\n";
2626f709 154 foreach my $n ( @{$graph_data->{'nodes'}} ) {
0174d6a9 155 # If it is the start or end node, we already have one, so
156 # grab the rank and go.
0068967c 157 next if( defined $n->{$START_KEY} );
0174d6a9 158 if( defined $n->{$END_KEY} ) {
159 $collation->end->rank( $n->{$RANK_KEY} );
160 next;
161 }
255875b8 162
94c00c71 163 # First extract the data that we can use without reference to
164 # anything else.
94c00c71 165
255875b8 166 # Create the node.
167 my $reading_options = {
2626f709 168 'id' => $n->{$IDKEY},
169 'is_lacuna' => $n->{$LACUNA_KEY},
255875b8 170 };
2626f709 171 my $rank = $n->{$RANK_KEY};
255875b8 172 $reading_options->{'rank'} = $rank if $rank;
2626f709 173 my $text = $n->{$TOKENKEY};
255875b8 174 $reading_options->{'text'} = $text if $text;
175
255875b8 176 my $gnode = $collation->add_reading( $reading_options );
32014ec9 177 }
910a0a6d 178
32014ec9 179 # Now add the edges.
0068967c 180 # print STDERR "Adding graph edges\n";
32014ec9 181 foreach my $e ( @{$graph_data->{'edges'}} ) {
94c00c71 182 my $from = $e->{$SOURCE_KEY};
183 my $to = $e->{$TARGET_KEY};
32014ec9 184
2626f709 185 # We need the witness, and whether it is an 'extra' reading path.
186 my $wit = $e->{$WITNESS_KEY};
187 warn "No witness label on path edge!" unless $wit;
188 my $extra = $e->{$EXTRA_KEY};
189 my $label = $wit . ( $extra ? $collation->ac_label : '' );
190 $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
191 # Add the witness if we don't have it already.
192 unless( $witnesses{$wit} ) {
193 $tradition->add_witness( sigil => $wit );
194 $witnesses{$wit} = 1;
255875b8 195 }
2626f709 196 $tradition->witness( $wit )->is_layered( 1 ) if $extra;
32014ec9 197 }
2626f709 198
199 ## Done with the main graph, now look at the relationships.
200 # Nodes are added via the call to add_reading above. We only need
201 # add the relationships themselves.
202 # TODO check that scoping does trt
203 foreach my $e ( @{$rel_data->{'edges'}} ) {
204 my $from = $e->{$SOURCE_KEY};
205 my $to = $e->{$TARGET_KEY};
206 my $relationship_opts = {
207 'type' => $e->{$RELATIONSHIP_KEY},
208 'scope' => $e->{$SCOPE_KEY},
209 };
210 $relationship_opts->{'non_correctable'} = $e->{$CORRECT_KEY}
211 if exists $e->{$CORRECT_KEY};
212 $relationship_opts->{'non_independent'} = $e->{$INDEP_KEY}
213 if exists $e->{$INDEP_KEY};
214 $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY},
215 $relationship_opts );
216 }
32014ec9 217}
218
e867486f 2191;
220
221=head1 BUGS / TODO
222
223=over
224
225=item * Make this into a stream parser with GraphML
226
227=item * Simply field -> attribute correspondence for nodes and edges
228
229=item * Share key name constants with Collation.pm
230
32014ec9 231=back
232
233=head1 LICENSE
234
235This package is free software and is provided "as is" without express
236or implied warranty. You can redistribute it and/or modify it under
237the same terms as Perl itself.
238
239=head1 AUTHOR
240
e867486f 241Tara L Andrews E<lt>aurum@cpan.orgE<gt>