Commit | Line | Data |
32014ec9 |
1 | package Text::Tradition::Parser::Self; |
2 | |
3 | use strict; |
4 | use warnings; |
1f7aa795 |
5 | use Text::Tradition::Parser::GraphML qw/ graphml_parse /; |
32014ec9 |
6 | |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::GraphML |
10 | |
e867486f |
11 | =head1 SYNOPSIS |
12 | |
13 | use Text::Tradition; |
14 | |
15 | my $t_from_file = Text::Tradition->new( |
16 | 'name' => 'my text', |
17 | 'input' => 'Self', |
18 | 'file' => '/path/to/tradition.xml' |
19 | ); |
20 | |
21 | my $t_from_string = Text::Tradition->new( |
22 | 'name' => 'my text', |
23 | 'input' => 'Self', |
24 | 'string' => $tradition_xml, |
25 | ); |
26 | |
32014ec9 |
27 | =head1 DESCRIPTION |
28 | |
29 | Parser module for Text::Tradition to read in its own GraphML output format. |
e867486f |
30 | GraphML is a relatively simple graph description language; a 'graph' element |
31 | can have 'node' and 'edge' elements, and each of these can have simple 'data' |
32 | elements for attributes to be saved. |
32014ec9 |
33 | |
e867486f |
34 | The graph itself has attributes as in the Collation object: |
35 | |
36 | =over |
37 | |
38 | =item * linear |
39 | |
40 | =item * ac_label |
41 | |
42 | =item * baselabel |
43 | |
44 | =item * wit_list_separator |
45 | |
46 | =back |
47 | |
48 | The node objects have the following attributes: |
32014ec9 |
49 | |
50 | =over |
51 | |
e867486f |
52 | =item * name |
53 | |
54 | =item * reading |
55 | |
56 | =item * identical |
57 | |
58 | =item * rank |
59 | |
60 | =item * class |
61 | |
62 | =back |
63 | |
64 | The edge objects have the following attributes: |
65 | |
66 | =over |
67 | |
68 | =item * class |
69 | |
70 | =item * witness (for 'path' class edges) |
71 | |
72 | =item * extra (for 'path' class edges) |
73 | |
74 | =item * relationship (for 'relationship' class edges) |
75 | |
76 | =item * equal_rank (for 'relationship' class edges) |
32014ec9 |
77 | |
e867486f |
78 | =item * non_correctable (for 'relationship' class edges) |
32014ec9 |
79 | |
e867486f |
80 | =item * non_independent (for 'relationship' class edges) |
81 | |
82 | =back |
83 | |
84 | =head1 METHODS |
85 | |
86 | =head2 B<parse> |
87 | |
88 | parse( $graph, $opts ); |
89 | |
90 | Takes an initialized Text::Tradition object and a set of options; creates |
91 | the appropriate nodes and edges on the graph. The options hash should |
92 | include either a 'file' argument or a 'string' argument, depending on the |
93 | source of the XML to be parsed. |
94 | |
95 | =begin testing |
96 | |
97 | use Text::Tradition; |
98 | binmode STDOUT, ":utf8"; |
99 | binmode STDERR, ":utf8"; |
100 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
101 | |
102 | my $tradition = 't/data/florilegium_graphml.xml'; |
103 | my $t = Text::Tradition->new( |
104 | 'name' => 'inline', |
105 | 'input' => 'Self', |
106 | 'file' => $tradition, |
107 | ); |
108 | |
109 | is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); |
110 | if( $t ) { |
111 | is( scalar $t->collation->readings, 319, "Collation has all readings" ); |
255875b8 |
112 | is( scalar $t->collation->paths, 376, "Collation has all paths" ); |
e867486f |
113 | is( scalar $t->witnesses, 13, "Collation has all witnesses" ); |
114 | } |
115 | |
116 | =end testing |
32014ec9 |
117 | |
118 | =cut |
119 | |
94c00c71 |
120 | my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $RANK_KEY, $CLASS_KEY, |
255875b8 |
121 | $START_KEY, $END_KEY, $LACUNA_KEY, |
122 | $SOURCE_KEY, $TARGET_KEY, $WITNESS_KEY, $EXTRA_KEY, $RELATIONSHIP_KEY, |
123 | $COLO_KEY, $CORRECT_KEY, $INDEP_KEY ) |
124 | = qw/ name reading identical rank class |
125 | is_start is_end is_lacuna |
126 | source target witness extra relationship |
127 | equal_rank non_correctable non_independent /; |
32014ec9 |
128 | |
129 | sub parse { |
dfc37e38 |
130 | my( $tradition, $opts ) = @_; |
e867486f |
131 | my $graph_data = graphml_parse( $opts ); |
94c00c71 |
132 | |
32014ec9 |
133 | my $collation = $tradition->collation; |
134 | my %witnesses; |
e309421a |
135 | |
136 | # Set up the graph-global attributes. They will appear in the |
137 | # hash under their accessor names. |
255875b8 |
138 | my $use_version; |
0068967c |
139 | # print STDERR "Setting graph globals\n"; |
e3196b2a |
140 | $tradition->name( $graph_data->{'name'} ); |
255875b8 |
141 | foreach my $gkey ( keys %{$graph_data->{'global'}} ) { |
142 | my $val = $graph_data->{'global'}->{$gkey}; |
143 | if( $gkey eq 'version' ) { |
144 | $use_version = $val; |
145 | } else { |
146 | $collation->$gkey( $val ); |
147 | } |
148 | } |
149 | if( $use_version ) { |
150 | # Many of our tags changed. |
151 | $IDKEY = 'id'; |
152 | $TOKENKEY = 'text'; |
153 | $COLO_KEY = 'colocated'; |
e309421a |
154 | } |
155 | |
32014ec9 |
156 | # Add the nodes to the graph. |
157 | |
158 | my $extra_data = {}; # Keep track of data that needs to be processed |
159 | # after the nodes & edges are created. |
0068967c |
160 | # print STDERR "Adding graph nodes\n"; |
32014ec9 |
161 | foreach my $n ( @{$graph_data->{'nodes'}} ) { |
0068967c |
162 | unless( $use_version ) { |
163 | # Backwards compat! |
164 | $n->{$START_KEY} = 1 if $n->{$IDKEY} eq '#START#'; |
165 | $n->{$END_KEY} = 1 if $n->{$IDKEY} eq '#END#'; |
166 | } |
167 | |
0174d6a9 |
168 | # If it is the start or end node, we already have one, so |
169 | # grab the rank and go. |
0068967c |
170 | next if( defined $n->{$START_KEY} ); |
0174d6a9 |
171 | if( defined $n->{$END_KEY} ) { |
172 | $collation->end->rank( $n->{$RANK_KEY} ); |
173 | next; |
174 | } |
255875b8 |
175 | |
94c00c71 |
176 | # First extract the data that we can use without reference to |
177 | # anything else. |
178 | my %node_data = %$n; # Need $n itself untouched for edge processing |
94c00c71 |
179 | |
255875b8 |
180 | # Create the node. |
181 | my $reading_options = { |
182 | 'id' => delete $node_data{$IDKEY}, |
183 | 'is_lacuna' => delete $node_data{$LACUNA_KEY}, |
184 | }; |
185 | my $rank = delete $node_data{$RANK_KEY}; |
186 | $reading_options->{'rank'} = $rank if $rank; |
187 | my $text = delete $node_data{$TOKENKEY}; |
188 | $reading_options->{'text'} = $text if $text; |
189 | |
190 | # This is a horrible hack for backwards compatibility. |
191 | unless( $use_version ) { |
192 | $reading_options->{'is_lacuna'} = 1 |
193 | if $reading_options->{'text'} =~ /^\#LACUNA/; |
194 | } |
195 | |
196 | delete $node_data{$CLASS_KEY}; # Not actually used |
197 | my $gnode = $collation->add_reading( $reading_options ); |
94c00c71 |
198 | |
199 | # Now save the data that we need for post-processing, |
255875b8 |
200 | # if it exists. TODO this is unneeded after conversion |
910a0a6d |
201 | if ( keys %node_data ) { |
255875b8 |
202 | $extra_data->{$gnode->id} = \%node_data |
910a0a6d |
203 | } |
32014ec9 |
204 | } |
910a0a6d |
205 | |
32014ec9 |
206 | # Now add the edges. |
0068967c |
207 | # print STDERR "Adding graph edges\n"; |
32014ec9 |
208 | foreach my $e ( @{$graph_data->{'edges'}} ) { |
94c00c71 |
209 | my $from = $e->{$SOURCE_KEY}; |
210 | my $to = $e->{$TARGET_KEY}; |
211 | my $class = $e->{$CLASS_KEY}; |
212 | |
213 | # We may have more information depending on the class. |
214 | if( $class eq 'path' ) { |
215 | # We need the witness, and whether it is an 'extra' reading path. |
216 | my $wit = $e->{$WITNESS_KEY}; |
217 | warn "No witness label on path edge!" unless $wit; |
218 | my $extra = $e->{$EXTRA_KEY}; |
219 | my $label = $wit . ( $extra ? $collation->ac_label : '' ); |
220 | $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); |
221 | # Add the witness if we don't have it already. |
222 | unless( $witnesses{$wit} ) { |
223 | $tradition->add_witness( sigil => $wit ); |
224 | $witnesses{$wit} = 1; |
225 | } |
1f7aa795 |
226 | $tradition->witness( $wit )->is_layered( 1 ) if $extra; |
94c00c71 |
227 | } elsif( $class eq 'relationship' ) { |
c9bf3dbf |
228 | # We need the metadata about the relationship. |
229 | my $opts = { 'type' => $e->{$RELATIONSHIP_KEY} }; |
255875b8 |
230 | $opts->{$COLO_KEY} = $e->{$COLO_KEY} |
231 | if exists $e->{$COLO_KEY}; |
232 | $opts->{$CORRECT_KEY} = $e->{$CORRECT_KEY} |
233 | if exists $e->{$CORRECT_KEY}; |
234 | $opts->{$INDEP_KEY} = $e->{$INDEP_KEY} |
235 | if exists $e->{$INDEP_KEY}; |
c9bf3dbf |
236 | warn "No relationship type for relationship edge!" unless $opts->{'type'}; |
decc2a20 |
237 | my( $ok, @result ) = $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, $opts ); |
238 | unless( $ok ) { |
0174d6a9 |
239 | my $relinfo = $opts->{'type'} . ' ' |
240 | . join( ' -> ', $from->{$IDKEY}, $to->{$IDKEY} ); |
241 | warn "Did not add relationship $relinfo: @result"; |
decc2a20 |
242 | } |
94c00c71 |
243 | } |
32014ec9 |
244 | } |
245 | |
246 | ## Deal with node information (transposition, relationships, etc.) that |
247 | ## needs to be processed after all the nodes are created. |
255875b8 |
248 | ## TODO unneeded after conversion |
249 | unless( $use_version ) { |
0068967c |
250 | # print STDERR "Adding second-pass node data\n"; |
255875b8 |
251 | foreach my $nkey ( keys %$extra_data ) { |
252 | foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { |
253 | my $this_reading = $collation->reading( $nkey ); |
254 | if( $edkey eq $TRANSPOS_KEY ) { |
255 | my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); |
256 | $this_reading->set_identical( $other_reading ); |
257 | } else { |
258 | warn "Unfamiliar reading node data $edkey for $nkey"; |
259 | } |
260 | } |
261 | } |
32014ec9 |
262 | } |
32014ec9 |
263 | } |
264 | |
e867486f |
265 | 1; |
266 | |
267 | =head1 BUGS / TODO |
268 | |
269 | =over |
270 | |
271 | =item * Make this into a stream parser with GraphML |
272 | |
273 | =item * Simply field -> attribute correspondence for nodes and edges |
274 | |
275 | =item * Share key name constants with Collation.pm |
276 | |
32014ec9 |
277 | =back |
278 | |
279 | =head1 LICENSE |
280 | |
281 | This package is free software and is provided "as is" without express |
282 | or implied warranty. You can redistribute it and/or modify it under |
283 | the same terms as Perl itself. |
284 | |
285 | =head1 AUTHOR |
286 | |
e867486f |
287 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |