Commit | Line | Data |
cda6a45b |
1 | package Text::Tradition::Parser::CollateX; |
2 | |
3 | use strict; |
4 | use warnings; |
1f7aa795 |
5 | use Text::Tradition::Parser::GraphML qw/ graphml_parse /; |
cda6a45b |
6 | |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::CollateX |
10 | |
e867486f |
11 | =head1 SYNOPSIS |
12 | |
13 | use Text::Tradition; |
14 | |
15 | my $t_from_file = Text::Tradition->new( |
16 | 'name' => 'my text', |
17 | 'input' => 'CollateX', |
18 | 'file' => '/path/to/collation.xml' |
19 | ); |
20 | |
21 | my $t_from_string = Text::Tradition->new( |
22 | 'name' => 'my text', |
23 | 'input' => 'CollateX', |
24 | 'string' => $collation_xml, |
25 | ); |
26 | |
cda6a45b |
27 | =head1 DESCRIPTION |
28 | |
29 | Parser module for Text::Tradition, given a GraphML file from the |
30 | CollateX program that describes a collation graph. For further |
31 | information on the GraphML format for text collation, see |
32 | http://gregor.middell.net/collatex/ |
33 | |
34 | =head1 METHODS |
35 | |
e867486f |
36 | =head2 B<parse> |
cda6a45b |
37 | |
dfc37e38 |
38 | parse( $tradition, $init_options ); |
cda6a45b |
39 | |
e867486f |
40 | Takes an initialized Text::Tradition object and a set of options; creates |
41 | the appropriate nodes and edges on the graph. The options hash should |
42 | include either a 'file' argument or a 'string' argument, depending on the |
43 | source of the XML to be parsed. |
44 | |
45 | =begin testing |
46 | |
47 | use Text::Tradition; |
48 | binmode STDOUT, ":utf8"; |
49 | binmode STDERR, ":utf8"; |
50 | eval { no warnings; binmode $DB::OUT, ":utf8"; }; |
51 | |
52 | my $cxfile = 't/data/Collatex-16.xml'; |
53 | my $t = Text::Tradition->new( |
54 | 'name' => 'inline', |
55 | 'input' => 'CollateX', |
56 | 'file' => $cxfile, |
57 | ); |
58 | |
59 | is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" ); |
60 | if( $t ) { |
61 | is( scalar $t->collation->readings, 26, "Collation has all readings" ); |
62 | is( scalar $t->collation->paths, 49, "Collation has all paths" ); |
63 | is( scalar $t->witnesses, 3, "Collation has all witnesses" ); |
64 | |
65 | # Check an 'identical' node |
66 | my $transposed = $t->collation->reading( 'n15' ); |
67 | ok( $transposed->has_primary, "Reading links to transposed primary" ); |
68 | is( $transposed->primary->name, 'n17', "Correct transposition link" ); |
69 | } |
70 | |
71 | =end testing |
cda6a45b |
72 | |
73 | =cut |
74 | |
75 | my $IDKEY = 'number'; |
76 | my $CONTENTKEY = 'token'; |
77 | my $TRANSKEY = 'identical'; |
78 | |
79 | sub parse { |
dfc37e38 |
80 | my( $tradition, $opts ) = @_; |
e867486f |
81 | my $graph_data = graphml_parse( $opts ); |
cda6a45b |
82 | my $collation = $tradition->collation; |
83 | my %witnesses; # Keep track of the witnesses we encounter as we |
84 | # run through the graph data. |
85 | |
86 | # Add the nodes to the graph. First delete the start node, because |
87 | # GraphML graphs will have their own start nodes. |
88 | $collation->del_reading( $collation->start() ); |
910a0a6d |
89 | $collation->del_reading( $collation->end() ); |
cda6a45b |
90 | |
91 | my $extra_data = {}; # Keep track of info to be processed after all |
92 | # nodes have been created |
93 | foreach my $n ( @{$graph_data->{'nodes'}} ) { |
910a0a6d |
94 | my %node_data = %$n; |
95 | my $nodeid = delete $node_data{$IDKEY}; |
96 | my $token = delete $node_data{$CONTENTKEY}; |
97 | unless( defined $nodeid && defined $token ) { |
98 | warn "Did not find an ID or token for graph node, can't add it"; |
99 | next; |
100 | } |
101 | my $gnode = $collation->add_reading( $nodeid ); |
102 | $gnode->text( $token ); |
103 | |
104 | # Whatever is left is extra info to be processed later. |
105 | if( keys %node_data ) { |
106 | $extra_data->{$nodeid} = \%node_data; |
107 | } |
cda6a45b |
108 | } |
910a0a6d |
109 | |
cda6a45b |
110 | # Now add the edges. |
111 | foreach my $e ( @{$graph_data->{'edges'}} ) { |
910a0a6d |
112 | my %edge_data = %$e; |
113 | my $from = delete $edge_data{'source'}; |
114 | my $to = delete $edge_data{'target'}; |
115 | |
116 | # In CollateX, we have a distinct witness data ID per witness, |
117 | # so that we can have multiple witnesses per edge. We want to |
118 | # translate this to one witness per edge in our own |
119 | # representation. |
120 | foreach my $ekey ( keys %edge_data ) { |
121 | my $wit = $edge_data{$ekey}; |
122 | # Create the witness object if it does not yet exist. |
123 | unless( $witnesses{$wit} ) { |
124 | $tradition->add_witness( 'sigil' => $wit ); |
125 | $witnesses{$wit} = 1; |
126 | } |
127 | $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); |
128 | } |
cda6a45b |
129 | } |
130 | |
131 | # Process the extra node data if it exists. |
132 | foreach my $nodeid ( keys %$extra_data ) { |
910a0a6d |
133 | my $ed = $extra_data->{$nodeid}; |
134 | if( exists $ed->{$TRANSKEY} ) { |
135 | |
136 | my $tn_reading = $collation->reading( $nodeid ); |
137 | my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); |
138 | if( $collation->linear ) { |
139 | $tn_reading->set_identical( $main_reading ); |
140 | } else { |
141 | $collation->merge_readings( $main_reading, $tn_reading ); |
142 | } |
143 | } # else we don't have any other tags to process yet. |
cda6a45b |
144 | } |
145 | |
146 | # Find the beginning and end nodes of the graph. The beginning node |
147 | # has no incoming edges; the end node has no outgoing edges. |
148 | my( $begin_node, $end_node ); |
149 | foreach my $gnode ( $collation->readings() ) { |
910a0a6d |
150 | # print STDERR "Checking node " . $gnode->name . "\n"; |
151 | my @outgoing = $gnode->outgoing(); |
152 | my @incoming = $gnode->incoming(); |
153 | |
154 | unless( scalar @incoming ) { |
155 | warn "Already have a beginning node" if $begin_node; |
156 | $begin_node = $gnode; |
157 | $collation->start( $gnode ); |
158 | } |
159 | unless( scalar @outgoing ) { |
160 | warn "Already have an ending node" if $end_node; |
161 | $end_node = $gnode; |
162 | $collation->end( $gnode ); |
163 | } |
cda6a45b |
164 | } |
d9e873d0 |
165 | |
e867486f |
166 | # Rank the readings. |
94a077d6 |
167 | $collation->calculate_ranks() if $collation->linear; |
cda6a45b |
168 | } |
169 | |
e867486f |
170 | =head1 BUGS / TODO |
171 | |
172 | =over |
173 | |
174 | =item * Make this into a stream parser with GraphML |
175 | |
176 | =item * Use CollateX-calculated ranks instead of recalculating our own |
177 | |
cda6a45b |
178 | =back |
179 | |
180 | =head1 LICENSE |
181 | |
182 | This package is free software and is provided "as is" without express |
183 | or implied warranty. You can redistribute it and/or modify it under |
184 | the same terms as Perl itself. |
185 | |
186 | =head1 AUTHOR |
187 | |
188 | Tara L Andrews, aurum@cpan.org |
189 | |
190 | =cut |
191 | |
192 | 1; |