Commit | Line | Data |
cda6a45b |
1 | package Text::Tradition::Parser::CollateX; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Text::Tradition::Parser::GraphML; |
6 | |
7 | =head1 NAME |
8 | |
9 | Text::Tradition::Parser::CollateX |
10 | |
11 | =head1 DESCRIPTION |
12 | |
13 | Parser module for Text::Tradition, given a GraphML file from the |
14 | CollateX program that describes a collation graph. For further |
15 | information on the GraphML format for text collation, see |
16 | http://gregor.middell.net/collatex/ |
17 | |
18 | =head1 METHODS |
19 | |
20 | =over |
21 | |
22 | =item B<parse> |
23 | |
24 | parse( $graph, $graphml_string ); |
25 | |
26 | Takes an initialized Text::Tradition::Graph object and a string |
27 | containing the GraphML; creates the appropriate nodes and edges on the |
28 | graph. |
29 | |
30 | =cut |
31 | |
32 | my $IDKEY = 'number'; |
33 | my $CONTENTKEY = 'token'; |
34 | my $TRANSKEY = 'identical'; |
35 | |
36 | sub parse { |
37 | my( $tradition, $graphml_str ) = @_; |
38 | my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); |
39 | my $collation = $tradition->collation; |
40 | my %witnesses; # Keep track of the witnesses we encounter as we |
41 | # run through the graph data. |
42 | |
43 | # Add the nodes to the graph. First delete the start node, because |
44 | # GraphML graphs will have their own start nodes. |
45 | $collation->del_reading( $collation->start() ); |
910a0a6d |
46 | $collation->del_reading( $collation->end() ); |
cda6a45b |
47 | |
48 | my $extra_data = {}; # Keep track of info to be processed after all |
49 | # nodes have been created |
50 | foreach my $n ( @{$graph_data->{'nodes'}} ) { |
910a0a6d |
51 | my %node_data = %$n; |
52 | my $nodeid = delete $node_data{$IDKEY}; |
53 | my $token = delete $node_data{$CONTENTKEY}; |
54 | unless( defined $nodeid && defined $token ) { |
55 | warn "Did not find an ID or token for graph node, can't add it"; |
56 | next; |
57 | } |
58 | my $gnode = $collation->add_reading( $nodeid ); |
59 | $gnode->text( $token ); |
60 | |
61 | # Whatever is left is extra info to be processed later. |
62 | if( keys %node_data ) { |
63 | $extra_data->{$nodeid} = \%node_data; |
64 | } |
cda6a45b |
65 | } |
910a0a6d |
66 | |
cda6a45b |
67 | # Now add the edges. |
68 | foreach my $e ( @{$graph_data->{'edges'}} ) { |
910a0a6d |
69 | my %edge_data = %$e; |
70 | my $from = delete $edge_data{'source'}; |
71 | my $to = delete $edge_data{'target'}; |
72 | |
73 | # In CollateX, we have a distinct witness data ID per witness, |
74 | # so that we can have multiple witnesses per edge. We want to |
75 | # translate this to one witness per edge in our own |
76 | # representation. |
77 | foreach my $ekey ( keys %edge_data ) { |
78 | my $wit = $edge_data{$ekey}; |
79 | # Create the witness object if it does not yet exist. |
80 | unless( $witnesses{$wit} ) { |
81 | $tradition->add_witness( 'sigil' => $wit ); |
82 | $witnesses{$wit} = 1; |
83 | } |
84 | $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); |
85 | } |
cda6a45b |
86 | } |
87 | |
88 | # Process the extra node data if it exists. |
89 | foreach my $nodeid ( keys %$extra_data ) { |
910a0a6d |
90 | my $ed = $extra_data->{$nodeid}; |
91 | if( exists $ed->{$TRANSKEY} ) { |
92 | |
93 | my $tn_reading = $collation->reading( $nodeid ); |
94 | my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); |
95 | if( $collation->linear ) { |
96 | $tn_reading->set_identical( $main_reading ); |
97 | } else { |
98 | $collation->merge_readings( $main_reading, $tn_reading ); |
99 | } |
100 | } # else we don't have any other tags to process yet. |
cda6a45b |
101 | } |
102 | |
103 | # Find the beginning and end nodes of the graph. The beginning node |
104 | # has no incoming edges; the end node has no outgoing edges. |
105 | my( $begin_node, $end_node ); |
106 | foreach my $gnode ( $collation->readings() ) { |
910a0a6d |
107 | # print STDERR "Checking node " . $gnode->name . "\n"; |
108 | my @outgoing = $gnode->outgoing(); |
109 | my @incoming = $gnode->incoming(); |
110 | |
111 | unless( scalar @incoming ) { |
112 | warn "Already have a beginning node" if $begin_node; |
113 | $begin_node = $gnode; |
114 | $collation->start( $gnode ); |
115 | } |
116 | unless( scalar @outgoing ) { |
117 | warn "Already have an ending node" if $end_node; |
118 | $end_node = $gnode; |
119 | $collation->end( $gnode ); |
120 | } |
cda6a45b |
121 | } |
d9e873d0 |
122 | |
123 | # TODO Need to populate $wit->path / uncorrected_path |
cda6a45b |
124 | |
7e450e44 |
125 | # Now we have added the witnesses and their paths, so we can |
126 | # calculate their explicit positions. |
127 | # TODO CollateX does this, and we should just have it exported there. |
d9e873d0 |
128 | $collation->calculate_ranks(); |
cda6a45b |
129 | } |
130 | |
131 | =back |
132 | |
133 | =head1 LICENSE |
134 | |
135 | This package is free software and is provided "as is" without express |
136 | or implied warranty. You can redistribute it and/or modify it under |
137 | the same terms as Perl itself. |
138 | |
139 | =head1 AUTHOR |
140 | |
141 | Tara L Andrews, aurum@cpan.org |
142 | |
143 | =cut |
144 | |
145 | 1; |