Commit | Line | Data |
f6066bac |
1 | package Text::Tradition::Parser::TEI; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use XML::LibXML; |
6 | use XML::LibXML::XPathContext; |
7 | |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::TEI |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a TEI parallel-segmentation |
15 | file that describes a text and its variants. |
16 | |
17 | =head1 METHODS |
18 | |
19 | =over |
20 | |
21 | =item B<parse> |
22 | |
23 | parse( $tei_string ); |
24 | |
25 | Takes an initialized tradition and a string containing the TEI; |
26 | creates the appropriate nodes and edges on the graph, as well as |
27 | the appropriate witness objects. |
28 | |
29 | =cut |
30 | |
31 | sub parse { |
32 | my( $tradition, $xml_str ) = @_; |
33 | |
34 | # First, parse the XML. |
35 | my $parser = XML::LibXML->new(); |
36 | my $doc = $parser->parse_string( $xml_str ); |
37 | my $tei = $doc->documentElement(); |
f2b9605f |
38 | my $xpc = XML::LibXML::XPathContext->new( $tei ); |
f6066bac |
39 | $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' ); |
40 | |
41 | # Then get the witnesses and create the witness objects. |
42 | foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) { |
43 | my $sig = $wit_el->getAttribute( 'xml:id' ); |
f2b9605f |
44 | my $source = $wit_el->toString(); |
f6066bac |
45 | $tradition->add_witness( sigil => $sig, source => $source ); |
46 | } |
47 | |
48 | # Now go through the text and make the tokens. |
49 | # Assume for now that each word is tokenized in the XML. |
50 | my $text = {}; |
f2b9605f |
51 | map { $text->{$_->sigil} = [] } @{$tradition->witnesses}; |
52 | my $word_ctr = 0; |
53 | my %used_word_ids; |
f6066bac |
54 | foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) { |
55 | # If it is contained within a lem or a rdg, look at those witnesses. |
56 | # Otherwise it is common to all witnesses. |
57 | # Also common if it is the only lem/rdg within its app. |
58 | # Thus we are assuming non-nested apps. |
f2b9605f |
59 | |
f6066bac |
60 | my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el ); |
61 | my @wits = get_sigla( $parent_rdg ); |
f2b9605f |
62 | @wits = map { $_->sigil } @{$tradition->witnesses} unless @wits; |
63 | |
64 | # Create the node |
65 | my $reading = make_reading( $tradition->collation, $word_el ); |
66 | |
67 | # Figure out if it is a common node, that is, if it is outside an apparatus |
68 | # or the only rdg in an apparatus |
69 | my $common = 1; |
70 | if( $xpc->findnodes( 'ancestor::tei:app', $word_el ) ) { |
71 | # If we are in an app we are not a common node... |
72 | $common = 0; |
73 | if( $xpc->findnodes( 'ancestor::tei:app/tei:rdg' )->size == 1 ) { |
74 | # unless we are the only reading in the app. |
75 | $common = 1; |
76 | } |
77 | } |
78 | $reading->make_common if $common; |
79 | |
f6066bac |
80 | foreach my $sig ( @wits ) { |
81 | push( @{$text->{$sig}}, $reading ); |
82 | } |
83 | } |
84 | |
f2b9605f |
85 | $DB::single = 1; |
f6066bac |
86 | # Now we have the text paths through the witnesses, so we can make |
87 | # the edges. |
f2b9605f |
88 | my $end = $tradition->collation->add_reading( '#END#' ); |
89 | foreach my $sigil ( keys %$text ) { |
90 | my @nodes = @{$text->{$sigil}}; |
91 | my $source = $tradition->collation->start; |
92 | foreach my $n ( @nodes ) { |
93 | # print STDERR sprintf( "Joining %s -> %s for wit %s\n", $source->text, $n->text, $sigil ); |
94 | $tradition->collation->add_path( $source, $n, $sigil ); |
95 | $source = $n; |
96 | } |
97 | $tradition->collation->add_path( $source, $end, $sigil ); |
98 | } |
f6066bac |
99 | |
100 | # TODO think about relationships, transpositions, etc. |
101 | } |
102 | |
103 | sub get_sigla { |
104 | my( $rdg ) = @_; |
105 | # Cope if we have been handed a NodeList. There is only |
106 | # one reading here. |
107 | if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) { |
108 | $rdg = $rdg->shift; |
109 | } |
110 | |
111 | my @wits; |
112 | if( ref( $rdg ) eq 'XML::LibXML::Element' ) { |
f2b9605f |
113 | @wits = split( /\s+/, $rdg->getAttribute( 'wit' ) ); |
f6066bac |
114 | map { $_ =~ s/^\#// } @wits; |
115 | } |
116 | return @wits; |
117 | } |
118 | |
f2b9605f |
119 | { |
120 | my $word_ctr = 0; |
121 | my %used_nodeids; |
122 | |
123 | sub make_reading { |
124 | my( $graph, $word_el) = @_; |
125 | my $xml_id = $word_el->getAttribute( 'xml:id' ); |
126 | if( $xml_id && exists $used_nodeids{$xml_id} ) { |
127 | warn "Already used assigned ID $xml_id"; |
128 | $xml_id = undef; |
129 | } |
130 | if( !$xml_id ) { |
131 | until( $xml_id ) { |
132 | my $try_id = 'w'.$word_ctr++; |
133 | next if exists $used_nodeids{$try_id}; |
134 | $xml_id = $try_id; |
135 | } |
136 | } |
137 | my $rdg = $graph->add_reading( $xml_id ); |
138 | $rdg->text( $word_el->textContent() ); |
139 | $used_nodeids{$xml_id} = $rdg; |
140 | return $rdg; |
141 | } |
142 | } |
143 | |
f6066bac |
144 | 1; |