Commit | Line | Data |
f6066bac |
1 | package Text::Tradition::Parser::TEI; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use XML::LibXML; |
6 | use XML::LibXML::XPathContext; |
7 | |
8 | =head1 NAME |
9 | |
10 | Text::Tradition::Parser::TEI |
11 | |
12 | =head1 DESCRIPTION |
13 | |
14 | Parser module for Text::Tradition, given a TEI parallel-segmentation |
15 | file that describes a text and its variants. |
16 | |
17 | =head1 METHODS |
18 | |
19 | =over |
20 | |
21 | =item B<parse> |
22 | |
23 | parse( $tei_string ); |
24 | |
25 | Takes an initialized tradition and a string containing the TEI; |
26 | creates the appropriate nodes and edges on the graph, as well as |
27 | the appropriate witness objects. |
28 | |
29 | =cut |
30 | |
31 | sub parse { |
32 | my( $tradition, $xml_str ) = @_; |
33 | |
34 | # First, parse the XML. |
35 | my $parser = XML::LibXML->new(); |
36 | my $doc = $parser->parse_string( $xml_str ); |
37 | my $tei = $doc->documentElement(); |
38 | $xpc = XML::LibXML::XPathContext->new( $tei ); |
39 | $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' ); |
40 | |
41 | # Then get the witnesses and create the witness objects. |
42 | foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) { |
43 | my $sig = $wit_el->getAttribute( 'xml:id' ); |
44 | my $source = $wit_el->toString(); # Save all the XML info we have |
45 | $tradition->add_witness( sigil => $sig, source => $source ); |
46 | } |
47 | |
48 | # Now go through the text and make the tokens. |
49 | # Assume for now that each word is tokenized in the XML. |
50 | my $text = {}; |
51 | map { $text->{$_->sigil} = [ $tradition->start ] } @{$tradition->witnesses}; |
52 | foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) { |
53 | # If it is contained within a lem or a rdg, look at those witnesses. |
54 | # Otherwise it is common to all witnesses. |
55 | # Also common if it is the only lem/rdg within its app. |
56 | # Thus we are assuming non-nested apps. |
57 | my $node_id = $word_el->getAttribute( 'xml:id' ); |
58 | my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el ); |
59 | my @wits = get_sigla( $parent_rdg ); |
60 | @wits = map { $_->sigil } @{$tradition->witnesses} unless $wits; |
61 | |
62 | # TODO Create the node |
63 | my $reading = $word_el->textContent(); |
64 | |
65 | # TODO Figure out if it is a common node |
66 | |
67 | foreach my $sig ( @wits ) { |
68 | push( @{$text->{$sig}}, $reading ); |
69 | } |
70 | } |
71 | |
72 | # Now we have the text paths through the witnesses, so we can make |
73 | # the edges. |
74 | |
75 | # TODO think about relationships, transpositions, etc. |
76 | } |
77 | |
78 | sub get_sigla { |
79 | my( $rdg ) = @_; |
80 | # Cope if we have been handed a NodeList. There is only |
81 | # one reading here. |
82 | if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) { |
83 | $rdg = $rdg->shift; |
84 | } |
85 | |
86 | my @wits; |
87 | if( ref( $rdg ) eq 'XML::LibXML::Element' ) { |
88 | @wits = split( /\s+/, $rdg->get_attribute( 'wit' ) ); |
89 | map { $_ =~ s/^\#// } @wits; |
90 | } |
91 | return @wits; |
92 | } |
93 | |
94 | 1; |