checkpoint, not sure what is here
[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / TEI.pm
CommitLineData
f6066bac 1package Text::Tradition::Parser::TEI;
2
3use strict;
4use warnings;
5use XML::LibXML;
6use XML::LibXML::XPathContext;
7
8=head1 NAME
9
10Text::Tradition::Parser::TEI
11
12=head1 DESCRIPTION
13
14Parser module for Text::Tradition, given a TEI parallel-segmentation
15file that describes a text and its variants.
16
17=head1 METHODS
18
19=over
20
21=item B<parse>
22
23parse( $tei_string );
24
25Takes an initialized tradition and a string containing the TEI;
26creates the appropriate nodes and edges on the graph, as well as
27the appropriate witness objects.
28
29=cut
30
31sub parse {
32 my( $tradition, $xml_str ) = @_;
33
34 # First, parse the XML.
35 my $parser = XML::LibXML->new();
36 my $doc = $parser->parse_string( $xml_str );
37 my $tei = $doc->documentElement();
38 $xpc = XML::LibXML::XPathContext->new( $tei );
39 $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' );
40
41 # Then get the witnesses and create the witness objects.
42 foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) {
43 my $sig = $wit_el->getAttribute( 'xml:id' );
44 my $source = $wit_el->toString(); # Save all the XML info we have
45 $tradition->add_witness( sigil => $sig, source => $source );
46 }
47
48 # Now go through the text and make the tokens.
49 # Assume for now that each word is tokenized in the XML.
50 my $text = {};
51 map { $text->{$_->sigil} = [ $tradition->start ] } @{$tradition->witnesses};
52 foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) {
53 # If it is contained within a lem or a rdg, look at those witnesses.
54 # Otherwise it is common to all witnesses.
55 # Also common if it is the only lem/rdg within its app.
56 # Thus we are assuming non-nested apps.
57 my $node_id = $word_el->getAttribute( 'xml:id' );
58 my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el );
59 my @wits = get_sigla( $parent_rdg );
60 @wits = map { $_->sigil } @{$tradition->witnesses} unless $wits;
61
62 # TODO Create the node
63 my $reading = $word_el->textContent();
64
65 # TODO Figure out if it is a common node
66
67 foreach my $sig ( @wits ) {
68 push( @{$text->{$sig}}, $reading );
69 }
70 }
71
72 # Now we have the text paths through the witnesses, so we can make
73 # the edges.
74
75 # TODO think about relationships, transpositions, etc.
76}
77
78sub get_sigla {
79 my( $rdg ) = @_;
80 # Cope if we have been handed a NodeList. There is only
81 # one reading here.
82 if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) {
83 $rdg = $rdg->shift;
84 }
85
86 my @wits;
87 if( ref( $rdg ) eq 'XML::LibXML::Element' ) {
88 @wits = split( /\s+/, $rdg->get_attribute( 'wit' ) );
89 map { $_ =~ s/^\#// } @wits;
90 }
91 return @wits;
92}
93
941;