1 package Text::Tradition::Parser::TEI;
6 use XML::LibXML::XPathContext;
10 Text::Tradition::Parser::TEI
14 Parser module for Text::Tradition, given a TEI parallel-segmentation
15 file that describes a text and its variants.
25 Takes an initialized tradition and a string containing the TEI;
26 creates the appropriate nodes and edges on the graph, as well as
27 the appropriate witness objects.
32 my( $tradition, $xml_str ) = @_;
34 # First, parse the XML.
35 my $parser = XML::LibXML->new();
36 my $doc = $parser->parse_string( $xml_str );
37 my $tei = $doc->documentElement();
38 my $xpc = XML::LibXML::XPathContext->new( $tei );
39 $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' );
41 # Then get the witnesses and create the witness objects.
42 foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) {
43 my $sig = $wit_el->getAttribute( 'xml:id' );
44 my $source = $wit_el->toString();
45 $tradition->add_witness( sigil => $sig, source => $source );
48 # Now go through the text and make the tokens.
49 # Assume for now that each word is tokenized in the XML.
51 map { $text->{$_->sigil} = [] } @{$tradition->witnesses};
54 foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) {
55 # If it is contained within a lem or a rdg, look at those witnesses.
56 # Otherwise it is common to all witnesses.
57 # Also common if it is the only lem/rdg within its app.
58 # Thus we are assuming non-nested apps.
60 my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el );
61 my @wits = get_sigla( $parent_rdg );
62 @wits = map { $_->sigil } @{$tradition->witnesses} unless @wits;
65 my $reading = make_reading( $tradition->collation, $word_el );
67 # Figure out if it is a common node, that is, if it is outside an apparatus
68 # or the only rdg in an apparatus
70 if( $xpc->findnodes( 'ancestor::tei:app', $word_el ) ) {
71 # If we are in an app we are not a common node...
73 if( $xpc->findnodes( 'ancestor::tei:app/tei:rdg' )->size == 1 ) {
74 # unless we are the only reading in the app.
78 $reading->make_common if $common;
80 foreach my $sig ( @wits ) {
81 push( @{$text->{$sig}}, $reading );
86 # Now we have the text paths through the witnesses, so we can make
88 my $end = $tradition->collation->add_reading( '#END#' );
89 foreach my $sigil ( keys %$text ) {
90 my @nodes = @{$text->{$sigil}};
91 my $source = $tradition->collation->start;
92 foreach my $n ( @nodes ) {
93 # print STDERR sprintf( "Joining %s -> %s for wit %s\n", $source->text, $n->text, $sigil );
94 $tradition->collation->add_path( $source, $n, $sigil );
97 $tradition->collation->add_path( $source, $end, $sigil );
100 # TODO think about relationships, transpositions, etc.
105 # Cope if we have been handed a NodeList. There is only
107 if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) {
112 if( ref( $rdg ) eq 'XML::LibXML::Element' ) {
113 @wits = split( /\s+/, $rdg->getAttribute( 'wit' ) );
114 map { $_ =~ s/^\#// } @wits;
124 my( $graph, $word_el) = @_;
125 my $xml_id = $word_el->getAttribute( 'xml:id' );
126 if( $xml_id && exists $used_nodeids{$xml_id} ) {
127 warn "Already used assigned ID $xml_id";
132 my $try_id = 'w'.$word_ctr++;
133 next if exists $used_nodeids{$try_id};
137 my $rdg = $graph->add_reading( $xml_id );
138 $rdg->text( $word_el->textContent() );
139 $used_nodeids{$xml_id} = $rdg;