[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CTE.pm

package Text::Tradition::Parser::CTE;

use strict;
use warnings;
use XML::LibXML;
use XML::LibXML::XPathContext;

=head1 NAME

Text::Tradition::Parser::CTE

=head1 DESCRIPTION

Parser module for Text::Tradition, given a TEI file exported from
Classical Text Editor.

=head1 METHODS

=over

=item B<parse>

my @apparatus = read( $xml_file );

Takes a Tradition object and a TEI file exported from Classical Text
Editor; initializes the Tradition from the file.

=cut

my %seg_readings;  # Save the XML IDs for apparatus anchors.
my %sigil_for;     # Save the XML IDs for witnesses.
my %note_start;    # Save the readings where an apparatus entry is attached.

sub parse {
    my( $tradition, $xml_str ) = @_;
    my $c = $tradition->collation;  # Some shorthand
    
    # First, parse the XML.
    my $parser = XML::LibXML->new();
    my $doc = $parser->parse_string( $xml_str );
    my $tei = $doc->documentElement();
    my $xpc = XML::LibXML::XPathContext->new( $tei );

    # CTE uses a DTD rather than any xmlns-based parsing.  Thus we
    # need no namespace foo.

    # Get the witnesses and create the witness objects.
    foreach my $wit_el ( $xpc->findnodes( '//sourceDesc/listWit/witness' ) ) {
	# The witness xml:id is used internally, and is *not* the sigil name.
	my $id= $wit_el->getAttribute( 'xml:id' );
	$id =~ s/^M/sig/;  # Stupid but there you go.
	my @sig_parts = $xpc->findnodes( './abbr/descendant::text()', $wit_el );
	my $sig = join( '', grep { /\w/ } @sig_parts );
	$tradition->add_witness( sigil => $sig, source => $wit_el->toString() );
	$sigil_for{$id} = $sig;
    }

    # Now go through the text and find the base tokens.  Tokens are
    # either plain text to be split on whitespace, or they are wrapped
    # in <hi/> or <seg/> elements.
    my @base_text;
    my $ctr = 1;
    foreach my $pg_el ( $xpc->findnodes( '/TEI/text/p' ) ) {
	foreach my $xn ( $pg_el->childNodes ) {
	    push( @base_text, _get_readings( $tradition, $xn ) );
	}
    }

    # String together the base.
    my $source = $c->start;
    foreach my $b ( @base_text ) {
	$c->add_path( $source, $b, $c->baselabel );
	$source = $b;
    }
    $c->add_path( $source, $c->add_reading('#END#'), $c->baselabel );
		
    # Now go through the text and find all the apparatus notes, and parse them.
    foreach my $note_el( $xpc->findnodes( '//note[attribute::type = "a1"]' ) ) {
	my $app_start = $note_start{$note_el};
	my $apparatus = _parse_note( $note_el, $c, $app_start );
    }
}


## Recursive little helper function to help us navigate through nested
## XML, picking out the text.

sub _get_readings {
    my( $tradition, $xn ) = @_;
    my @readings;
    if( $xn->nodeType == XML_TEXT_NODE ) {
	my $str = $xn->data;
	$str =~ s/^\s+//;
	foreach my $w ( split( /\s+/, $str ) ) {
	    my $rdg = $tradition->collation->add_reading( 'n'.$ctr++ );
	    $rdg->text( $w );
	    push( @readings, $rdg );
	}
    } elsif( $xn->nodeName eq 'hi' ) {
	foreach( $xn->childNodes ) {
	    # Recurse as if the hi weren't there.
	    push( @readings, _get_readings( $tradition, $_ ) );
	}
    } elsif( $xn->nodeName eq 'seg' ) {
	# Read the reading, but also add the word in question as an anchor.
	my $seg_id = $xn->getAttribute( 'xml:id' );
	my @r;
	foreach( $xn->childNodes ) {
	    push( @r, _get_readings( $tradition, $_ ) );
	}
	warn "More than one reading found in seg $seg_id" unless @r == 1;
	$seg_readings{'#'.$seg_id} = $r[0];
	push( @readings, @r );
    } elsif( $xn->nodeName eq 'note' ) {
	# Save where we found this note.
	$note_start{$xn} = $readings[-1];
    }
    return @readings;
}

## Helper function to parse apparatus entries.  This could get nasty, I mean fun.
sub _parse_note {
    my( $xn, $c, $app_start ) = @_;
    my $app_end = $seg_readings{$xn->getAttribute( 'targetEnd' )};
    my $lemma = join( ' ', map { $_->text } $c->reading_sequence( $app_start, $app_end ) );

    my %seen_wits;
    # TODO A list of active witnesses should be passed really.
    map { $seen_wits{$_} = 0 } vals( %sigil_for );

    # The note has a <p/> tag, then <mentioned/>, then 0-1 text nodes,
    # then an assortment of <hi/> or <abbr/> elements.  If the hi
    # contains an abbr, then it goes before, otherwise it probably
    # goes after.
    my @p = $xn->getChildrenByTagName( 'p' );
    warn "More than one pg in note" unless @p == 1;

    # Strip the <hi/> elements.
    my @childnodes;
    foreach ( $p[0]->childNodes ) {
	if( $_->nodeName eq 'hi' ) {
	    push( @childnodes, $_->childNodes );
	} else {
	    push( @childnodes, $_ );
	}
    }
	
    # Go through and try to parse the sucker.
    my $apparatus;
    my $curr_rdg = '';
    my $reading_sigla = 0;
    my @curr_wits;
    foreach my $pxn ( $p[0]->childNodes ) {
	next if $pxn->nodeName eq 'mentioned';  # Redundant for us.
	if( $pxn->nodeType == XML_TEXT_NODE ) {
	    my $pxn_str = $pxn->data;
	    $pxn_str =~ s/^\s+//;
	    $pxn_str =~ s/\s+$//;
	    my @parts = split( /,\s*/, $pxn_str );
	    if( @parts > 1 ) {
		# Comma separation means that we are starting a new reading.
		my $last = shift @parts;
		if( $last =~ /^\s*a\.\s*c\.\s*$/ ) {
		    my $sig = pop @curr_wits;
		    $sig .= '_ac';
		    push( @curr_wits, $sig );
		}
		$pxn_str = join( ', ', @parts );
		# Trigger a reading interpretation.
		$reading_sigla = 1;
	    }
	    if( $reading_sigla ) {
		my @wits = keys %curr_wits;
		$apparatus->{ interpret( $curr_rdg, $lemma ) } = \@wits;
		$curr_rdg = '';
		$reading_sigla = 0;
		@curr_wits = ();
	    }

	    if( $pxn_str =~ /^\s*a\.\s*c\.\s*$/ ) {
		my $sig = pop @curr_wits;
		$sig .= '_ac';
		push( @curr_wits, $sig );
	    } else {
		$curr_rdg .= $pxn_str;
	    }
	} elsif( $pxn->nodeName eq 'abbr' ) {
	    # It is a witness, stick it in @curr_wits
	    my $wit = $sigil_for{$pxn->getAttribute( 'n' )}
	    push( @curr_wits, $wit ) unless $curr_wits[-1] eq $wit;
	    $seen_wits{$wit} += 1;  # Keep track of a 'seen' count in case there is an a.c.
	    $reading_sigla = 1;
	}
    }
    $apparatus->{ interpret( $curr_rdg, $lemma ) } = \@wits if $curr_rdg;
    $apparatus->{ $lemma } = grep { $seen_wits{$_} == 0 } keys %seen_wits;

    return $apparatus;
}


sub interpret {
    # A utility function to change apparatus-ese into a full variant.
    my( $reading, $lemma ) = @_;
    return $reading if $reading eq $lemma;
    my $oldreading = $reading;
    $lemma =~ s/\s+[[:punct:]]+$//;
    $reading =~ s/\s*\(?sic([\s\w.]+)?\)?$//;
    my @words = split( /\s+/, $lemma );
    if( $reading =~ /^(.*) praem.$/ ) {
	$reading = "$1 $lemma";
    } elsif( $reading =~ /^(.*) add.$/ ) {
        $reading = "$lemma $1";
    } elsif( $reading eq 'om.' ) {
	$reading = '';
    } elsif( $reading eq 'inv.' ) {
        # Hope it is two words.
        print STDERR "WARNING: want to invert a lemma that is not two words\n" 
            unless scalar( @words ) == 2;
        $reading = join( ' ', reverse( @words ) );
    } elsif( $reading eq 'iter.' ) {
        # Repeat the lemma
	$reading = "$lemma $lemma";
     } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) {
        # The first and last N words captured should replace the first and
        # last N words of the lemma.
	my @begin = split( /\s+/, $1 );
	my @end = split( /\s+/, $2 );
	if( scalar( @begin ) + scalar ( @end ) > scalar( @words ) ) {
            # Something is wrong and we can't do the splice.
	    print STDERR "ERROR: $lemma is too short to accommodate $oldreading\n";
	} else {
	    splice( @words, 0, scalar @begin, @begin );
	    splice( @words, -(scalar @end), scalar @end, @end );
	    $reading = join( ' ', @words );
	}
    }
    print STDERR "Interpreted $oldreading as $reading given $lemma\n";
    return $reading;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
6f4946fb	1	package Text::Tradition::Parser::CTE;
	2
	3	use strict;
	4	use warnings;
	5	use XML::LibXML;
	6	use XML::LibXML::XPathContext;
	7
	8	=head1 NAME
	9
	10	Text::Tradition::Parser::CTE
	11
	12	=head1 DESCRIPTION
	13
	14	Parser module for Text::Tradition, given a TEI file exported from
	15	Classical Text Editor.
	16
	17	=head1 METHODS
	18
	19	=over
	20
	21	=item B<parse>
	22
	23	my @apparatus = read( $xml_file );
	24
	25	Takes a Tradition object and a TEI file exported from Classical Text
	26	Editor; initializes the Tradition from the file.
	27
	28	=cut
	29
	30	my %seg_readings; # Save the XML IDs for apparatus anchors.
	31	my %sigil_for; # Save the XML IDs for witnesses.
	32	my %note_start; # Save the readings where an apparatus entry is attached.
	33
	34	sub parse {
	35	my( $tradition, $xml_str ) = @_;
	36	my $c = $tradition->collation; # Some shorthand
	37
	38	# First, parse the XML.
	39	my $parser = XML::LibXML->new();
	40	my $doc = $parser->parse_string( $xml_str );
	41	my $tei = $doc->documentElement();
	42	my $xpc = XML::LibXML::XPathContext->new( $tei );
	43
	44	# CTE uses a DTD rather than any xmlns-based parsing. Thus we
	45	# need no namespace foo.
	46
	47	# Get the witnesses and create the witness objects.
	48	foreach my $wit_el ( $xpc->findnodes( '//sourceDesc/listWit/witness' ) ) {
	49	# The witness xml:id is used internally, and is not the sigil name.
	50	my $id= $wit_el->getAttribute( 'xml:id' );
	51	$id =~ s/^M/sig/; # Stupid but there you go.
	52	my @sig_parts = $xpc->findnodes( './abbr/descendant::text()', $wit_el );
	53	my $sig = join( '', grep { /\w/ } @sig_parts );
	54	$tradition->add_witness( sigil => $sig, source => $wit_el->toString() );
	55	$sigil_for{$id} = $sig;
	56	}
	57
	58	# Now go through the text and find the base tokens. Tokens are
	59	# either plain text to be split on whitespace, or they are wrapped
	60	# in <hi/> or <seg/> elements.
	61	my @base_text;
	62	my $ctr = 1;
	63	foreach my $pg_el ( $xpc->findnodes( '/TEI/text/p' ) ) {
	64	foreach my $xn ( $pg_el->childNodes ) {
65	push( @base_text, _get_readings( $tradition, $xn ) );
66	}
67	}
68
69	# String together the base.
70	my $source = $c->start;
71	foreach my $b ( @base_text ) {
72	$c->add_path( $source, $b, $c->baselabel );
73	$source = $b;
74	}
75	$c->add_path( $source, $c->add_reading('#END#'), $c->baselabel );
76
77	# Now go through the text and find all the apparatus notes, and parse them.
78	foreach my $note_el( $xpc->findnodes( '//note[attribute::type = "a1"]' ) ) {
79	my $app_start = $note_start{$note_el};
80	my $apparatus = _parse_note( $note_el, $c, $app_start );
81	}
82	}
83
84
85	## Recursive little helper function to help us navigate through nested
86	## XML, picking out the text.
87
88	sub _get_readings {
89	my( $tradition, $xn ) = @_;
90	my @readings;
91	if( $xn->nodeType == XML_TEXT_NODE ) {
92	my $str = $xn->data;
93	$str =~ s/^\s+//;
94	foreach my $w ( split( /\s+/, $str ) ) {
95	my $rdg = $tradition->collation->add_reading( 'n'.$ctr++ );
96	$rdg->text( $w );
97	push( @readings, $rdg );
98	}
99	} elsif( $xn->nodeName eq 'hi' ) {
100	foreach( $xn->childNodes ) {
101	# Recurse as if the hi weren't there.
102	push( @readings, _get_readings( $tradition, $_ ) );
103	}
104	} elsif( $xn->nodeName eq 'seg' ) {
105	# Read the reading, but also add the word in question as an anchor.
106	my $seg_id = $xn->getAttribute( 'xml:id' );
107	my @r;
108	foreach( $xn->childNodes ) {
109	push( @r, _get_readings( $tradition, $_ ) );
110	}
111	warn "More than one reading found in seg $seg_id" unless @r == 1;
112	$seg_readings{'#'.$seg_id} = $r[0];
113	push( @readings, @r );
114	} elsif( $xn->nodeName eq 'note' ) {
115	# Save where we found this note.
116	$note_start{$xn} = $readings[-1];
117	}
118	return @readings;
119	}
120
121	## Helper function to parse apparatus entries. This could get nasty, I mean fun.
122	sub _parse_note {
123	my( $xn, $c, $app_start ) = @_;
124	my $app_end = $seg_readings{$xn->getAttribute( 'targetEnd' )};
125	my $lemma = join( ' ', map { $_->text } $c->reading_sequence( $app_start, $app_end ) );
126
127	my %seen_wits;
128	# TODO A list of active witnesses should be passed really.
129	map { $seen_wits{$_} = 0 } vals( %sigil_for );
130
131	# The note has a <p/> tag, then <mentioned/>, then 0-1 text nodes,
132	# then an assortment of <hi/> or <abbr/> elements. If the hi
133	# contains an abbr, then it goes before, otherwise it probably
134	# goes after.
135	my @p = $xn->getChildrenByTagName( 'p' );
136	warn "More than one pg in note" unless @p == 1;
137
138	# Strip the <hi/> elements.
139	my @childnodes;
140	foreach ( $p[0]->childNodes ) {
141	if( $_->nodeName eq 'hi' ) {
142	push( @childnodes, $_->childNodes );
143	} else {
144	push( @childnodes, $_ );
145	}
146	}
147
148	# Go through and try to parse the sucker.
149	my $apparatus;
150	my $curr_rdg = '';
151	my $reading_sigla = 0;
152	my @curr_wits;
153	foreach my $pxn ( $p[0]->childNodes ) {
154	next if $pxn->nodeName eq 'mentioned'; # Redundant for us.
155	if( $pxn->nodeType == XML_TEXT_NODE ) {
156	my $pxn_str = $pxn->data;
157	$pxn_str =~ s/^\s+//;
158	$pxn_str =~ s/\s+$//;
159	my @parts = split( /,\s*/, $pxn_str );
160	if( @parts > 1 ) {
161	# Comma separation means that we are starting a new reading.
162	my $last = shift @parts;
163	if( $last =~ /^\sa\.\sc\.\s*$/ ) {
164	my $sig = pop @curr_wits;
165	$sig .= '_ac';
166	push( @curr_wits, $sig );
167	}
168	$pxn_str = join( ', ', @parts );
169	# Trigger a reading interpretation.
170	$reading_sigla = 1;
171	}
172	if( $reading_sigla ) {
173	my @wits = keys %curr_wits;
174	$apparatus->{ interpret( $curr_rdg, $lemma ) } = \@wits;
175	$curr_rdg = '';
176	$reading_sigla = 0;
177	@curr_wits = ();
178	}
179
180	if( $pxn_str =~ /^\sa\.\sc\.\s*$/ ) {
181	my $sig = pop @curr_wits;
182	$sig .= '_ac';
183	push( @curr_wits, $sig );
184	} else {
185	$curr_rdg .= $pxn_str;
186	}
187	} elsif( $pxn->nodeName eq 'abbr' ) {
188	# It is a witness, stick it in @curr_wits
189	my $wit = $sigil_for{$pxn->getAttribute( 'n' )}
190	push( @curr_wits, $wit ) unless $curr_wits[-1] eq $wit;
191	$seen_wits{$wit} += 1; # Keep track of a 'seen' count in case there is an a.c.
192	$reading_sigla = 1;
193	}
194	}
195	$apparatus->{ interpret( $curr_rdg, $lemma ) } = \@wits if $curr_rdg;
196	$apparatus->{ $lemma } = grep { $seen_wits{$_} == 0 } keys %seen_wits;
197
198	return $apparatus;
199	}
200
201
202	sub interpret {
203	# A utility function to change apparatus-ese into a full variant.
204	my( $reading, $lemma ) = @_;
205	return $reading if $reading eq $lemma;
206	my $oldreading = $reading;
207	$lemma =~ s/\s+[[:punct:]]+$//;
208	$reading =~ s/\s*\(?sic([\s\w.]+)?\)?$//;
209	my @words = split( /\s+/, $lemma );
210	if( $reading =~ /^(.*) praem.$/ ) {
211	$reading = "$1 $lemma";
212	} elsif( $reading =~ /^(.*) add.$/ ) {
213	$reading = "$lemma $1";
214	} elsif( $reading eq 'om.' ) {
215	$reading = '';
216	} elsif( $reading eq 'inv.' ) {
217	# Hope it is two words.
218	print STDERR "WARNING: want to invert a lemma that is not two words\n"
219	unless scalar( @words ) == 2;
220	$reading = join( ' ', reverse( @words ) );
221	} elsif( $reading eq 'iter.' ) {
222	# Repeat the lemma
223	$reading = "$lemma $lemma";
224	} elsif( $reading =~ /^(.) \.\.\. (.)$/ ) {
225	# The first and last N words captured should replace the first and
226	# last N words of the lemma.
227	my @begin = split( /\s+/, $1 );
228	my @end = split( /\s+/, $2 );
229	if( scalar( @begin ) + scalar ( @end ) > scalar( @words ) ) {
230	# Something is wrong and we can't do the splice.
231	print STDERR "ERROR: $lemma is too short to accommodate $oldreading\n";
232	} else {
233	splice( @words, 0, scalar @begin, @begin );
234	splice( @words, -(scalar @end), scalar @end, @end );
235	$reading = join( ' ', @words );
236	}
237	}
238	print STDERR "Interpreted $oldreading as $reading given $lemma\n";
239	return $reading;
240	}
241
242	=back
243
244	=head1 LICENSE
245
246	This package is free software and is provided "as is" without express
247	or implied warranty. You can redistribute it and/or modify it under
248	the same terms as Perl itself.
249
250	=head1 AUTHOR
251
252	Tara L Andrews, aurum@cpan.org
253
254	=cut
255
256	1;
257