From: Tara L Andrews Date: Mon, 19 Aug 2013 21:40:29 +0000 (+0200) Subject: INCOMPLETE overhaul to CTE parsing core, issue #6 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c9f61c7056e267a816de9377efeee8b538b5c3d8;p=scpubgit%2Fstemmatology.git INCOMPLETE overhaul to CTE parsing core, issue #6 --- diff --git a/base/lib/Text/Tradition/Parser/CTE.pm b/base/lib/Text/Tradition/Parser/CTE.pm index fa0f286..b28a68b 100644 --- a/base/lib/Text/Tradition/Parser/CTE.pm +++ b/base/lib/Text/Tradition/Parser/CTE.pm @@ -271,7 +271,6 @@ sub _add_readings { # Get the lemma, which is all the readings between app and anchor, # excluding other apps or anchors. my @lemma = _return_lemma( $c, $app_id, $anchor ); - my $lemma_str = join( ' ', map { $_->text } grep { !$_->is_ph } @lemma ); # For each reading, send its text to 'interpret' along with the lemma, # and then save the list of witnesses that these tokens belong to. @@ -281,82 +280,54 @@ sub _add_readings { $tag =~ s/^\__APP_(.*)\__$/$1/; foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) { - my @witlist = split( /\s+/, $rdg->getAttribute( 'wit' ) ); - my @text; - foreach ( $rdg->childNodes ) { - push( @text, _get_base( $_ ) ); - } - my( $interpreted, $flag ) = ( '', undef ); - if( @text ) { - ( $interpreted, $flag ) = interpret( - join( ' ', map { $_->{'content'} } @text ), $lemma_str, $anchor, $opts ); - } - next if( $interpreted eq $lemma_str ) && !keys %$flag; # Reading is lemma. - - my @rdg_nodes; - if( $interpreted eq '#LACUNA#' ) { - push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, - is_lacuna => 1 } ) ); - } elsif( $flag->{'TR'} ) { - # Our reading is transposed to after the given string. Look - # down the collation base text and try to find it. - # The @rdg_nodes should remain blank here, so that the correct - # omission goes into the graph. - my @transp_nodes; - foreach my $w ( split( /\s+/, $interpreted ) ) { - my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, - text => $w } ); - push( @transp_nodes, $r ); - } - if( $anchor && @lemma ) { - my $success = _attach_transposition( $c, \@lemma, $anchor, - \@transp_nodes, \@witlist, $flag->{'TR'} ); - unless( $success ) { - # If we didn't manage to insert the displaced reading, - # then restore it here rather than silently deleting it. - push( @rdg_nodes, @transp_nodes ); - } - } - } else { - foreach my $w ( split( /\s+/, $interpreted ) ) { - my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, - text => $w } ); - push( @rdg_nodes, $r ); - } - } - - # For each listed wit, save the reading. - # If an A.C. or P.C. reading is implied rather than explicitly noted, - # this is where it will be dealt with. - foreach my $wit ( @witlist ) { - $wit .= '_ac' if $flag->{'AC'}; - $wit_rdgs{$wit} = \@rdg_nodes; - # If the PC flag is set, there is a corresponding AC that - # follows the lemma and has to be explicitly declared. - if( $flag->{'PC'} ) { - $wit_rdgs{$wit.'_ac'} = \@lemma; - } - } - + # Get the relevant witnesses. + my @witlist = map { $sigil_for{$_} } + split( /\s+/, $rdg->getAttribute( 'wit' ) ); + # Does the reading have an ID? If so it probably has a witDetail # attached, and we need to read it. If an A.C. or P.C. reading is # declared explicity, this is where it will be dealt with. if( $rdg->hasAttribute( 'xml:id' ) ) { - warn "Witdetail on meta reading" if $flag; # this could get complicated. my $rid = $rdg->getAttribute( 'xml:id' ); my $xpc = XML::LibXML::XPathContext->new( $xn ); my @details = $xpc->findnodes( './witDetail[@target="'.$rid.'"]' ); foreach my $d ( @details ) { - _parse_wit_detail( $d, \%wit_rdgs, \@lemma ); + @witlist = _parse_wit_detail( $d, @witlist ); } } + + # Now we have our list of relevant witnesses for the reading, annotated + # with AC or PC if applicable. Interpret the reading in light of the + # lemma, anything we already have for the witness, etc. + # If an A.C. or P.C. reading is implied rather than explicitly noted, + # this is where it will be dealt with. + + foreach my $wit ( @witlist ) { + # The lemma for this witness is either the actual lemma, or the + # reading that we have already determined. + my $hascorr; + if( $wit =~ /^(.*)_pc$/ ) { + $wit = $1; + $hascorr = 1; + } + ## TODO think through ac/pc interaction from these specs + my $wit_lemma = $wit_rdgs{$wit} || \@lemma; + my @rdg_nodes; + ( $wit, @rdg_nodes )= _read_reading( $rdg, $wit_lemma, $wit, + $tag, $ctr, $anchor, $opts ); + $wit_rdgs{$wit} = \@rdg_nodes; + # If the PC flag is set, there is a corresponding AC that + # follows the lemma and has to be explicitly declared. + if( $flag->{'PC'} ) { + $wit_rdgs{$wit.'_ac'} = $wit_lemma; + } + } } # Now collate the variant readings, since it is not done for us. collate_variants( $c, \@lemma, values %wit_rdgs ); - # Now add the witness paths for each reading. If we don't have an anchor - # (e.g. with an initial witStart) there was no witness path to speak of. + # Now add the witness paths for each reading. foreach my $wit_id ( keys %wit_rdgs ) { my $witstr = _get_sigil( $wit_id, $c->ac_label ); my $rdg_list = $wit_rdgs{$wit_id}; @@ -378,6 +349,93 @@ sub _return_lemma { return @nodes; } +sub _parse_wit_detail { + my $detail = shift; + my %wits; + map { $wits{$_} = $_ } @_; + my @changewits = map { $sigil_for{$_} } + split( /\s+/, $detail->getAttribute( 'wit' ) ); + my $content = $detail->textContent; + if( $content =~ /^a\.?\s*c(orr)?\.$/ ) { + # Replace the key in the $readings hash + map { $wits{$_} = $_.'_ac' } @changewits; + } elsif( $content =~ /^p\.?\s*c(orr)?\.$/ || $content =~ /^s\.?\s*l\.$/ + || $content =~ /^in marg\.?$/ ) { + # If no key for the wit a.c. exists, add one pointing to the lemma + map { $wits{$_} = $_.'_pc' } @changewits; + } else { #...not sure what it is? + say STDERR "WARNING: Unrecognized sigil annotation $content"; + } + my @newwits = sort values %wits; + return @newwits; +} + +sub _read_reading { + my( $rdg, $lemma, $witness, $tag, $ctr, $anchor, $opts ) = @_; + + # Get the text of the lemma. + my $lemma_str = join( ' ', map { $_->text } grep { !$_->is_ph } @$lemma ); + + my @text; + foreach ( $rdg->childNodes ) { + push( @text, _get_base( $_ ) ); + } + my( $interpreted, $flag ) = ( '', undef ); + if( @text ) { + ( $interpreted, $flag ) = interpret( + join( ' ', map { $_->{'content'} } @text ), $lemma_str, $anchor, $opts ); + } + if( ( $interpreted eq $lemma_str || $interpreted eq '__LEMMA__' ) + && !keys %$flag ) { + # The reading is the lemma. Pass it back. + return( $wit, @$lemma ); + } + + my @rdg_nodes; + if( $interpreted eq '#LACUNA#' ) { + push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, + is_lacuna => 1 } ) ); + } elsif( $flag->{'TR'} ) { + # Our reading is transposed to after the given string. Look + # down the collation base text and try to find it. + # The @rdg_nodes should remain blank here, so that the correct + # omission goes into the graph. + my @transp_nodes; + foreach my $w ( split( /\s+/, $interpreted ) ) { + my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, + text => $w } ); + push( @transp_nodes, $r ); + } + if( $anchor && @lemma ) { + my $success = _attach_transposition( $c, $lemma, $anchor, + \@transp_nodes, $witlist, $flag->{'TR'} ); + unless( $success ) { + # If we didn't manage to insert the displaced reading, + # then restore it here rather than silently deleting it. + push( @rdg_nodes, @transp_nodes ); + } + } + } else { + foreach my $w ( split( /\s+/, $interpreted ) ) { + if( $w eq '__LEMMA__' ) { + push( @rdg_nodes, @lemma ); + } else { + my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, + text => $w } ); + push( @rdg_nodes, $r ); + } + } + } + + # Note if the interpretation said that we're dealing with a correction. + if( $flag->{'AC'} ) { + $wit .= '_ac'; + } elsif( $flag->{'PC'} ) { + $wit .= '_pc'; + } + return( $wit, @rdg_nodes ); +} + # Make a best-effort attempt to attach a transposition farther down the line. # $lemmaseq contains the Reading objects of the lemma # $anchor contains the point at which we should start scanning for a match @@ -450,11 +508,16 @@ sub interpret { # Discard any 'sic' notation - that rather goes without saying. $reading =~ s/([[:punct:]]+)?sic([[:punct:]]+)?//g; + # Look to see if there is an implied add or praem masked by the XML. + # If so, undo it for purposes of reading identity. + $reading =~ s/^$lemma\b/__LEMMA__/; + $reading =~ s/\b$lemma$/__LEMMA__/; + # Now look for common jargon. if( $reading =~ /^(.*) praem.$/ || $reading =~ /^praem\. (.*)$/ ) { - $reading = "$1 $lemma"; + $reading = "$1 __LEMMA__"; } elsif( $reading =~ /^(.*) add.$/ || $reading =~ /^add\. (.*)$/ ) { - $reading = "$lemma $1"; + $reading = "__LEMMA__ $1"; } elsif( $reading =~ /locus [uv]acuus/ || $reading eq 'def.' || $reading eq 'illeg.' @@ -471,7 +534,7 @@ sub interpret { $reading = join( ' ', reverse( @words ) ); } elsif( $reading =~ /^iter(\.|at)$/ ) { # Repeat the lemma - $reading = "$lemma $lemma"; + $reading = "__LEMMA__ $lemma"; } elsif( $reading =~ /^(.*?)\s*\(?in marg\.\)?$/ ) { $reading = $1; if( $reading ) { @@ -501,32 +564,14 @@ sub interpret { # Try to deal with transposed readings ## DEBUG say STDERR "Will attempt transposition: $reading at $anchor"; + # Copy the lemma into the reading string for insertion later + # in the text. $reading = $lemma; $flag->{'TR'} = $+{lem}; } return( $reading, $flag ); } -sub _parse_wit_detail { - my( $detail, $readings, $lemma ) = @_; - my $wit = $detail->getAttribute( 'wit' ); - my $content = $detail->textContent; - if( $content =~ /^a\.?\s*c(orr)?\.$/ ) { - # Replace the key in the $readings hash - my $rdg = delete $readings->{$wit}; - $readings->{$wit.'_ac'} = $rdg; - $has_ac{$sigil_for{$wit}} = 1; - } elsif( $content =~ /^p\.?\s*c(orr)?\.$/ || $content =~ /^s\.?\s*l\.$/ ) { - # If no key for the wit a.c. exists, add one pointing to the lemma - unless( exists $readings->{$wit.'_ac'} ) { - $readings->{$wit.'_ac'} = $lemma; - } - $has_ac{$sigil_for{$wit}} = 1; - } else { #...not sure what it is? - say STDERR "WARNING: Unrecognized sigil annotation $content"; - } -} - sub _add_lacunae { my( $c, @app_id ) = @_; # Go through the apparatus entries in order, noting where to start and stop our