From: tla Date: Tue, 13 Aug 2013 11:40:10 +0000 (+0200) Subject: Current state of transposition parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e9442e1c0df576ef6f95bfb2b110812a14ed3757;p=scpubgit%2Fstemmatology.git Current state of transposition parsing --- diff --git a/base/lib/Text/Tradition/Parser/CTE.pm b/base/lib/Text/Tradition/Parser/CTE.pm index 2926a4a..bd49559 100644 --- a/base/lib/Text/Tradition/Parser/CTE.pm +++ b/base/lib/Text/Tradition/Parser/CTE.pm @@ -38,6 +38,9 @@ sub parse { my( $tradition, $opts ) = @_; my $c = $tradition->collation; # Some shorthand + ## DEBUG/TEST + $opts->{interpret_transposition} = 1; + # First, parse the XML. my( $tei, $xpc ) = _remove_formatting( $opts ); return unless $tei; # we have already warned. @@ -104,9 +107,8 @@ sub parse { # Now we can parse the apparatus entries, and add the variant readings # to the graph. - foreach my $app_id ( keys %apps ) { - _add_readings( $c, $app_id ); + _add_readings( $c, $app_id, $opts ); } # Finally, add explicit witness paths, remove the base paths, and remove @@ -235,7 +237,7 @@ sub _append_tokens { } sub _add_readings { - my( $c, $app_id ) = @_; + my( $c, $app_id, $opts ) = @_; my $xn = $apps{$app_id}; # If the app is of type a1, it is an apparatus criticus. # If it is of type a2, it is an apparatus codicum and might not @@ -262,6 +264,7 @@ sub _add_readings { $tag =~ s/^\__APP_(.*)\__$/$1/; foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) { + my @witlist = split( /\s+/, $rdg->getAttribute( 'wit' ) ); my @text; foreach ( $rdg->childNodes ) { push( @text, _get_base( $_ ) ); @@ -269,39 +272,52 @@ sub _add_readings { my( $interpreted, $flag ) = ( '', undef ); if( @text ) { ( $interpreted, $flag ) = interpret( - join( ' ', map { $_->{'content'} } @text ), $lemma_str ); + join( ' ', map { $_->{'content'} } @text ), $lemma_str, $anchor, $opts ); } - next if( $interpreted eq $lemma_str ) && !$flag; # Reading is lemma. + next if( $interpreted eq $lemma_str ) && !keys %$flag; # Reading is lemma. my @rdg_nodes; + my @transp_nodes; if( $interpreted eq '#LACUNA#' ) { push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, is_lacuna => 1 } ) ); + } elsif( $flag->{'TR'} ) { + # Our reading is transposed to after the given string. Look + # down the collation base text and try to find it. + # The @rdg_nodes should remain blank here, so that the correct + # omission goes into the graph. + foreach my $w ( split( /\s+/, $interpreted ) ) { + my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, + text => $w } ); + push( @transp_nodes, $r ); + } + if( $anchor && @lemma ) { + _attach_transposition( $c, \@lemma, $anchor, \@transp_nodes, + \@witlist, $flag->{'TR'} ); + } } else { - if ( $flag && $flag eq 'START' + if ( $flag->{'START'} && $c->prior_reading( $app_id, $c->baselabel ) ne $c->start ) { # Add a lacuna for the witness start. push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, is_lacuna => 1 } ) ); - $flag = ''; } foreach my $w ( split( /\s+/, $interpreted ) ) { my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, text => $w } ); push( @rdg_nodes, $r ); } - if( $flag && $flag eq 'END' + if( $flag->{'END'} && $c->next_reading( $app_id, $c->baselabel ) ne $c->end ) { # Add a lacuna for the witness end. push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, is_lacuna => 1 } ) ); - $flag = ''; } } # For each listed wit, save the reading. - foreach my $wit ( split( /\s+/, $rdg->getAttribute( 'wit' ) ) ) { - $wit .= $flag if $flag; + foreach my $wit ( @witlist ) { + $wit .= '_ac' if $flag->{'AC'}; $wit_rdgs{$wit} = \@rdg_nodes; } @@ -347,6 +363,57 @@ sub _return_lemma { return @nodes; } +# Make a best-effort attempt to attach a transposition farther down the line. +# $lemmaseq contains the Reading objects of the lemma +# $anchor contains the point at which we should start scanning for a match +# $rdgseq contains the Reading objects of the transposed reading +# (should be identical to the lemma) +# $witlist contains the list of applicable witnesses +# $reftxt contains the text to match, after which the $rdgseq should go. +sub _attach_transposition { + my( $c, $lemmaseq, $anchor, $rdgseq, $witlist, $reftxt ) = @_; + my @refwords = split( /\s+/, $reftxt ); + my $checked = $c->reading( $anchor ); + my $found; + while( $checked ne $c->end && !$found ) { + my $next = $c->next_reading( $checked, $c->baselabel ); + if( $next->text eq $refwords[0] ) { + # See if the entire sequence of words matches. + $found = $next; + foreach my $w ( 1..$#refwords ) { + $found = $c->next_reading( $next, $c->baselabel ); + unless( $found->text eq $refwords[$w] ) { + $found = undef; + last; + } + } + } + $checked = $next; + } + if( $found ) { + # The $found variable should now contain the reading after which we + # should stick the transposition. + my $fnext = $c->next_reading( $found, $c->baselabel ); + my $aclabel = $c->ac_label; + foreach my $wit_id ( @$witlist ) { + my $witstr = _get_sigil( $wit_id, $aclabel ); + _add_wit_path( $c, $rdgseq, $found->id, $fnext->id, $witstr ); + } + # ...and add the transposition relationship between lemma and rdgseq. + if( @$lemmaseq == @$rdgseq ) { + foreach my $i ( 0..$#{$lemmaseq} ) { + $c->add_relationship( $lemmaseq->[$i], $rdgseq->[$i], + { type => 'transposition', annotation => 'Detected by CTE' } ); + } + } else { + say STDERR "ERROR: lemma and transposed sequence different lengths?!" + } + } else { + say STDERR "WARNING: Unable to find $reftxt in base text for transposition"; + map { $c->del_reading( $_ ) } @$rdgseq; + } +} + =head2 interpret( $reading, $lemma ) Given a string in $reading and a corresponding lemma in $lemma, interpret what @@ -357,11 +424,11 @@ marking transpositions, prefixed or suffixed words, and the like. sub interpret { # A utility function to change apparatus-ese into a full variant. - my( $reading, $lemma ) = @_; + my( $reading, $lemma, $anchor, $opts ) = @_; return $reading if $reading eq $lemma; my $oldreading = $reading; # $lemma =~ s/\s+[[:punct:]]+$//; - my $flag; # In case of p.c. indications + my $flag = {}; # To pass back extra info about the interpretation my @words = split( /\s+/, $lemma ); $reading =~ s/[[:punct:]]?\bsic\b([[:punct:]]+)?//g; if( $reading =~ /^(.*) praem.$/ ) { @@ -394,7 +461,14 @@ sub interpret { } elsif( $reading eq 'in marg.' ) { # There was nothing before a correction. $reading = ''; - $flag = '_ac'; + $flag->{'AC'} = 1; + } elsif( $reading =~ /^(.*?)\s*\(?sic([\s\w!.]+)?\)?$/ ) { + # Discard any 'sic' notation; indeed, indeed. + $reading = $1; + if( $reading =~ /^(\W+)$/ ) { + # Nothing left but punctuation, so effectively it's the lemma. + $reading = $lemma; + } } elsif( $reading =~ /^(.*) \.\.\. (.*)$/ ) { # The first and last N words captured should replace the first and # last N words of the lemma. @@ -408,17 +482,20 @@ sub interpret { splice( @words, -(scalar @end), scalar @end, @end ); $reading = join( ' ', @words ); } + } elsif( $opts->{interpret_transposition} && + $reading =~ /^\s*post\s*(.*?)\s*(tr(ans(p)?)?)?\.?\s*$/ ) { + # Try to deal with transposed readings + ## DEBUG + say STDERR "Will attempt transposition: $reading at $anchor"; + $reading = $lemma; + $flag->{'TR'} = $1; + # Look for processed witStart and witEnd tags } elsif( $reading =~ /^\#START\#\s*(.*)$/ ) { $reading = $1; - $flag = 'START'; + $flag->{'START'} = 1; } elsif( $reading =~ /^(.*?)\s*\#END\#$/ ) { $reading = $1; - $flag = 'END'; - } - if( $oldreading ne $reading || $flag || $oldreading =~ /\./ ) { - my $int = $reading; - $int .= " ($flag)" if $flag; - # say STDERR "Interpreted $oldreading as $int given $lemma"; + $flag->{'END'} = 1; } return( $reading, $flag ); }