use strict;
use warnings;
use Module::Load;
-use Algorithm::Diff;
+use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
=head1 NAME
=head1 DESCRIPTION
For an overview of the package, see the documentation for the
-Text::Tradition::Graph module.
+Text::Tradition module.
This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
=item B<parse>
-parse( $graph, %opts );
+parse( $graph, $opts );
-Takes an initialized graph and a set of options, which must include:
+Takes an initialized graph and a hashref of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.
=cut
sub parse {
- my( $tradition, %opts ) = @_;
+ my( $tradition, $opts ) = @_;
- my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
+ my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
load( $format_mod );
- my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
- merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
+ # TODO Handle a string someday if we ever have a format other than KUL
+ my @apparatus_entries = $format_mod->can('read')->( $opts );
+ merge_base( $tradition->collation, $opts, @apparatus_entries );
}
=item B<merge_base>
=cut
-my $SHORTEND; # Debug var - set this to limit the number of lines parsed
+my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
my %base_text_index;
my $edits_required = {};
# edits_required -> wit -> [ { start_idx, end_idx, items } ]
sub merge_base {
- my( $collation, $base_file, @app_entries ) = @_;
- my @base_line_starts = read_base( $base_file, $collation );
+ my( $collation, $opts, @app_entries ) = @_;
+ my @base_line_starts = read_base( $opts->{'base'}, $collation );
my %all_witnesses;
- my @unwitnessed_lemma_nodes;
foreach my $app ( @app_entries ) {
- my( $line, $num ) = split( /\./, $app->{_id} );
- # DEBUG with a short graph
- last if $SHORTEND && $line > $SHORTEND;
- # DEBUG for problematic entries
- my $scrutinize = '';
- my $first_line_reading = $base_line_starts[ $line ];
- my $too_far = $base_line_starts[ $line+1 ];
-
- my $lemma = $app->{rdg_0};
- my $seq = 1;
- # Is this the Nth occurrence of this reading in the line?
- if( $lemma =~ s/(_)?(\d)$// ) {
- $seq = $2;
- }
- my @lemma_words = split( /\s+/, $lemma );
-
- # Now search for the lemma words within this line.
- my $lemma_start = $first_line_reading;
- my $lemma_end;
- my %seen;
- while( $lemma_start ne $too_far ) {
- # Loop detection
- if( $seen{ $lemma_start->name() } ) {
- warn "Detected loop at " . $lemma_start->name() .
- ", ref $line,$num";
- last;
- }
- $seen{ $lemma_start->name() } = 1;
-
- # Try to match the lemma.
- my $unmatch = 0;
- print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
- $lemma_words[0] . "...\n"
- if "$line.$num" eq $scrutinize;
- if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
- # Skip it if we need a match that is not the first.
- if( --$seq < 1 ) {
- # Now we have to compare the rest of the words here.
- if( scalar( @lemma_words ) > 1 ) {
- my $next_reading =
- $collation->next_reading( $lemma_start );
- foreach my $w ( @lemma_words[1..$#lemma_words] ) {
- printf STDERR "Now matching %s against %s\n",
- cmp_str($next_reading), $w
- if "$line.$num" eq $scrutinize;
- if( $w ne cmp_str($next_reading) ) {
- $unmatch = 1;
- last;
- } else {
- $lemma_end = $next_reading;
- $next_reading =
- $collation->next_reading( $lemma_end );
- }
- }
- } else {
- $lemma_end = $lemma_start;
- }
- } else {
- $unmatch = 1;
- }
- }
- last unless ( $unmatch || !defined( $lemma_end ) );
- $lemma_end = undef;
- $lemma_start = $collation->next_reading( $lemma_start );
- }
-
- unless( $lemma_end ) {
- warn "No match found for @lemma_words at $line.$num";
- next;
- }
-
- # Now we have found the lemma; we will record an 'edit', in
- # terms of a splice operation, for each subsequent reading.
- # We also note which witnesses take the given edit.
-
- my @lemma_set = $collation->reading_sequence( $lemma_start,
- $lemma_end );
- my @reading_sets = [ @lemma_set ];
-
- # For each reading that is not rdg_0, we create the variant
- # reading nodes, and store the range as an edit operation on
- # the base text.
- my $variant_objects;
- my %pc_seen; # Keep track of mss with explicit post-corr data
- foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
- my @mss = grep { $app->{$_} eq $k } keys( %$app );
- push( @unwitnessed_lemma_nodes, @lemma_set )
- if !@mss && $k eq 'rdg_0';
-
- # Keep track of what witnesses we have seen.
- @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
- # Keep track of which witnesses bear corrected readings here.
- foreach my $m ( @mss ) {
- my $base = _is_post_corr( $m );
- next unless $base;
- $pc_seen{$base} = 1;
- }
- next if $k eq 'rdg_0';
-
- # TODO don't hardcode the reading split operation
- my @variant = split( /\s+/, $app->{$k} );
- @variant = () if $app->{$k} eq '/'; # This is an omission.
-
- # Make the variant into a set of readings.
- my @variant_readings;
- my $ctr = 0;
- foreach my $vw ( @variant ) {
- my $vwname = "$k/$line.$num.$ctr"; $ctr++;
- my $vwreading = $collation->add_reading( $vwname );
- $vwreading->text( $vw );
- push( @variant_readings, $vwreading );
- }
-
- $variant_objects->{$k} = { 'mss' => \@mss,
- 'reading' => \@variant_readings,
- };
- push( @reading_sets, \@variant_readings );
- }
-
- # Now collate and collapse the identical readings within the
- # collated sets. Modifies the reading sets that were passed.
- collate_variants( $collation, @reading_sets );
-
- # TODO Here would be a very good place to set up relationships
- # between the nodes and the lemma.
- set_relationships( $app, \@lemma_set, $variant_objects );
-
- # Now create the splice-edit objects that will be used
- # to reconstruct each witness.
-
- foreach my $rkey ( keys %$variant_objects ) {
- # Object is argument list for splice, so:
- # offset, length, replacements
- my $edit_object = [ $lemma_start->name,
- scalar( @lemma_set ),
- $variant_objects->{$rkey}->{reading} ];
- foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
- # Is this a p.c. entry?
- my $base = _is_post_corr( $ms );
- if( $base ) { # this is a post-corr witness
- my $pc_key = $base . "_post";
- _add_hash_entry( $edits_required, $pc_key, $edit_object );
- } else { # this is an ante-corr witness
- my $pc_key = $ms . "_post";
- _add_hash_entry( $edits_required, $ms, $edit_object );
- unless( $pc_seen{$ms} ) {
- # If this witness carries no correction, add this
- # same object to its post-corrected state.
- _add_hash_entry( $edits_required, $pc_key,
- $edit_object );
- }
- }
- }
- }
+ my( $line, $num ) = split( /\./, $app->{_id} );
+ # DEBUG with a short graph
+ last if $SHORTEND && $line > $SHORTEND;
+ # DEBUG for problematic entries
+ my $scrutinize = '';
+ my $first_line_reading = $base_line_starts[ $line ];
+ my $too_far = $base_line_starts[ $line+1 ];
+
+ my $lemma = $app->{rdg_0};
+ my $seq = 1;
+ # Is this the Nth occurrence of this reading in the line?
+ if( $lemma =~ s/(_)?(\d)$// ) {
+ $seq = $2;
+ }
+ my @lemma_words = split( /\s+/, $lemma );
+
+ # Now search for the lemma words within this line.
+ my $lemma_start = $first_line_reading;
+ my $lemma_end;
+ my %seen;
+ while( $lemma_start ne $too_far ) {
+ # Loop detection
+ if( $seen{ $lemma_start->id() } ) {
+ warn "Detected loop at " . $lemma_start->id() .
+ ", ref $line,$num";
+ last;
+ }
+ $seen{ $lemma_start->id() } = 1;
+
+ # Try to match the lemma.
+ my $unmatch = 0;
+ print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
+ $lemma_words[0] . "...\n"
+ if "$line.$num" eq $scrutinize;
+ if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
+ # Skip it if we need a match that is not the first.
+ if( --$seq < 1 ) {
+ # Now we have to compare the rest of the words here.
+ if( scalar( @lemma_words ) > 1 ) {
+ my $next_reading =
+ $collation->next_reading( $lemma_start );
+ foreach my $w ( @lemma_words[1..$#lemma_words] ) {
+ printf STDERR "Now matching %s against %s\n",
+ cmp_str($next_reading), $w
+ if "$line.$num" eq $scrutinize;
+ if( $w ne cmp_str($next_reading) ) {
+ $unmatch = 1;
+ last;
+ } else {
+ $lemma_end = $next_reading;
+ $next_reading =
+ $collation->next_reading( $lemma_end );
+ }
+ }
+ } else {
+ $lemma_end = $lemma_start;
+ }
+ } else {
+ $unmatch = 1;
+ }
+ }
+ last unless ( $unmatch || !defined( $lemma_end ) );
+ $lemma_end = undef;
+ $lemma_start = $collation->next_reading( $lemma_start );
+ }
+
+ unless( $lemma_end ) {
+ warn "No match found for @lemma_words at $line.$num";
+ next;
+ }
+
+ # Now we have found the lemma; we will record an 'edit', in
+ # terms of a splice operation, for each subsequent reading.
+ # We also note which witnesses take the given edit.
+
+ my @lemma_set = $collation->reading_sequence( $lemma_start,
+ $lemma_end );
+ my @reading_sets = [ @lemma_set ];
+
+ # For each reading that is not rdg_0, we create the variant
+ # reading nodes, and store the range as an edit operation on
+ # the base text.
+ my $variant_objects;
+ my %pc_seen; # Keep track of mss with explicit post-corr data
+ foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
+ my @mss = grep { $app->{$_} eq $k } keys( %$app );
+
+ # Keep track of what witnesses we have seen.
+ @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
+ # Keep track of which witnesses bear corrected readings here.
+ foreach my $m ( @mss ) {
+ my $base = _is_post_corr( $m );
+ next unless $base;
+ $pc_seen{$base} = 1;
+ }
+ next if $k eq 'rdg_0';
+
+ # Parse the variant into reading tokens.
+ # TODO don't hardcode the reading split operation
+ my @variant = split( /\s+/, $app->{$k} );
+ @variant = () if $app->{$k} eq '/'; # This is an omission.
+
+ my @variant_readings;
+ my $ctr = 0;
+ foreach my $vw ( @variant ) {
+ my $vwname = "$k/$line.$num.$ctr"; $ctr++;
+ my $vwreading = $collation->add_reading( {
+ 'id' => $vwname,
+ 'text' => $vw } );
+ push( @variant_readings, $vwreading );
+ }
+
+ $variant_objects->{$k} = { 'mss' => \@mss,
+ 'reading' => \@variant_readings,
+ };
+ push( @reading_sets, \@variant_readings );
+ }
+
+ # Now collate and collapse the identical readings within the
+ # collated sets. Modifies the reading sets that were passed.
+ collate_variants( $collation, @reading_sets );
+
+ # Record any stated relationships between the nodes and the lemma.
+ set_relationships( $collation, $app, \@lemma_set, $variant_objects );
+
+ # Now create the splice-edit objects that will be used
+ # to reconstruct each witness.
+
+ foreach my $rkey ( keys %$variant_objects ) {
+ # Object is argument list for splice, so:
+ # offset, length, replacements
+ my $edit_object = [ $lemma_start->id,
+ scalar( @lemma_set ),
+ $variant_objects->{$rkey}->{reading} ];
+ foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
+ # Is this a p.c. entry?
+ my $base = _is_post_corr( $ms );
+ if( $base ) { # this is a post-corr witness
+ my $pc_key = $base . "_post";
+ add_hash_entry( $edits_required, $pc_key, $edit_object );
+ } else { # this is an ante-corr witness
+ my $pc_key = $ms . "_post";
+ add_hash_entry( $edits_required, $ms, $edit_object );
+ unless( $pc_seen{$ms} ) {
+ # If this witness carries no correction, add this
+ # same object to its post-corrected state.
+ add_hash_entry( $edits_required, $pc_key,
+ $edit_object );
+ }
+ }
+ }
+ }
} # Finished going through the apparatus entries
# Now make the witness objects, and create their text sequences
foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
- print STDERR "Creating witness $w\n";
- my $witness_obj = $collation->tradition->add_witness( sigil => $w );
- my $debug = $w eq 'Vb10';
- my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
- my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
- if exists( $edits_required->{$w."_post"} );
-
- my @repeated = _check_for_repeated( @ante_corr_seq );
- warn "Repeated elements @repeated in $w a.c."
- if @repeated;
- @repeated = _check_for_repeated( @post_corr_seq );
- warn "Repeated elements @repeated in $w p.c."
- if @repeated;
-
- # Now save these paths in my witness object
- if( @post_corr_seq ) {
- $witness_obj->path( \@post_corr_seq );
- $witness_obj->uncorrected_path( \@ante_corr_seq );
- } else {
- $witness_obj->path( \@ante_corr_seq );
- }
+ print STDERR "Creating witness $w\n";
+ my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+ my $debug; # = $w eq 'Vb11';
+ my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
+ my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
+ if exists( $edits_required->{$w."_post"} );
+
+ my @repeated = check_for_repeated( @ante_corr_seq );
+ warn "Repeated elements @repeated in $w a.c."
+ if @repeated;
+ @repeated = check_for_repeated( @post_corr_seq );
+ warn "Repeated elements @repeated in $w p.c."
+ if @repeated;
+
+ # Now save these paths in my witness object
+ if( @post_corr_seq ) {
+ $witness_obj->path( \@post_corr_seq );
+ $witness_obj->uncorrected_path( \@ante_corr_seq );
+ } else {
+ $witness_obj->path( \@ante_corr_seq );
+ }
}
# Now remove our 'base text' edges, which is to say, the only
- # ones we have created so far. Also remove any nodes that didn't
- # appear in any witnesses.
+ # ones we have created so far. Also remove any unwitnessed
+ # lemma nodes (TODO unless we are treating base as witness)
foreach ( $collation->paths() ) {
- $collation->del_path( $_ );
+ $collation->del_path( $_, $collation->baselabel );
}
- foreach( @unwitnessed_lemma_nodes ) {
- $collation->del_reading( $_ );
- }
-
- # Now walk paths and calculate positions.
- my @common_readings =
- $collation->make_witness_paths();
- $collation->calculate_positions( @common_readings );
-}
-sub _check_for_repeated {
- my @seq = @_;
- my %unique;
- my @repeated;
- foreach ( @seq ) {
- if( exists $unique{$_->name} ) {
- push( @repeated, $_->name );
- } else {
- $unique{$_->name} = 1;
+ ### HACKY HACKY Do some one-off path corrections here.
+ if( $opts->{'input'} eq 'KUL' ) {
+ require 'data/boodts/s158.HACK';
+ KUL::HACK::pre_path_hack( $collation );
}
- }
- return @repeated;
+
+ # Now walk paths and calculate positional rank.
+ $collation->make_witness_paths();
+ # Now delete any orphaned readings.
+ foreach my $r ( $collation->sequence->isolated_vertices ) {
+ print STDERR "Deleting unconnected reading $r / " .
+ $collation->reading( $r )->text . "\n";
+ $collation->del_reading( $r );
+ }
+
+ KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
+ # Have to check relationship validity at this point, because before that
+ # we had no paths.
+# foreach my $rel ( $collation->relationships ) {
+# next unless $rel->equal_rank;
+# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
+# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
+# $rel->type, $rel->from->id, $rel->to->id );
+# }
+# }
+ # $collation->calculate_ranks();
}
=item B<read_base>
# This array gives the first reading for each line. We put the
# common starting point in line zero.
- my $last_reading = $collation->start();
- $base_text_index{$last_reading->name} = 0;
+ my $last_reading = $collation->start;
+ $base_text_index{$last_reading->id} = 0;
my $lineref_array = [ $last_reading ]; # There is no line zero.
open( BASE, $base_file ) or die "Could not open file $base_file: $!";
my $i = 1;
while(<BASE>) {
- # Make the readings, and connect them up for the base, but
- # also save the first reading of each line in an array for the
- # purpose.
- # TODO use configurable reading separator
- chomp;
- my @words = split;
- my $started = 0;
- my $wordref = 0;
- my $lineref = scalar @$lineref_array;
- last if $SHORTEND && $lineref > $SHORTEND;
- foreach my $w ( @words ) {
- my $readingref = join( ',', $lineref, ++$wordref );
- my $reading = $collation->add_reading( $readingref );
- $reading->text( $w );
- unless( $started ) {
- push( @$lineref_array, $reading );
- $started = 1;
- }
- # Add edge paths in the graph, for easier tracking when
- # we start applying corrections. These paths will be
- # removed when we're done.
- my $path = $collation->add_path( $last_reading, $reading,
- $collation->baselabel );
- $last_reading = $reading;
-
- # Note an array index for the reading, for later correction splices.
- $base_text_index{$readingref} = $i++;
- }
+ # Make the readings, and connect them up for the base, but
+ # also save the first reading of each line in an array for the
+ # purpose.
+ # TODO use configurable reading separator
+ chomp;
+ my @words = split;
+ my $started = 0;
+ my $wordref = 0;
+ my $lineref = scalar @$lineref_array;
+ last if $SHORTEND && $lineref > $SHORTEND;
+ foreach my $w ( @words ) {
+ my $readingref = join( ',', $lineref, ++$wordref );
+ my $reading = $collation->add_reading( { id => $readingref, text => $w } );
+ unless( $started ) {
+ push( @$lineref_array, $reading );
+ $started = 1;
+ }
+ # Add edge paths in the graph, for easier tracking when
+ # we start applying corrections. These paths will be
+ # removed when we're done.
+ my $path = $collation->add_path( $last_reading, $reading,
+ $collation->baselabel );
+ $last_reading = $reading;
+
+ # Note an array index for the reading, for later correction splices.
+ $base_text_index{$readingref} = $i++;
+ }
}
close BASE;
# Ending point for all texts
- my $endpoint = $collation->add_reading( '#END#' );
- $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
- push( @$lineref_array, $endpoint );
- $base_text_index{$endpoint->name} = $i;
+ $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
+ push( @$lineref_array, $collation->end );
+ $base_text_index{$collation->end->id} = $i;
return( @$lineref_array );
}
-=item B<collate_variants>
-
-collate_variants( $collation, @reading_ranges )
-
-Given a set of readings in the form
-( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
-walks through each to identify those readings that are identical. The
-collation is a Text::Tradition::Collation object; the elements of
-@readings are Text::Tradition::Collation::Reading objects that appear
-on the collation graph.
-
-TODO: Handle collapsed and non-collapsed transpositions.
-
-=cut
-
-sub collate_variants {
- my( $collation, @reading_sets ) = @_;
-
- # Two different ways to do this, depending on whether we want
- # transposed reading nodes to be merged into one (producing a
- # nonlinear, bidirectional graph) or not (producing a relatively
- # linear, unidirectional graph.)
- return $collation->linear ? collate_linearly( @_ )
- : collate_nonlinearly( @_ );
-}
-
-sub collate_linearly {
- my( $collation, $lemma_set, @variant_sets ) = @_;
-
- my @unique;
- push( @unique, @$lemma_set );
- while( @variant_sets ) {
- my $variant_set = shift @variant_sets;
- # Use diff to do this job
- my $diff = Algorithm::Diff->new( \@unique, $variant_set,
- {'keyGen' => \&_collation_hash} );
- my @new_unique;
- my %merged;
- while( $diff->Next ) {
- if( $diff->Same ) {
- # merge the nodes
- my @l = $diff->Items( 1 );
- my @v = $diff->Items( 2 );
- foreach my $i ( 0 .. $#l ) {
- if( !$merged{$l[$i]->name} ) {
- $collation->merge_readings( $l[$i], $v[$i] );
- $merged{$l[$i]->name} = 1;
- } else {
- print STDERR "Would have double merged " . $l[$i]->name . "\n";
- }
- }
- # splice the lemma nodes into the variant set
- my( $offset ) = $diff->Get( 'min2' );
- splice( @$variant_set, $offset, scalar( @l ), @l );
- push( @new_unique, @l );
- } else {
- # Keep the old unique readings
- push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
- # Add the new readings to the 'unique' list
- push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
- }
- }
- @unique = @new_unique;
- }
-}
-
-sub collate_nonlinearly {
- my( $collation, $lemma_set, @variant_sets ) = @_;
-
- my @unique;
- push( @unique, @$lemma_set );
- while( @variant_sets ) {
- my $variant_set = shift @variant_sets;
- # Simply match the first reading that carries the same word, so
- # long as that reading has not yet been used to match another
- # word in this variant. That way lies loopy madness.
- my @distinct;
- my %merged;
- foreach my $idx ( 0 .. $#{$variant_set} ) {
- my $vw = $variant_set->[$idx];
- my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
- my $matched;
- if( @same ) {
- foreach my $i ( 0 .. $#same ) {
- unless( $merged{$same[$i]->name} ) {
- print STDERR sprintf( "Merging %s into %s\n",
- $vw->name,
- $same[$i]->name );
- $collation->merge_readings( $same[$i], $vw );
- $merged{$same[$i]->name} = 1;
- $matched = $i;
- $variant_set->[$idx] = $same[$i];
- }
- }
- }
- unless( @same && defined($matched) ) {
- push( @distinct, $vw );
- }
- }
- push( @unique, @distinct );
- }
-}
-
-
-
-sub _collation_hash {
- my $node = shift;
- return cmp_str( $node );
-}
-
sub set_relationships {
- my( $app, $lemma, $variants ) = @_;
+ my( $collation, $app, $lemma, $variants ) = @_;
foreach my $rkey ( keys %$variants ) {
- my $var = $variants->{$rkey}->{'reading'};
- my $typekey = sprintf( "_%s_type", $rkey );
- my $type = $app->{$typekey};
-
- # Transposition: look for nodes with the same label but different IDs
- # and mark them as transposed-identical.
-
- # Lexical / Grammatical / Spelling: look for non-identical nodes.
- # Need to work out how to handle many-to-many mapping.
+ my $var = $variants->{$rkey}->{'reading'};
+ my $type = $app->{sprintf( "_%s_type", $rkey )};
+ my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
+ my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
+
+ my %rel_options = ();
+ $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
+ $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
+
+ if( $type =~ /^(inv|tr|rep)$/i ) {
+ # Transposition or repetition: look for nodes with the
+ # same label but different IDs and mark them.
+ $type = 'repetition' if $type =~ /^rep/i;
+ $rel_options{'type'} = $type;
+ $rel_options{'equal_rank'} = undef;
+ my %labels;
+ foreach my $r ( @$lemma ) {
+ $labels{cmp_str( $r )} = $r;
+ }
+ foreach my $r( @$var ) {
+ if( exists $labels{$r->text} &&
+ $r->id ne $labels{$r->text}->id ) {
+ if( $type eq 'repetition' ) {
+ # Repetition
+ $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
+ } else {
+ # Transposition
+ $r->set_identical( $labels{$r->text} );
+ }
+ }
+ }
+ } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
+
+ # Grammar/spelling/lexical: this can be a one-to-one or
+ # one-to-many mapping. We should think about merging
+ # readings if it is one-to-many.
+
+ $type = 'grammatical' if $type =~ /gr/i;
+ $type = 'spelling' if $type =~ /sp/i;
+ $type = 'repetition' if $type =~ /rep/i;
+ # $type = 'lexical' if $type =~ /lex/i;
+ $rel_options{'type'} = $type;
+ $rel_options{'equal_rank'} = 1;
+ if( @$lemma == @$var ) {
+ foreach my $i ( 0 .. $#{$lemma} ) {
+ $collation->add_relationship( $var->[$i], $lemma->[$i],
+ \%rel_options );
+ }
+ } else {
+ # An uneven many-to-many mapping. Skip for now.
+ # We really want to make a segment out of whatever we have.
+ # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
+ # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
+ # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
+ if( @$lemma == 1 && @$var == 1 ) {
+ $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
+ }
+ }
+ } elsif( $type !~ /^(add|om|lex)$/i ) {
+ warn "Unrecognized type $type";
+ }
}
}
-
+
sub apply_edits {
my( $collation, $edit_sequence, $debug ) = @_;
- my @lemma_text = $collation->reading_sequence( $collation->start,
- $collation->reading( '#END#' ) );
+ my @lemma_text = $collation->reading_sequence(
+ $collation->start, $collation->end );
my $drift = 0;
foreach my $correction ( @$edit_sequence ) {
- my( $lemma_start, $length, $items ) = @$correction;
- my $offset = $base_text_index{$lemma_start};
- my $realoffset = $offset + $drift;
- if( $debug ||
- $lemma_text[$realoffset]->name ne $lemma_start ) {
- my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
- my @base_phrase;
- my $i = $realoffset;
- my $l = $collation->reading( $lemma_start );
- while( $i < $realoffset+$length ) {
- push( @base_phrase, $l );
- $l = $collation->next_reading( $l );
- $i++;
- }
-
- print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
- "with %s (%s) with drift %d\n",
- join( ' ', map {$_->label} @base_phrase ),
- join( ' ', map {$_->name} @base_phrase ),
- $realoffset,
- join( ' ', map {$_->label} @$items ),
- join( ' ', map {$_->name} @$items ),
- $drift,
- ) if $debug;
-
- warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
- "but %s (%s) is there instead",
- join( ' ', map {$_->label} @base_phrase ),
- join( ' ', map {$_->name} @base_phrase ),
- join( ' ', map {$_->label} @$items ),
- join( ' ', map {$_->name} @$items ),
- join( ' ', map {$_->label} @this_phrase ),
- join( ' ', map {$_->name} @this_phrase ),
- ) )
- if $lemma_text[$realoffset]->name ne $lemma_start;
- }
- splice( @lemma_text, $realoffset, $length, @$items );
- $drift += @$items - $length;
+ my( $lemma_start, $length, $items ) = @$correction;
+ my $offset = $base_text_index{$lemma_start};
+ my $realoffset = $offset + $drift;
+ if( $debug ||
+ $lemma_text[$realoffset]->id ne $lemma_start ) {
+ my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
+ my @base_phrase;
+ my $i = $realoffset;
+ my $l = $collation->reading( $lemma_start );
+ while( $i < $realoffset+$length ) {
+ push( @base_phrase, $l );
+ $l = $collation->next_reading( $l );
+ $i++;
+ }
+
+ print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
+ "with %s (%s) with drift %d\n",
+ join( ' ', map {$_->text} @base_phrase ),
+ join( ' ', map {$_->id} @base_phrase ),
+ $realoffset,
+ join( ' ', map {$_->text} @$items ),
+ join( ' ', map {$_->id} @$items ),
+ $drift,
+ ) if $debug;
+
+ if( $lemma_text[$realoffset]->id ne $lemma_start ) {
+ warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
+ "but %s (%s) is there instead",
+ join( ' ', map {$_->text} @base_phrase ),
+ join( ' ', map {$_->id} @base_phrase ),
+ join( ' ', map {$_->text} @$items ),
+ join( ' ', map {$_->id} @$items ),
+ join( ' ', map {$_->text} @this_phrase ),
+ join( ' ', map {$_->id} @this_phrase ),
+ ) );
+ # next;
+ }
+ }
+ splice( @lemma_text, $realoffset, $length, @$items );
+ $drift += @$items - $length;
}
return @lemma_text;
}
-
+
# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness. If not, return a false value.
sub _is_post_corr {
my( $sigil ) = @_;
if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
- return $1;
+ return $1;
}
return undef;
}
-sub _add_hash_entry {
- my( $hash, $key, $entry ) = @_;
- if( exists $hash->{$key} ) {
- push( @{$hash->{$key}}, $entry );
- } else {
- $hash->{$key} = [ $entry ];
- }
-}
-
-
-=item B<cmp_str>
-
-Pretend you never saw this method. Really it needs to not be hardcoded.
-
-=cut
-
-sub cmp_str {
- my( $reading ) = @_;
- my $word = $reading->label();
- $word = lc( $word );
- $word =~ s/\W//g;
- $word =~ s/v/u/g;
- $word =~ s/j/i/g;
- $word =~ s/cha/ca/g;
- $word =~ s/quatuor/quattuor/g;
- $word =~ s/ioannes/iohannes/g;
- return $word;
-}
=back