=cut
-my $DETRANSPOSE = 0;
sub parse {
my( $tradition, %opts ) = @_;
my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
load( $format_mod );
my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
- $DETRANSPOSE = 1 if $opts{'linear'};
merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
}
=cut
-my $SHORT = 25; # Debug var - set this to limit the number of lines parsed
+my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
my %base_text_index;
my $edits_required = {};
foreach my $app ( @app_entries ) {
my( $line, $num ) = split( /\./, $app->{_id} );
# DEBUG with a short graph
- last if $SHORT && $line > $SHORT;
+ last if $SHORTEND && $line > $SHORTEND;
# DEBUG for problematic entries
my $scrutinize = '';
my $first_line_reading = $base_line_starts[ $line ];
my %pc_seen; # Keep track of mss with explicit post-corr data
foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
my @mss = grep { $app->{$_} eq $k } keys( %$app );
+
+ # Keep track of lemma nodes that don't actually appear in
+ # any MSS; we will want to remove them from the collation.
push( @unwitnessed_lemma_nodes, @lemma_set )
if !@mss && $k eq 'rdg_0';
}
next if $k eq 'rdg_0';
+ # Parse the variant into reading tokens.
# TODO don't hardcode the reading split operation
my @variant = split( /\s+/, $app->{$k} );
@variant = () if $app->{$k} eq '/'; # This is an omission.
- # Make the variant into a set of readings.
my @variant_readings;
my $ctr = 0;
foreach my $vw ( @variant ) {
# Now collate and collapse the identical readings within the
# collated sets. Modifies the reading sets that were passed.
- $DB::single = 1 if "$line.$num" eq '16.2';
collate_variants( $collation, @reading_sets );
+ # Record any stated relationships between the nodes and the lemma.
+ set_relationships( $collation, $app, \@lemma_set, $variant_objects );
+
# Now create the splice-edit objects that will be used
# to reconstruct each witness.
foreach my $rkey ( keys %$variant_objects ) {
# Object is argument list for splice, so:
# offset, length, replacements
- my $edit_object = [ $base_text_index{$lemma_start->name},
+ my $edit_object = [ $lemma_start->name,
scalar( @lemma_set ),
$variant_objects->{$rkey}->{reading} ];
foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
# Now make the witness objects, and create their text sequences
foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
+ print STDERR "Creating witness $w\n";
my $witness_obj = $collation->tradition->add_witness( sigil => $w );
- my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
- my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
+ my $debug; # = $w eq 'Vb11';
+ my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
+ my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
if exists( $edits_required->{$w."_post"} );
+ my @repeated = _check_for_repeated( @ante_corr_seq );
+ warn "Repeated elements @repeated in $w a.c."
+ if @repeated;
+ @repeated = _check_for_repeated( @post_corr_seq );
+ warn "Repeated elements @repeated in $w p.c."
+ if @repeated;
+
# Now save these paths in my witness object
if( @post_corr_seq ) {
$witness_obj->path( \@post_corr_seq );
- my @ante_corr = make_witness_uncorrections( \@post_corr_seq,
- \@ante_corr_seq );
- $witness_obj->ante_corr( \@ante_corr );
+ $witness_obj->uncorrected_path( \@ante_corr_seq );
} else {
$witness_obj->path( \@ante_corr_seq );
}
}
# Now remove our 'base text' edges, which is to say, the only
- # ones we have created so far. Also remove any nodes that didn't
- # appear in any witnesses.
+ # ones we have created so far. Also remove any unwitnessed
+ # lemma nodes (TODO unless we are treating base as witness)
foreach ( $collation->paths() ) {
$collation->del_path( $_ );
}
$collation->del_reading( $_ );
}
+ ### HACKY HACKY Do some one-off path corrections here.
+ if( $collation->linear ) {
+ my $c = $collation;
+ my $end = $SHORTEND ? $SHORTEND : 155;
+ my $path = $c->tradition->witness('Vb11')->path;
+ if( $end > 16 ) {
+ $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+ splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
+ }
+ # What else?
+ } else {
+ my $c = $collation;
+ my $end = $SHORTEND ? $SHORTEND : 155;
+ # Vb5:
+ my $path = $c->tradition->witness('Vb5')->path;
+ splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
+ # Vb11:
+ $path = $c->tradition->witness('Vb11')->path;
+ if( $end > 16 ) {
+ $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+ splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
+ }
+ # Vb12 a.c.:
+ $path = $c->tradition->witness('Vb12')->uncorrected_path;
+ splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137;
+ # Vb13:
+ $path = $c->tradition->witness('Vb13')->path;
+ splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+ # Vb20 a.c.:
+ $path = $c->tradition->witness('Vb20')->uncorrected_path;
+ splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94;
+ # Vb26:
+ $path = $c->tradition->witness('Vb26')->path;
+ splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
+ }
+
# Now walk paths and calculate positions.
my @common_readings =
$collation->make_witness_paths();
$collation->calculate_positions( @common_readings );
}
+sub _check_for_repeated {
+ my @seq = @_;
+ my %unique;
+ my @repeated;
+ foreach ( @seq ) {
+ if( exists $unique{$_->name} ) {
+ push( @repeated, $_->name );
+ } else {
+ $unique{$_->name} = 1;
+ }
+ }
+ return @repeated;
+}
+
=item B<read_base>
my @line_beginnings = read_base( 'reference.txt', $collation );
my $started = 0;
my $wordref = 0;
my $lineref = scalar @$lineref_array;
- last if $SHORT && $lineref > $SHORT;
+ last if $SHORTEND && $lineref > $SHORTEND;
foreach my $w ( @words ) {
my $readingref = join( ',', $lineref, ++$wordref );
my $reading = $collation->add_reading( $readingref );
sub collate_variants {
my( $collation, @reading_sets ) = @_;
- # Merge the nodes across the sets so that there is only one node
- # for any given reading. Use diff to identify the 'same' nodes.
+ # Two different ways to do this, depending on whether we want
+ # transposed reading nodes to be merged into one (producing a
+ # nonlinear, bidirectional graph) or not (producing a relatively
+ # linear, unidirectional graph.)
+ return $collation->linear ? collate_linearly( @_ )
+ : collate_nonlinearly( @_ );
+}
- my $lemma_set = shift @reading_sets;
+sub collate_linearly {
+ my( $collation, $lemma_set, @variant_sets ) = @_;
my @unique;
push( @unique, @$lemma_set );
-
- while( @reading_sets ) {
- my $variant_set = shift @reading_sets;
- if( $DETRANSPOSE ) {
- # Use diff to do this job
- my $diff = Algorithm::Diff->new( \@unique, $variant_set,
- {'keyGen' => \&_collation_hash} );
- my @new_unique;
- while( $diff->Next ) {
- if( $diff->Same ) {
- # merge the nodes
- my @l = $diff->Items( 1 );
- my @v = $diff->Items( 2 );
- foreach my $i ( 0 .. $#l ) {
+ while( @variant_sets ) {
+ my $variant_set = shift @variant_sets;
+ # Use diff to do this job
+ my $diff = Algorithm::Diff->new( \@unique, $variant_set,
+ {'keyGen' => \&_collation_hash} );
+ my @new_unique;
+ my %merged;
+ while( $diff->Next ) {
+ if( $diff->Same ) {
+ # merge the nodes
+ my @l = $diff->Items( 1 );
+ my @v = $diff->Items( 2 );
+ foreach my $i ( 0 .. $#l ) {
+ if( !$merged{$l[$i]->name} ) {
$collation->merge_readings( $l[$i], $v[$i] );
+ $merged{$l[$i]->name} = 1;
+ } else {
+ print STDERR "Would have double merged " . $l[$i]->name . "\n";
}
- # splice the lemma nodes into the variant set
- my( $offset ) = $diff->Get( 'min2' );
- splice( @$variant_set, $offset, scalar( @l ), @l );
- push( @new_unique, @l );
- } else {
- # Keep the old unique readings
- push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
- # Add the new readings to the 'unique' list
- push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
}
+ # splice the lemma nodes into the variant set
+ my( $offset ) = $diff->Get( 'min2' );
+ splice( @$variant_set, $offset, scalar( @l ), @l );
+ push( @new_unique, @l );
+ } else {
+ # Keep the old unique readings
+ push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+ # Add the new readings to the 'unique' list
+ push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
}
- @unique = @new_unique;
- } else {
- # It becomes a much simpler job
- $DB::single = 1;
- my @distinct;
- foreach my $idx ( 0 .. $#{$variant_set} ) {
- my $vw = $variant_set->[$idx];
- my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
- if( @same ) {
- $collation->merge_readings( $same[0], $vw );
- $variant_set->[$idx] = $same[0];
- } else {
- push( @distinct, $vw );
+ }
+ @unique = @new_unique;
+ }
+}
+
+sub collate_nonlinearly {
+ my( $collation, $lemma_set, @variant_sets ) = @_;
+
+ my @unique;
+ push( @unique, @$lemma_set );
+ while( @variant_sets ) {
+ my $variant_set = shift @variant_sets;
+ # Simply match the first reading that carries the same word, so
+ # long as that reading has not yet been used to match another
+ # word in this variant. That way lies loopy madness.
+ my @distinct;
+ my %merged;
+ foreach my $idx ( 0 .. $#{$variant_set} ) {
+ my $vw = $variant_set->[$idx];
+ my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
+ my $matched;
+ if( @same ) {
+ foreach my $i ( 0 .. $#same ) {
+ unless( $merged{$same[$i]->name} ) {
+ #print STDERR sprintf( "Merging %s into %s\n",
+ # $vw->name,
+ # $same[$i]->name );
+ $collation->merge_readings( $same[$i], $vw );
+ $merged{$same[$i]->name} = 1;
+ $matched = $i;
+ $variant_set->[$idx] = $same[$i];
+ }
}
}
- push( @unique, @distinct );
+ unless( @same && defined($matched) ) {
+ push( @distinct, $vw );
+ }
}
+ push( @unique, @distinct );
}
-
- return;
}
+
sub _collation_hash {
my $node = shift;
return cmp_str( $node );
}
-sub apply_edits {
- my( $collation, $edit_sequence ) = @_;
- my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
- keys %base_text_index;
- my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
+sub set_relationships {
+ my( $collation, $app, $lemma, $variants ) = @_;
+ foreach my $rkey ( keys %$variants ) {
+ my $var = $variants->{$rkey}->{'reading'};
+ my $type = $app->{sprintf( "_%s_type", $rkey )};
+ my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
+ my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
+
+ my %rel_options = ();
+ $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
+ $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
+
+ if( $type =~ /^(inv|tr|rep)$/i ) {
+ # Transposition or repetition: look for nodes with the
+ # same label but different IDs and mark them.
+ $type = 'repetition' if $type =~ /^rep/i;
+ $rel_options{'type'} = $type;
+ my %labels;
+ foreach my $r ( @$lemma ) {
+ $labels{cmp_str( $r )} = $r;
+ }
+ foreach my $r( @$var ) {
+ if( exists $labels{$r->label} &&
+ $r->name ne $labels{$r->label}->name ) {
+ if( $type eq 'repetition' ) {
+ # Repetition
+ $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+ } else {
+ # Transposition
+ $r->set_identical( $labels{$r->label} );
+ }
+ }
+ }
+ } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) {
+
+ # Grammar/spelling/lexical: this can be a one-to-one or
+ # one-to-many mapping. We should think about merging
+ # readings if it is one-to-many.
+
+ $type = 'grammatical' if $type =~ /gr/i;
+ $type = 'spelling' if $type =~ /sp/i;
+ $type = 'repetition' if $type =~ /rep/i;
+ $type = 'lexical' if $type =~ /lex/i;
+ $rel_options{'type'} = $type;
+ if( @$lemma == @$var ) {
+ foreach my $i ( 0 .. $#{$lemma} ) {
+ $collation->add_relationship( $var->[$i], $lemma->[$i],
+ \%rel_options );
+ }
+ } else {
+ # An uneven many-to-many mapping. Make a segment out of
+ # whatever we have.
+ my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
+ my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
+ $collation->add_relationship( $varseg, $lemseg, \%rel_options );
+ }
+ } elsif( $type !~ /^(add|om)$/i ) {
+ warn "Unrecognized type $type";
+ }
+ }
+}
+
+
+sub apply_edits {
+ my( $collation, $edit_sequence, $debug ) = @_;
+ my @lemma_text = $collation->reading_sequence( $collation->start,
+ $collation->reading( '#END#' ) );
my $drift = 0;
foreach my $correction ( @$edit_sequence ) {
- my( $offset, $length, $items ) = @$correction;
+ my( $lemma_start, $length, $items ) = @$correction;
+ my $offset = $base_text_index{$lemma_start};
my $realoffset = $offset + $drift;
+ if( $debug ||
+ $lemma_text[$realoffset]->name ne $lemma_start ) {
+ my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
+ my @base_phrase;
+ my $i = $realoffset;
+ my $l = $collation->reading( $lemma_start );
+ while( $i < $realoffset+$length ) {
+ push( @base_phrase, $l );
+ $l = $collation->next_reading( $l );
+ $i++;
+ }
+
+ print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
+ "with %s (%s) with drift %d\n",
+ join( ' ', map {$_->label} @base_phrase ),
+ join( ' ', map {$_->name} @base_phrase ),
+ $realoffset,
+ join( ' ', map {$_->label} @$items ),
+ join( ' ', map {$_->name} @$items ),
+ $drift,
+ ) if $debug;
+
+ if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+ warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
+ "but %s (%s) is there instead",
+ join( ' ', map {$_->label} @base_phrase ),
+ join( ' ', map {$_->name} @base_phrase ),
+ join( ' ', map {$_->label} @$items ),
+ join( ' ', map {$_->name} @$items ),
+ join( ' ', map {$_->label} @this_phrase ),
+ join( ' ', map {$_->name} @this_phrase ),
+ ) );
+ # next;
+ }
+ }
splice( @lemma_text, $realoffset, $length, @$items );
$drift += @$items - $length;
}
return @lemma_text;
}
-
-sub make_witness_uncorrections {
- my( $path, $uncorr_path ) = @_;
- my $diff = Algorithm::Diff->new( $path, $uncorr_path,
- { 'keyGen' => \&_collation_hash } );
- # We basically just want to make a bunch of splice arguments that
- # will reconstruct the ante-corr text from the post-corr.
- my @diff_list;
- while( $diff->Next ) {
- next if $diff->Same;
- my( $offset ) = $diff->Get( 'min1' );
- my $length = scalar( $diff->Items( 1 ) );
- my $items = []; push( @$items, $diff->Items( 2 ) );
- push( @diff_list, [ $offset, $length, $items ] );
- }
- return @diff_list;
-}
# Helper function. Given a witness sigil, if it is a post-correctione