X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FBaseText.pm;h=ed2a447c0c8dd26d92d7c6bd80f89c676e29ced5;hb=e309421ae7b04ee2dcb71cdae2f206ffc6b6c384;hp=d7d090bf79cf9c1868d3baf03f3bcd6754c69b9c;hpb=e290206835dd4bc540d751ea2d1849255dd192f2;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index d7d090b..ed2a447 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -3,6 +3,7 @@ package Text::Tradition::Parser::BaseText; use strict; use warnings; use Module::Load; +use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry ); =head1 NAME @@ -75,165 +76,284 @@ underscore in its name. =cut +my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed + +my %base_text_index; +my $edits_required = {}; + +# edits_required -> wit -> [ { start_idx, end_idx, items } ] + sub merge_base { my( $collation, $base_file, @app_entries ) = @_; my @base_line_starts = read_base( $base_file, $collation ); my %all_witnesses; + my @unwitnessed_lemma_nodes; foreach my $app ( @app_entries ) { - my( $line, $num ) = split( /\./, $app->{_id} ); - # DEBUG with a short graph - # last if $line > 2; - # DEBUG for problematic entries - my $scrutinize = ""; - my $first_line_reading = $base_line_starts[ $line ]; - my $too_far = $base_line_starts[ $line+1 ]; - - my $lemma = $app->{rdg_0}; - my $seq = 1; - # Is this the Nth occurrence of this reading in the line? - if( $lemma =~ s/(_)?(\d)$// ) { - $seq = $2; - } - my @lemma_words = split( /\s+/, $lemma ); - - # Now search for the lemma words within this line. - my $lemma_start = $first_line_reading; - my $lemma_end; - my %seen; - while( $lemma_start ne $too_far ) { - # Loop detection - if( $seen{ $lemma_start->name() } ) { - warn "Detected loop at " . $lemma_start->name() . - ", ref $line,$num"; - last; - } - $seen{ $lemma_start->name() } = 1; - - # Try to match the lemma. - my $unmatch = 0; - print STDERR "Matching " . cmp_str( $lemma_start) . " against " . - $lemma_words[0] . "...\n" - if "$line.$num" eq $scrutinize; - if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { - # Skip it if we need a match that is not the first. - if( --$seq < 1 ) { - # Now we have to compare the rest of the words here. - if( scalar( @lemma_words ) > 1 ) { - my $next_reading = - $collation->next_reading( $lemma_start ); - foreach my $w ( @lemma_words[1..$#lemma_words] ) { - printf STDERR "Now matching %s against %s\n", - cmp_str($next_reading), $w - if "$line.$num" eq $scrutinize; - if( $w ne cmp_str($next_reading) ) { - $unmatch = 1; - last; - } else { - $lemma_end = $next_reading; - $next_reading = - $collation->next_reading( $lemma_end ); - } - } - } else { - $lemma_end = $lemma_start; - } - } else { - $unmatch = 1; - } - } - last unless ( $unmatch || !defined( $lemma_end ) ); - $lemma_end = undef; - $lemma_start = $collation->next_reading( $lemma_start ); - } - - unless( $lemma_end ) { - warn "No match found for @lemma_words at $line.$num"; - next; - } else { - # These are no longer common readings; unmark them as such. - my @lemma_readings = $collation->reading_sequence( $lemma_start, - $lemma_end ); - map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings; - } - - # Now we have our lemma readings; we add the variant readings - # to the collation. - - # Keep track of the start and end point of each reading for later - # reading collapse. - my @readings = ( $lemma_start, $lemma_end ); - - # For each reading that is not rdg_0, we make a chain of readings - # and connect them to the anchor. Edges are named after the mss - # that are relevant. - foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { - next if $k eq 'rdg_0'; # that's the lemma. - # TODO look at the lemma for any p.c. readings, and add - # them explicitly! - my @variant = split( /\s+/, $app->{$k} ); - @variant = () if $app->{$k} eq '/'; # This is an omission. - my @mss = grep { $app->{$_} eq $k } keys( %$app ); - - unless( @mss ) { - print STDERR "Skipping '@variant' at $line.$num: no mss\n"; - next; - } - - # Keep track of what witnesses we have seen. - @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); - - # Make the variant into a set of readings. - my $ctr = 0; - my $last_reading = $collation->prior_reading( $lemma_start ); - my $var_start; - foreach my $vw ( @variant ) { - my $vwname = "$k/$line.$num.$ctr"; $ctr++; - my $vwreading = $collation->add_reading( $vwname ); - $vwreading->text( $vw ); - $vwreading->make_variant(); - foreach ( @mss ) { - $collation->add_path( $last_reading, $vwreading, $_ ); - } - $var_start = $vwreading unless $var_start; - $last_reading = $vwreading; - } - # Now hook it up at the end. - foreach ( @mss ) { - $collation->add_path( $last_reading, - $collation->next_word( $lemma_end ), - $_ ); - } - - if( $var_start ) { # if it wasn't an empty reading - push( @readings, $var_start, $last_reading ); - } - } - - # Now collate and collapse the identical readings within the collation. - collate_variants( $collation, @readings ); + my( $line, $num ) = split( /\./, $app->{_id} ); + # DEBUG with a short graph + last if $SHORTEND && $line > $SHORTEND; + # DEBUG for problematic entries + my $scrutinize = ''; + my $first_line_reading = $base_line_starts[ $line ]; + my $too_far = $base_line_starts[ $line+1 ]; + + my $lemma = $app->{rdg_0}; + my $seq = 1; + # Is this the Nth occurrence of this reading in the line? + if( $lemma =~ s/(_)?(\d)$// ) { + $seq = $2; + } + my @lemma_words = split( /\s+/, $lemma ); + + # Now search for the lemma words within this line. + my $lemma_start = $first_line_reading; + my $lemma_end; + my %seen; + while( $lemma_start ne $too_far ) { + # Loop detection + if( $seen{ $lemma_start->name() } ) { + warn "Detected loop at " . $lemma_start->name() . + ", ref $line,$num"; + last; + } + $seen{ $lemma_start->name() } = 1; + + # Try to match the lemma. + my $unmatch = 0; + print STDERR "Matching " . cmp_str( $lemma_start) . " against " . + $lemma_words[0] . "...\n" + if "$line.$num" eq $scrutinize; + if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { + # Skip it if we need a match that is not the first. + if( --$seq < 1 ) { + # Now we have to compare the rest of the words here. + if( scalar( @lemma_words ) > 1 ) { + my $next_reading = + $collation->next_reading( $lemma_start ); + foreach my $w ( @lemma_words[1..$#lemma_words] ) { + printf STDERR "Now matching %s against %s\n", + cmp_str($next_reading), $w + if "$line.$num" eq $scrutinize; + if( $w ne cmp_str($next_reading) ) { + $unmatch = 1; + last; + } else { + $lemma_end = $next_reading; + $next_reading = + $collation->next_reading( $lemma_end ); + } + } + } else { + $lemma_end = $lemma_start; + } + } else { + $unmatch = 1; + } + } + last unless ( $unmatch || !defined( $lemma_end ) ); + $lemma_end = undef; + $lemma_start = $collation->next_reading( $lemma_start ); + } + + unless( $lemma_end ) { + warn "No match found for @lemma_words at $line.$num"; + next; + } + + # Now we have found the lemma; we will record an 'edit', in + # terms of a splice operation, for each subsequent reading. + # We also note which witnesses take the given edit. + + my @lemma_set = $collation->reading_sequence( $lemma_start, + $lemma_end ); + my @reading_sets = [ @lemma_set ]; + + # For each reading that is not rdg_0, we create the variant + # reading nodes, and store the range as an edit operation on + # the base text. + my $variant_objects; + my %pc_seen; # Keep track of mss with explicit post-corr data + foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { + my @mss = grep { $app->{$_} eq $k } keys( %$app ); + + # Keep track of lemma nodes that don't actually appear in + # any MSS; we will want to remove them from the collation. + push( @unwitnessed_lemma_nodes, @lemma_set ) + if !@mss && $k eq 'rdg_0'; + + # Keep track of what witnesses we have seen. + @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); + # Keep track of which witnesses bear corrected readings here. + foreach my $m ( @mss ) { + my $base = _is_post_corr( $m ); + next unless $base; + $pc_seen{$base} = 1; + } + next if $k eq 'rdg_0'; + + # Parse the variant into reading tokens. + # TODO don't hardcode the reading split operation + my @variant = split( /\s+/, $app->{$k} ); + @variant = () if $app->{$k} eq '/'; # This is an omission. + + my @variant_readings; + my $ctr = 0; + foreach my $vw ( @variant ) { + my $vwname = "$k/$line.$num.$ctr"; $ctr++; + my $vwreading = $collation->add_reading( $vwname ); + $vwreading->text( $vw ); + push( @variant_readings, $vwreading ); + } + + $variant_objects->{$k} = { 'mss' => \@mss, + 'reading' => \@variant_readings, + }; + push( @reading_sets, \@variant_readings ); + } + + # Now collate and collapse the identical readings within the + # collated sets. Modifies the reading sets that were passed. + collate_variants( $collation, @reading_sets ); + + # Record any stated relationships between the nodes and the lemma. + set_relationships( $collation, $app, \@lemma_set, $variant_objects ); + + # Now create the splice-edit objects that will be used + # to reconstruct each witness. + + foreach my $rkey ( keys %$variant_objects ) { + # Object is argument list for splice, so: + # offset, length, replacements + my $edit_object = [ $lemma_start->name, + scalar( @lemma_set ), + $variant_objects->{$rkey}->{reading} ]; + foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) { + # Is this a p.c. entry? + my $base = _is_post_corr( $ms ); + if( $base ) { # this is a post-corr witness + my $pc_key = $base . "_post"; + add_hash_entry( $edits_required, $pc_key, $edit_object ); + } else { # this is an ante-corr witness + my $pc_key = $ms . "_post"; + add_hash_entry( $edits_required, $ms, $edit_object ); + unless( $pc_seen{$ms} ) { + # If this witness carries no correction, add this + # same object to its post-corrected state. + add_hash_entry( $edits_required, $pc_key, + $edit_object ); + } + } + } + } + } # Finished going through the apparatus entries + + # Now make the witness objects, and create their text sequences + foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) { + print STDERR "Creating witness $w\n"; + my $witness_obj = $collation->tradition->add_witness( sigil => $w ); + my $debug; # = $w eq 'Vb11'; + my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug ); + my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug ) + if exists( $edits_required->{$w."_post"} ); + + my @repeated = check_for_repeated( @ante_corr_seq ); + warn "Repeated elements @repeated in $w a.c." + if @repeated; + @repeated = check_for_repeated( @post_corr_seq ); + warn "Repeated elements @repeated in $w p.c." + if @repeated; + + # Now save these paths in my witness object + if( @post_corr_seq ) { + $witness_obj->path( \@post_corr_seq ); + $witness_obj->uncorrected_path( \@ante_corr_seq ); + } else { + $witness_obj->path( \@ante_corr_seq ); + } } - # Now make the witness objects - foreach my $w ( keys %all_witnesses ) { - my $base = _is_post_corr( $w ); - if( $base ) { - my $pctag = substr( $w, length( $base ) ); - my $existing_wit = $collation->tradition->witness( $base ); - unless( $existing_wit ) { - $existing_wit = $collation->tradition->add_witness( $base ); - } - $existing_wit->post_correctione( $pctag ); - } else { - $collation->tradition->add_witness( $w ) - unless $collation->tradition->witness( $w ); - } + # Now remove our 'base text' edges, which is to say, the only + # ones we have created so far. Also remove any unwitnessed + # lemma nodes (TODO unless we are treating base as witness) + foreach ( $collation->paths() ) { + $collation->del_path( $_ ); + } + foreach( @unwitnessed_lemma_nodes ) { + $collation->del_reading( $_ ); + # TODO do we need to delete any relationship paths here? } - # Now walk paths and calculate positions. - my @common_readings = - $collation->walk_and_expand_base( $collation->reading( '#END#' ) ); - $collation->calculate_positions( @common_readings ); + ### HACKY HACKY Do some one-off path corrections here. + if( $collation->linear ) { + my $c = $collation; + my $end = $SHORTEND ? $SHORTEND : 155; + # Vb11 + my $path; + if( $end > 16 ) { + $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); + $path = $c->tradition->witness('Vb11')->path; + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); + $path = $c->tradition->witness('Vb11')->uncorrected_path; + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); + } + # What else? + # Vb26: + $path = $c->tradition->witness('Vb26')->path; + splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46; + # Vb13: + $path = $c->tradition->witness('Vb13')->path; + splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; + $path = $c->tradition->witness('Vb13')->uncorrected_path; + splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58; + # Vb20 a.c.: + $path = $c->tradition->witness('Vb20')->uncorrected_path; + splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94; + # Vb5: + $path = $c->tradition->witness('Vb5')->path; + splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106; + # extraneous: + $c->del_reading( 'rdg_2/147.6.13' ); + $c->del_reading( 'rdg_2/147.6.14' ); + $c->del_reading( 'rdg_2/147.6.15' ); + + } else { + my $c = $collation; + my $end = $SHORTEND ? $SHORTEND : 155; + # Vb5: + my $path = $c->tradition->witness('Vb5')->path; + splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106; + # Vb11: + $path = $c->tradition->witness('Vb11')->path; + if( $end > 16 ) { + $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) ); + } + # Vb13: + $path = $c->tradition->witness('Vb13')->path; + splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; + $path = $c->tradition->witness('Vb13')->uncorrected_path; + splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58; + # Vb20 a.c.: + $path = $c->tradition->witness('Vb20')->uncorrected_path; + splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94; + # Vb26: + $path = $c->tradition->witness('Vb26')->path; + splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46; + } + + # Now walk paths and calculate positional rank. + $collation->make_witness_paths(); + # Have to check relationship validity at this point, because before that + # we had no paths. +# foreach my $rel ( $collation->relationships ) { +# next unless $rel->equal_rank; +# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) { +# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting", +# $rel->type, $rel->from->name, $rel->to->name ); +# } +# } + $collation->calculate_ranks(); } =item B @@ -254,203 +374,180 @@ sub read_base { # This array gives the first reading for each line. We put the # common starting point in line zero. my $last_reading = $collation->start(); + $base_text_index{$last_reading->name} = 0; my $lineref_array = [ $last_reading ]; # There is no line zero. open( BASE, $base_file ) or die "Could not open file $base_file: $!"; + my $i = 1; while() { - # Make the readings, and connect them up for the base, but - # also save the first reading of each line in an array for the - # purpose. - # TODO use configurable reading separator - chomp; - my @words = split; - my $started = 0; - my $wordref = 0; - my $lineref = scalar @$lineref_array; - foreach my $w ( @words ) { - my $readingref = join( ',', $lineref, ++$wordref ); - my $reading = $collation->add_reading( $readingref ); - $reading->text( $w ); - $reading->make_common(); - unless( $started ) { - push( @$lineref_array, $reading ); - $started = 1; - } - if( $last_reading ) { - my $path = $collation->add_path( $last_reading, $reading, - "base text" ); - $path->set_attribute( 'class', 'basetext' ); - $last_reading = $reading; - } # TODO there should be no else here... - } + # Make the readings, and connect them up for the base, but + # also save the first reading of each line in an array for the + # purpose. + # TODO use configurable reading separator + chomp; + my @words = split; + my $started = 0; + my $wordref = 0; + my $lineref = scalar @$lineref_array; + last if $SHORTEND && $lineref > $SHORTEND; + foreach my $w ( @words ) { + my $readingref = join( ',', $lineref, ++$wordref ); + my $reading = $collation->add_reading( $readingref ); + $reading->text( $w ); + unless( $started ) { + push( @$lineref_array, $reading ); + $started = 1; + } + # Add edge paths in the graph, for easier tracking when + # we start applying corrections. These paths will be + # removed when we're done. + my $path = $collation->add_path( $last_reading, $reading, + $collation->baselabel ); + $last_reading = $reading; + + # Note an array index for the reading, for later correction splices. + $base_text_index{$readingref} = $i++; + } } close BASE; # Ending point for all texts - my $endpoint = $collation->add_reading( '#END#' ); - $collation->add_path( $last_reading, $endpoint, "base text" ); - push( @$lineref_array, $endpoint ); + $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); + push( @$lineref_array, $collation->end ); + $base_text_index{$collation->end->name} = $i; return( @$lineref_array ); } -=item B - -collate_variants( $collation, @readings ) - -Given a set of readings in the form -( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) -walks through each to identify those readings that are identical. The -collation is a Text::Tradition::Collation object; the elements of -@readings are Text::Tradition::Collation::Reading objects that appear -on the collation graph. - -TODO: Handle collapsed and non-collapsed transpositions. - -=cut - -sub collate_variants { - my( $collation, @readings ) = @_; - my $lemma_start = shift @readings; - my $lemma_end = shift @readings; - my $detranspose = 0; - - # We need to calculate positions at this point, which is where - # we are getting the implicit information from the apparatus. - - # Start the list of distinct readings with those readings in the lemma. - my @distinct_readings; - my $position = 0; - while( $lemma_start ne $lemma_end ) { - push( @distinct_readings, [ $lemma_start, 'base text', $position++ ] ); - $lemma_start = $collation->next_word( $lemma_start ); - } - push( @distinct_readings, [ $lemma_end, 'base text', $position++ ] ); - - - while( scalar @readings ) { - my( $var_start, $var_end ) = splice( @readings, 0, 2 ); - - # I want to look at the readings in the variant and lemma, and - # collapse readings that are the same word. This is mini-collation. - # Each word in the 'main' list can only be collapsed once with a - # word from the current reading. - my %collapsed = (); - - # Get the label. There will only be one outgoing path to start - # with, so this is safe. - my @out = $var_start->outgoing(); - my $var_label = $out[0]->label(); - - my @variant_readings; - while( $var_start ne $var_end ) { - push( @variant_readings, $var_start ); - $var_start = $collation->next_word( $var_start, $var_label ); - } - push( @variant_readings, $var_end ); - - # Go through the variant readings, and if we find a lemma reading that - # hasn't yet been collapsed with a reading, equate them. If we do - # not, keep them to push onto the end of all_readings. - my @remaining_readings; - my $last_index = 0; - my $curr_pos = 0; - foreach my $w ( @variant_readings ) { - my $word = $w->label(); - my $matched = 0; - foreach my $idx ( $last_index .. $#distinct_readings ) { - my( $l, $pathlabel, $pos ) = @{$distinct_readings[$idx]}; - if( $word eq cmp_str( $l ) ) { - next if exists( $collapsed{ $l->label } ) - && $collapsed{ $l->label } eq $l; - $matched = 1; - $last_index = $idx if $detranspose; - # Collapse the readings. - printf STDERR "Merging readings %s/%s and %s/%s\n", - $l->name, $l->label, $w->name, $w->label; - $collation->merge_readings( $l, $w ); - $collapsed{ $l->label } = $l; - # Now collapse any multiple paths to and from the reading. - remove_duplicate_paths( $collation, - $collation->prior_word( $l, $pathlabel ), $l ); - remove_duplicate_paths( $collation, $l, - $collation->next_word( $l, $pathlabel ) ); - $curr_pos = $pos; - last; - } - } - push( @remaining_readings, [ $w, $var_label, $curr_pos++ ] ) unless $matched; - } - push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings ); - } - - # Now set the positions of all the readings in this variation. - #$DB::single = 1; - print STDERR "Readings and their positions are:\n"; - foreach my $n ( @distinct_readings ) { - printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2]; +sub set_relationships { + my( $collation, $app, $lemma, $variants ) = @_; + foreach my $rkey ( keys %$variants ) { + my $var = $variants->{$rkey}->{'reading'}; + my $type = $app->{sprintf( "_%s_type", $rkey )}; + my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )}; + my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )}; + + my %rel_options = (); + $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/; + $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/; + + if( $type =~ /^(inv|tr|rep)$/i ) { + # Transposition or repetition: look for nodes with the + # same label but different IDs and mark them. + $type = 'repetition' if $type =~ /^rep/i; + $rel_options{'type'} = $type; + $rel_options{'equal_rank'} = undef; + my %labels; + foreach my $r ( @$lemma ) { + $labels{cmp_str( $r )} = $r; + } + foreach my $r( @$var ) { + if( exists $labels{$r->label} && + $r->name ne $labels{$r->label}->name ) { + if( $type eq 'repetition' ) { + # Repetition + $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); + } else { + # Transposition + $r->set_identical( $labels{$r->label} ); + } + } + } + } elsif( $type =~ /^(gr|sp(el)?)$/i ) { + + # Grammar/spelling/lexical: this can be a one-to-one or + # one-to-many mapping. We should think about merging + # readings if it is one-to-many. + + $type = 'grammatical' if $type =~ /gr/i; + $type = 'spelling' if $type =~ /sp/i; + $type = 'repetition' if $type =~ /rep/i; + # $type = 'lexical' if $type =~ /lex/i; + $rel_options{'type'} = $type; + $rel_options{'equal_rank'} = 1; + if( @$lemma == @$var ) { + foreach my $i ( 0 .. $#{$lemma} ) { + $collation->add_relationship( $var->[$i], $lemma->[$i], + \%rel_options ); + } + } else { + # An uneven many-to-many mapping. Skip for now. + # We really want to make a segment out of whatever we have. + # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0]; + # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0]; + # $collation->add_relationship( $varseg, $lemseg, \%rel_options ); + if( @$lemma == 1 && @$var == 1 ) { + $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options ); + } + } + } elsif( $type !~ /^(add|om|lex)$/i ) { + warn "Unrecognized type $type"; + } } } - -=item B - -remove_duplicate_paths( $collation, $from, $to ); - -Given two readings, reduce the number of paths between those readings to -one. If neither path represents a base text, combine their labels. - -=cut - -sub remove_duplicate_paths { - my( $collation, $from, $to ) = @_; - my @paths = $from->paths_to( $to ); - if( scalar @paths > 1 ) { - my @base = grep { $_->label eq 'base text' } @paths; - if ( scalar @base ) { - # Remove the paths that are not base. - foreach my $e ( @paths ) { - $collation->del_path( $e ) - unless $e eq $base[0]; - } - } else { - # Combine the paths into one. - my $new_path_name = join( ', ', map { $_->label() } @paths ); - my $new_path = shift @paths; - $new_path->set_attribute( 'label', $new_path_name ); - foreach my $e ( @paths ) { - $collation->del_path( $e ); - } - } + + + +sub apply_edits { + my( $collation, $edit_sequence, $debug ) = @_; + my @lemma_text = $collation->reading_sequence( $collation->start, + $collation->reading( '#END#' ) ); + my $drift = 0; + foreach my $correction ( @$edit_sequence ) { + my( $lemma_start, $length, $items ) = @$correction; + my $offset = $base_text_index{$lemma_start}; + my $realoffset = $offset + $drift; + if( $debug || + $lemma_text[$realoffset]->name ne $lemma_start ) { + my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; + my @base_phrase; + my $i = $realoffset; + my $l = $collation->reading( $lemma_start ); + while( $i < $realoffset+$length ) { + push( @base_phrase, $l ); + $l = $collation->next_reading( $l ); + $i++; + } + + print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . + "with %s (%s) with drift %d\n", + join( ' ', map {$_->label} @base_phrase ), + join( ' ', map {$_->name} @base_phrase ), + $realoffset, + join( ' ', map {$_->label} @$items ), + join( ' ', map {$_->name} @$items ), + $drift, + ) if $debug; + + if( $lemma_text[$realoffset]->name ne $lemma_start ) { + warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . + "but %s (%s) is there instead", + join( ' ', map {$_->label} @base_phrase ), + join( ' ', map {$_->name} @base_phrase ), + join( ' ', map {$_->label} @$items ), + join( ' ', map {$_->name} @$items ), + join( ' ', map {$_->label} @this_phrase ), + join( ' ', map {$_->name} @this_phrase ), + ) ); + # next; + } + } + splice( @lemma_text, $realoffset, $length, @$items ); + $drift += @$items - $length; } + return @lemma_text; } + # Helper function. Given a witness sigil, if it is a post-correctione # sigil,return the base witness. If not, return a false value. sub _is_post_corr { my( $sigil ) = @_; - if( $sigil =~ /^(.*?)(\s*\(p\.\s*c\.\))$/ ) { - return $1; + if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) { + return $1; } return undef; } -=item B - -Pretend you never saw this method. Really it needs to not be hardcoded. - -=cut - -sub cmp_str { - my( $reading ) = @_; - my $word = $reading->label(); - $word = lc( $word ); - $word =~ s/\W//g; - $word =~ s/v/u/g; - $word =~ s/j/i/g; - $word =~ s/cha/ca/g; - $word =~ s/quatuor/quattuor/g; - $word =~ s/ioannes/iohannes/g; - return $word; -} =back