From: tla Date: Wed, 25 May 2011 13:05:22 +0000 (+0200) Subject: working with new base text merge routine, up to line 25 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=6a222840d60d8ef64c47e5ce0c4f033db1f72e2b;p=scpubgit%2Fstemmatology.git working with new base text merge routine, up to line 25 --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 8e028a9..c2deeae 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -72,10 +72,10 @@ sub BUILD { if( $format ) { my @parseargs; if( $format =~ /^(CSV|CTE)$/ ) { - @parseargs = ( 'base' => $init_args->{'base'}, - 'data' => $init_args->{$format}, - 'format' => $format ); + $init_args->{'data'} = $init_args->{$format}; + $init_args->{'format'} = $format; $format = 'BaseText'; + @parseargs = %$init_args; } else { @parseargs = ( $init_args->{ $format } ); } diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index ea194ef..4686d5e 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -299,15 +299,15 @@ sub collapse_graph_edges { $label = join( ', ', @{$newlabels->{$newdest}} ); } else { ## TODO FIX THIS HACK - my @pclabels; + my @aclabels; foreach my $wit ( @{$newlabels->{$newdest}} ) { - if( $wit =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) { - push( @pclabels, $wit ); + if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ) { + push( @aclabels, $wit ); } else { push( @compressed_wits, $wit ); } } - $label = join( ', ', 'majority', @pclabels ); + $label = join( ', ', 'majority', @aclabels ); } my $newedge = @@ -555,86 +555,60 @@ sub _remove_common { # An alternative to walk_witness_paths, for use when a collation is -# constructed from a base text and an apparatus. Also modifies the -# collation graph to remove all 'base text' paths and replace them -# with real witness paths. +# constructed from a base text and an apparatus. We have the +# sequences of readings and just need to add path edges. -sub walk_and_expand_base { - my( $self, $end ) = @_; +sub make_witness_paths { + my( $self ) = @_; my @common_readings; foreach my $wit ( @{$self->tradition->witnesses} ) { - my $sig = $wit->sigil; - $DB::single = 1 if $sig eq 'Vb5'; - my $post_sig; - $post_sig = $wit->post_correctione - if $wit->has_post_correctione; - - my @wit_path = $self->reading_sequence( $self->start, $end, $sig ); - $wit->path( \@wit_path ); - $self->connect_readings_for_witness( $wit ); - @common_readings = _find_common( \@common_readings, \@wit_path ); - - # If there is a post-correctio, get its path and compare. - # Add a correction range for each divergence. - if( $post_sig ) { - my @corr_wit_path = $self->reading_sequence( $self->start, $end, - "$sig$post_sig", $sig ); - - # Map ante-corr readings to their indices - my %in_orig; - my $i = 0; - map { $in_orig{$_->name} = $i++ } @wit_path; - - # Look for divergences - my $diverged = 0; - my $last_common; - my @correction; - foreach my $rdg ( @corr_wit_path ) { - if( exists( $in_orig{$rdg->name} ) && !$diverged ) { - # We are reading the same here - $last_common = $in_orig{$rdg->name}; - } elsif ( exists( $in_orig{$rdg->name} ) ) { - # We have been diverging but are reading the same again. - # Add the correction to the witness. - my $offset = $last_common + 1; - my $length = $in_orig{$rdg->name} - $offset; - $wit->add_correction( $offset, $length, @correction ); - $diverged = 0; - @common_readings = _remove_common( \@common_readings, \@correction ); - @correction = (); - $last_common = $in_orig{$rdg->name}; - } elsif( $diverged ) { - # We are in the middle of a divergence. - push( @correction, $rdg ); - } else { - # We have started to diverge. Note it. - $diverged = 1; - push( @correction, $rdg ); - } - } - # Add any divergence that is at the end of the text - if( $diverged ) { - $wit->add_correction( $last_common+1, $#wit_path, \@correction ); + $self->make_witness_path( $wit ); + @common_readings = _find_common( \@common_readings, $wit->path ); + + # If we have pre-corrected readings, we need to add paths + # for those as well. + if( $wit->has_ante_corr ) { + my @path = @{$wit->path}; + foreach my $ac ( @{$wit->ante_corr} ) { + # my( $offset, $length, $items ) = @$ac; + # Figure out where the path needs to start and + # end its divergence. + my $start = $ac->[0] - 1; + my $end = $ac->[0] + $ac->[1]; + my @chain; + push( @chain, $path[$start] ); + push( @chain, @{$ac->[2]} ); + push( @chain, $path[$end] ); + $self->make_path_uncorrection( $wit->sigil, @chain ); } + @common_readings = _find_common( \@common_readings, + $wit->uncorrected_path ); } } - - # Remove any 'base text' paths. - foreach my $path ( $self->paths ) { - $self->del_path( $path ) - if $path->label eq $self->baselabel; - } + return @common_readings; } -sub connect_readings_for_witness { +sub make_witness_path { my( $self, $wit ) = @_; my @chain = @{$wit->path}; + $self->connect_readings_for_witness( $wit->sigil, @chain ); +} + +sub make_path_uncorrection { + my( $self, $sig, @chain ) = @_; + $sig .= ' (a.c.)'; + $self->connect_readings_for_witness( $sig, @chain ); +} + +sub connect_readings_for_witness { + my( $self, $sig, @chain ) = @_; foreach my $idx ( 0 .. $#chain-1 ) { - $self->add_path( $chain[$idx], $chain[$idx+1], $wit->sigil ); + $self->add_path( $chain[$idx], $chain[$idx+1], $sig ); } } + sub common_readings { my $self = shift; my @common = grep { $_->is_common } $self->readings(); @@ -656,8 +630,8 @@ sub calculate_positions { foreach my $wit ( @{$self->tradition->witnesses} ) { print STDERR "Calculating positions in " . $wit->sigil . "\n"; _update_positions_from_path( $wit->path, @ordered_common ); - _update_positions_from_path( $wit->corrected_path, @ordered_common ) - if $wit->has_post_correctione; + _update_positions_from_path( $wit->uncorrected_path, @ordered_common ) + if $wit->has_ante_corr; } # DEBUG @@ -676,7 +650,6 @@ sub _update_positions_from_path { # that corresponds to its eventual position identifier. Common # nodes always start a new row, and are thus always in the first # column. - my $wit_matrix = []; my $cn = 0; # We should hit the common readings in order. my $row = []; @@ -703,6 +676,8 @@ sub _update_positions_from_path { foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) { my $reading = $wit_matrix->[$li-1]->[$di-1]; my $position = "$li,$di"; + $DB::single = 1 unless ref( $reading ) eq 'Text::Tradition::Collation::Reading'; + # If we have seen this node before, we need to compare # its position with what went before. unless( $reading->has_position && diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index eb9fd74..ae5235c 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -40,12 +40,14 @@ Takes an initialized graph and a set of options, which must include: =cut +my $DETRANSPOSE = 0; sub parse { my( $tradition, %opts ) = @_; my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; load( $format_mod ); my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); + $DETRANSPOSE = 1 if $opts{'linear'}; merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries ); } @@ -76,10 +78,10 @@ underscore in its name. =cut -my $SHORT = undef; # Debug var - set this to limit the number of lines parsed +my $SHORT = 25; # Debug var - set this to limit the number of lines parsed my %base_text_index; -my $edits_required; +my $edits_required = {}; # edits_required -> wit -> [ { start_idx, end_idx, items } ] @@ -88,6 +90,7 @@ sub merge_base { my @base_line_starts = read_base( $base_file, $collation ); my %all_witnesses; + my @unwitnessed_lemma_nodes; foreach my $app ( @app_entries ) { my( $line, $num ) = split( /\./, $app->{_id} ); # DEBUG with a short graph @@ -164,26 +167,27 @@ sub merge_base { # terms of a splice operation, for each subsequent reading. # We also note which witnesses take the given edit. - my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end ); + my @lemma_set = $collation->reading_sequence( $lemma_start, + $lemma_end ); my @reading_sets = [ @lemma_set ]; # For each reading that is not rdg_0, we create the variant # reading nodes, and store the range as an edit operation on # the base text. my $variant_objects; - my %pc_lemma; # Keep track of mss that have been corrected back to lemma - my %pc_variant; # Keep track of mss with other corrections + my %pc_seen; # Keep track of mss with explicit post-corr data foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { my @mss = grep { $app->{$_} eq $k } keys( %$app ); + push( @unwitnessed_lemma_nodes, @lemma_set ) + if !@mss && $k eq 'rdg_0'; + # Keep track of what witnesses we have seen. @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); - my $pc_hash = $k eq 'rdg_0' ? \%pc_lemma : \%pc_variant; - # Keep track of which witnesses bear corrected readings here. foreach my $m ( @mss ) { my $base = _is_post_corr( $m ); next unless $base; - $pc_hash->{$base} = 1; + $pc_seen{$base} = 1; } next if $k eq 'rdg_0'; @@ -209,6 +213,7 @@ sub merge_base { # Now collate and collapse the identical readings within the # collated sets. Modifies the reading sets that were passed. + $DB::single = 1 if "$line.$num" eq '16.2'; collate_variants( $collation, @reading_sets ); # Now create the splice-edit objects that will be used @@ -228,12 +233,12 @@ sub merge_base { _add_hash_entry( $edits_required, $pc_key, $edit_object ); } else { # this is an ante-corr witness my $pc_key = $ms . "_post"; - _add_hash_entry( $edits_required, $_, $edit_object ); - unless( !$pc_lemma{$ms} && !$pc_variant{$ms} ) { - # If this witness carries no correction, add this same object - # to its post-corrected state. - # TODO combine these hashes? - _add_hash_entry( $edits_required, $pc_key, $edit_object ); + _add_hash_entry( $edits_required, $ms, $edit_object ); + unless( $pc_seen{$ms} ) { + # If this witness carries no correction, add this + # same object to its post-corrected state. + _add_hash_entry( $edits_required, $pc_key, + $edit_object ); } } } @@ -241,26 +246,36 @@ sub merge_base { } # Finished going through the apparatus entries # Now make the witness objects, and create their text sequences - foreach my $w ( grep { $_ !~ /_base$/ } keys %$edits_required ) { + foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) { my $witness_obj = $collation->tradition->add_witness( sigil => $w ); - my @ante_corr_seq = apply_edits( $edits_required->{$w} ); - my @post_corr_seq = apply_edits( $edits_required->{$w."_post"} ) + my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} ); + my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} ) if exists( $edits_required->{$w."_post"} ); - # Now how to save these paths in my witness object? + # Now save these paths in my witness object if( @post_corr_seq ) { - $witness_obj->add_path( @post_corr_seq ); - $witness_obj->add_uncorrected_path( @ante_corr_seq ); + $witness_obj->path( \@post_corr_seq ); + my @ante_corr = make_witness_uncorrections( \@post_corr_seq, + \@ante_corr_seq ); + $witness_obj->ante_corr( \@ante_corr ); } else { - $witness_obj->add_path( @ante_corr_seq ); + $witness_obj->path( \@ante_corr_seq ); } } - # TODO Now remove all the 'base text' links. + # Now remove our 'base text' edges, which is to say, the only + # ones we have created so far. Also remove any nodes that didn't + # appear in any witnesses. + foreach ( $collation->paths() ) { + $collation->del_path( $_ ); + } + foreach( @unwitnessed_lemma_nodes ) { + $collation->del_reading( $_ ); + } # Now walk paths and calculate positions. my @common_readings = - $collation->walk_and_expand_base( $collation->reading( '#END#' ) ); + $collation->make_witness_paths(); $collation->calculate_positions( @common_readings ); } @@ -282,10 +297,11 @@ sub read_base { # This array gives the first reading for each line. We put the # common starting point in line zero. my $last_reading = $collation->start(); + $base_text_index{$last_reading->name} = 0; my $lineref_array = [ $last_reading ]; # There is no line zero. open( BASE, $base_file ) or die "Could not open file $base_file: $!"; - my $i = 0; + my $i = 1; while() { # Make the readings, and connect them up for the base, but # also save the first reading of each line in an array for the @@ -321,6 +337,7 @@ sub read_base { my $endpoint = $collation->add_reading( '#END#' ); $collation->add_path( $last_reading, $endpoint, $collation->baselabel ); push( @$lineref_array, $endpoint ); + $base_text_index{$endpoint->name} = $i; return( @$lineref_array ); } @@ -342,7 +359,6 @@ TODO: Handle collapsed and non-collapsed transpositions. sub collate_variants { my( $collation, @reading_sets ) = @_; - # my $detranspose = 1; # TODO handle merging transposed nodes # Merge the nodes across the sets so that there is only one node # for any given reading. Use diff to identify the 'same' nodes. @@ -354,28 +370,47 @@ sub collate_variants { while( @reading_sets ) { my $variant_set = shift @reading_sets; - my $diff = Algorithm::Diff->new( \@unique, $variant_set, \&_collation_hash ); - my @new_unique; - push( @new_unique, @unique ); - while( $diff->Next ) { - if( $diff->Same ) { - # merge the nodes - my @l = $diff->Items( 1 ); - my @v = $diff->Items( 2 ); - foreach my $i ( 0 .. $#l ) { - $collation->merge_readings( $l[$i], $v[$i] ); + if( $DETRANSPOSE ) { + # Use diff to do this job + my $diff = Algorithm::Diff->new( \@unique, $variant_set, + {'keyGen' => \&_collation_hash} ); + my @new_unique; + while( $diff->Next ) { + if( $diff->Same ) { + # merge the nodes + my @l = $diff->Items( 1 ); + my @v = $diff->Items( 2 ); + foreach my $i ( 0 .. $#l ) { + $collation->merge_readings( $l[$i], $v[$i] ); + } + # splice the lemma nodes into the variant set + my( $offset ) = $diff->Get( 'min2' ); + splice( @$variant_set, $offset, scalar( @l ), @l ); + push( @new_unique, @l ); + } else { + # Keep the old unique readings + push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 ); + # Add the new readings to the 'unique' list + push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 ); } - # splice the lemma nodes into the variant set - splice( @$variant_set, $diff->Get( 'min2' ), scalar( @l ), @l ); - push( @new_unique, @l ); - } else { - # Keep the old unique readings - push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 ); - # Add the new readings to the 'unique' list - push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 ); } + @unique = @new_unique; + } else { + # It becomes a much simpler job + $DB::single = 1; + my @distinct; + foreach my $idx ( 0 .. $#{$variant_set} ) { + my $vw = $variant_set->[$idx]; + my @same = grep { cmp_str( $_ ) eq $vw->label } @unique; + if( @same ) { + $collation->merge_readings( $same[0], $vw ); + $variant_set->[$idx] = $same[0]; + } else { + push( @distinct, $vw ); + } + } + push( @unique, @distinct ); } - @unique = @new_unique; } return; @@ -384,12 +419,14 @@ sub collate_variants { sub _collation_hash { my $node = shift; - return _cmp_str( $node->label ); + return cmp_str( $node ); } sub apply_edits { - my $edit_sequence = shift; - my @lemma_text = map { $base_text_index{$_} } sort( keys %base_text_index ); + my( $collation, $edit_sequence ) = @_; + my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} } + keys %base_text_index; + my @lemma_text = map { $collation->reading( $_ ) } @lemma_names; my $drift = 0; foreach my $correction ( @$edit_sequence ) { @@ -398,9 +435,26 @@ sub apply_edits { splice( @lemma_text, $realoffset, $length, @$items ); $drift += @$items - $length; } - return \@lemma_text; + return @lemma_text; } +sub make_witness_uncorrections { + my( $path, $uncorr_path ) = @_; + my $diff = Algorithm::Diff->new( $path, $uncorr_path, + { 'keyGen' => \&_collation_hash } ); + # We basically just want to make a bunch of splice arguments that + # will reconstruct the ante-corr text from the post-corr. + my @diff_list; + while( $diff->Next ) { + next if $diff->Same; + my( $offset ) = $diff->Get( 'min1' ); + my $length = scalar( $diff->Items( 1 ) ); + my $items = []; push( @$items, $diff->Items( 2 ) ); + push( @diff_list, [ $offset, $length, $items ] ); + } + return @diff_list; +} + # Helper function. Given a witness sigil, if it is a post-correctione # sigil,return the base witness. If not, return a false value. @@ -412,6 +466,16 @@ sub _is_post_corr { return undef; } +sub _add_hash_entry { + my( $hash, $key, $entry ) = @_; + if( exists $hash->{$key} ) { + push( @{$hash->{$key}}, $entry ); + } else { + $hash->{$key} = [ $entry ]; + } +} + + =item B Pretend you never saw this method. Really it needs to not be hardcoded. diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index f570365..145eba4 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -32,25 +32,19 @@ has 'path' => ( predicate => 'has_path', ); -has 'post_correctione' => ( - is => 'rw', - isa => 'Str', - predicate => 'has_post_correctione', - ); - subtype 'Correction', as 'ArrayRef', where { @{$_} == 3 && - $_->[0]->isa( 'Int' ) && - $_->[1]->isa( 'Int' ) && - $_->[2]->isa( 'ArrayRef[Text::Tradition::Collation::Reading]' ); + find_type_constraint('Int')->check( $_->[0] ) && + find_type_constraint('Int')->check( $_->[1] ) && + find_type_constraint('ArrayRef[Text::Tradition::Collation::Reading]')->check( $_->[2] ); }, message { 'Correction must be a tuple of [offset, length, list]' }; -has 'corrections' => ( - is => 'ro', +has 'ante_corr' => ( + is => 'rw', isa => 'ArrayRef[Correction]', - default => sub { [] }, + predicate => 'has_ante_corr', ); @@ -85,20 +79,14 @@ around text => sub { $self->$orig( @_ ); }; -sub add_correction { - my( $self, $offset, $length, @replacement ) = @_; - # Rely on Moose for type checking of the arguments - push( @{$self->corrections}, [ $offset, $length, \@replacement ] ); -} - -sub corrected_path { +sub uncorrected_path { my $self = shift; my @new_path; push( @new_path, @{$self->path} ); my $drift = 0; - foreach my $correction ( @{$self->corrections} ) { - my( $offset, $length, $items ) = @$correction; + foreach my $change ( @{$self->ante_corr} ) { + my( $offset, $length, $items ) = @$change; my $realoffset = $offset + $drift; splice( @new_path, $realoffset, $length, @$items ); $drift += @$items - $length; diff --git a/script/svg_from_csv.pl b/script/svg_from_csv.pl index 3907ca6..97e2741 100644 --- a/script/svg_from_csv.pl +++ b/script/svg_from_csv.pl @@ -11,6 +11,7 @@ use Text::Tradition; my $tradition = Text::Tradition->new( 'CSV' => $ARGV[0], 'base' => $ARGV[1], + 'linear' => 1, );