From: Tara L Andrews Date: Tue, 27 Dec 2011 01:07:16 +0000 (+0100) Subject: various bugfixes, getting real traditions to parse X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=49d4f2accb29737fdbaa42f75062d2055c4bc2ef;p=scpubgit%2Fstemmatology.git various bugfixes, getting real traditions to parse --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 8609c96..17eed6a 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -135,7 +135,6 @@ around del_reading => sub { if( ref( $arg ) eq 'Text::Tradition::Collation::Reading' ) { $arg = $arg->id; } - # Remove the reading from the graphs. $self->sequence->delete_vertex( $arg ); $self->relations->delete_vertex( $arg ); @@ -151,7 +150,7 @@ sub merge_readings { # We only need the IDs for adding paths to the graph, not the reading # objects themselves. - my( $kept, $deleted ) = $self->_stringify_args( @_ ); + my( $kept, $deleted, $combine_char ) = $self->_stringify_args( @_ ); # The kept reading should inherit the paths and the relationships # of the deleted reading. @@ -159,6 +158,7 @@ sub merge_readings { my @vector = ( $kept ); push( @vector, $path->[1] ) if $path->[0] eq $deleted; unshift( @vector, $path->[0] ) if $path->[1] eq $deleted; + next if $vector[0] eq $vector[1]; # Don't add a self loop my %wits = %{$self->sequence->get_edge_attributes( @$path )}; $self->sequence->add_edge( @vector ); my $fwits = $self->sequence->get_edge_attributes( @vector ); @@ -168,6 +168,7 @@ sub merge_readings { foreach my $rel ( $self->relations->edges_at( $deleted ) ) { my @vector = ( $kept ); push( @vector, $rel->[0] eq $deleted ? $rel->[1] : $rel->[0] ); + next if $vector[0] eq $vector[1]; # Don't add a self loop # Is there a relationship here already? If so, keep it. # TODO Warn about conflicting relationships next if $self->relations->has_edge( @vector ); @@ -178,6 +179,12 @@ sub merge_readings { } # Do the deletion deed. + if( $combine_char ) { + my $kept_obj = $self->reading( $kept ); + my $new_text = join( $combine_char, $kept_obj->text, + $self->reading( $deleted )->text ); + $kept_obj->alter_text( $new_text ); + } $self->del_reading( $deleted ); } @@ -209,13 +216,20 @@ sub add_path { sub del_path { my $self = shift; + my @args; + if( ref( $_[0] ) eq 'ARRAY' ) { + my $e = shift @_; + @args = ( @$e, @_ ); + } else { + @args = @_; + } # We only need the IDs for adding paths to the graph, not the reading # objects themselves. - my( $source, $target, $wit ) = $self->_stringify_args( @_ ); + my( $source, $target, $wit ) = $self->_stringify_args( @args ); if( $self->sequence->has_edge_attribute( $source, $target, $wit ) ) { - $self->sequence->del_edge_attribute( $source, $target, $wit ); + $self->sequence->delete_edge_attribute( $source, $target, $wit ); } unless( keys %{$self->sequence->get_edge_attributes( $source, $target )} ) { $self->sequence->delete_edge( $source, $target ); @@ -262,11 +276,11 @@ sub add_relationship { # Check the options if( !defined $options->{'type'} || - $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|repetition|transposition)$/i ) { + $options->{'type'} !~ /^(spelling|orthographic|grammatical|meaning|lookalike|repetition|transposition)$/i ) { my $t = $options->{'type'} ? $options->{'type'} : ''; - return( undef, "Invalid or missing type" . $options->{'type'} ); + return( undef, "Invalid or missing type " . $options->{'type'} ); } - if( $options->{'type'} =~ /^(spelling|orthographic|grammatical|meaning)$/ ) { + unless ( $options->{'type'} =~ /^(repetition|transposition)$/ ) { $options->{'colocated'} = 1; } @@ -305,10 +319,10 @@ sub relationship_valid { # to a path loop for any witness. First make a lookup table of all the # readings related to either the source or the target. my @proposed_related = ( $source, $target ); - push( @proposed_related, $source->related_readings( 'colocated' ) ); - push( @proposed_related, $target->related_readings( 'colocated' ) ); + push( @proposed_related, $self->related_readings( $source, 'colocated' ) ); + push( @proposed_related, $self->related_readings( $target, 'colocated' ) ); my %pr_ids; - map { $pr_ids{ $_->id } = 1 } @proposed_related; + map { $pr_ids{ $_ } = 1 } @proposed_related; # None of these proposed related readings should have a neighbor that # is also in proposed_related. @@ -784,7 +798,6 @@ sub reading_sequence { $seen{$n->id} = 1; my $next = $self->next_reading( $n, $witness, $backup ); - $DB::single = 1 if $next->id eq $end->id; unless( $next ) { warn "Did not find any path for $witness from reading " . $n->id; last; @@ -952,6 +965,7 @@ sub calculate_ranks { foreach my $n ( $self->sequence->successors( $r->id ) ) { my( $tfrom, $tto ) = ( $rel_containers{$r->id}, $rel_containers{$n} ); + $DB::single = 1 unless $tfrom && $tto; $topo_graph->add_edge( $tfrom, $tto ); } } diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 9c30ea2..d0d0385 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -78,6 +78,7 @@ has 'text' => ( is => 'ro', isa => 'Str', required => 1, + writer => 'alter_text', ); has 'is_start' => ( diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index 545ca6d..a5ab34a 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -89,7 +89,6 @@ sub merge_base { my @base_line_starts = read_base( $base_file, $collation ); my %all_witnesses; - my @unwitnessed_lemma_nodes; foreach my $app ( @app_entries ) { my( $line, $num ) = split( /\./, $app->{_id} ); # DEBUG with a short graph @@ -113,12 +112,12 @@ sub merge_base { my %seen; while( $lemma_start ne $too_far ) { # Loop detection - if( $seen{ $lemma_start->name() } ) { - warn "Detected loop at " . $lemma_start->name() . + if( $seen{ $lemma_start->id() } ) { + warn "Detected loop at " . $lemma_start->id() . ", ref $line,$num"; last; } - $seen{ $lemma_start->name() } = 1; + $seen{ $lemma_start->id() } = 1; # Try to match the lemma. my $unmatch = 0; @@ -178,11 +177,6 @@ sub merge_base { foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { my @mss = grep { $app->{$_} eq $k } keys( %$app ); - # Keep track of lemma nodes that don't actually appear in - # any MSS; we will want to remove them from the collation. - push( @unwitnessed_lemma_nodes, @lemma_set ) - if !@mss && $k eq 'rdg_0'; - # Keep track of what witnesses we have seen. @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); # Keep track of which witnesses bear corrected readings here. @@ -202,8 +196,9 @@ sub merge_base { my $ctr = 0; foreach my $vw ( @variant ) { my $vwname = "$k/$line.$num.$ctr"; $ctr++; - my $vwreading = $collation->add_reading( $vwname ); - $vwreading->text( $vw ); + my $vwreading = $collation->add_reading( { + 'id' => $vwname, + 'text' => $vw } ); push( @variant_readings, $vwreading ); } @@ -226,7 +221,7 @@ sub merge_base { foreach my $rkey ( keys %$variant_objects ) { # Object is argument list for splice, so: # offset, length, replacements - my $edit_object = [ $lemma_start->name, + my $edit_object = [ $lemma_start->id, scalar( @lemma_set ), $variant_objects->{$rkey}->{reading} ]; foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) { @@ -278,11 +273,7 @@ sub merge_base { # ones we have created so far. Also remove any unwitnessed # lemma nodes (TODO unless we are treating base as witness) foreach ( $collation->paths() ) { - $collation->del_path( $_ ); - } - foreach( @unwitnessed_lemma_nodes ) { - $collation->del_reading( $_ ); - # TODO do we need to delete any relationship paths here? + $collation->del_path( $_, $collation->baselabel ); } ### HACKY HACKY Do some one-off path corrections here. @@ -291,6 +282,13 @@ sub merge_base { # Now walk paths and calculate positional rank. $collation->make_witness_paths(); + # Now delete any orphaned readings. + foreach my $r ( $collation->sequence->isolated_vertices ) { + print STDERR "Deleting unconnected reading $r / " . + $collation->reading( $r )->text . "\n"; + $collation->del_reading( $r ); + } + KUL::HACK::post_path_hack( $collation ); # Have to check relationship validity at this point, because before that # we had no paths. @@ -298,7 +296,7 @@ sub merge_base { # next unless $rel->equal_rank; # unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) { # warn sprintf( "Relationship type %s between %s and %s is invalid, deleting", -# $rel->type, $rel->from->name, $rel->to->name ); +# $rel->type, $rel->from->id, $rel->to->id ); # } # } $collation->calculate_ranks(); @@ -321,8 +319,8 @@ sub read_base { # This array gives the first reading for each line. We put the # common starting point in line zero. - my $last_reading = $collation->start(); - $base_text_index{$last_reading->name} = 0; + my $last_reading = $collation->start; + $base_text_index{$last_reading->id} = 0; my $lineref_array = [ $last_reading ]; # There is no line zero. open( BASE, $base_file ) or die "Could not open file $base_file: $!"; @@ -340,8 +338,7 @@ sub read_base { last if $SHORTEND && $lineref > $SHORTEND; foreach my $w ( @words ) { my $readingref = join( ',', $lineref, ++$wordref ); - my $reading = $collation->add_reading( $readingref ); - $reading->text( $w ); + my $reading = $collation->add_reading( { id => $readingref, text => $w } ); unless( $started ) { push( @$lineref_array, $reading ); $started = 1; @@ -361,7 +358,7 @@ sub read_base { # Ending point for all texts $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); push( @$lineref_array, $collation->end ); - $base_text_index{$collation->end->name} = $i; + $base_text_index{$collation->end->id} = $i; return( @$lineref_array ); } @@ -389,14 +386,14 @@ sub set_relationships { $labels{cmp_str( $r )} = $r; } foreach my $r( @$var ) { - if( exists $labels{$r->label} && - $r->name ne $labels{$r->label}->name ) { + if( exists $labels{$r->text} && + $r->id ne $labels{$r->text}->id ) { if( $type eq 'repetition' ) { # Repetition - $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); + $collation->add_relationship( $r, $labels{$r->text}, \%rel_options ); } else { # Transposition - $r->set_identical( $labels{$r->label} ); + $r->set_identical( $labels{$r->text} ); } } } @@ -437,15 +434,15 @@ sub set_relationships { sub apply_edits { my( $collation, $edit_sequence, $debug ) = @_; - my @lemma_text = $collation->reading_sequence( $collation->start, - $collation->reading( '#END#' ) ); + my @lemma_text = $collation->reading_sequence( + $collation->start, $collation->end ); my $drift = 0; foreach my $correction ( @$edit_sequence ) { my( $lemma_start, $length, $items ) = @$correction; my $offset = $base_text_index{$lemma_start}; my $realoffset = $offset + $drift; if( $debug || - $lemma_text[$realoffset]->name ne $lemma_start ) { + $lemma_text[$realoffset]->id ne $lemma_start ) { my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; my @base_phrase; my $i = $realoffset; @@ -458,23 +455,23 @@ sub apply_edits { print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . "with %s (%s) with drift %d\n", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), + join( ' ', map {$_->text} @base_phrase ), + join( ' ', map {$_->id} @base_phrase ), $realoffset, - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), + join( ' ', map {$_->text} @$items ), + join( ' ', map {$_->id} @$items ), $drift, ) if $debug; - if( $lemma_text[$realoffset]->name ne $lemma_start ) { + if( $lemma_text[$realoffset]->id ne $lemma_start ) { warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . "but %s (%s) is there instead", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), - join( ' ', map {$_->label} @this_phrase ), - join( ' ', map {$_->name} @this_phrase ), + join( ' ', map {$_->text} @base_phrase ), + join( ' ', map {$_->id} @base_phrase ), + join( ' ', map {$_->text} @$items ), + join( ' ', map {$_->id} @$items ), + join( ' ', map {$_->text} @this_phrase ), + join( ' ', map {$_->id} @this_phrase ), ) ); # next; } diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 7123d4d..6618a73 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -92,7 +92,6 @@ sub parse { } my %node_data = %$n; my $gnode_args = { - 'collation' => $collation, 'id' => delete $node_data{$IDKEY}, 'text' => delete $node_data{$CONTENTKEY}, }; diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 62f9f1f..a1725d6 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -138,7 +138,6 @@ sub parse { my $use_version; print STDERR "Setting graph globals\n"; $tradition->name( $graph_data->{'name'} ); - $DB::single = 1; foreach my $gkey ( keys %{$graph_data->{'global'}} ) { my $val = $graph_data->{'global'}->{$gkey}; if( $gkey eq 'version' ) { diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index fa29702..1e64680 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -364,7 +364,6 @@ sub _return_rdg { foreach my $sig ( keys %$text ) { next if $active_wits{$sig}; my $l = $tradition->collation->add_reading( { - 'collation' => $tradition->collation, 'id' => $current_app . "_$i", 'is_lacuna' => 1 } ); $i++; @@ -380,7 +379,6 @@ sub _return_rdg { foreach my $i ( 0 .. $#cur_wits ) { my $w = $cur_wits[$i]; my $l = $tradition->collation->add_reading( { - 'collation' => $tradition->collation, 'id' => $current_app . "_$i", 'is_lacuna' => 1 } ); push( @{$text->{$w}}, $l ); @@ -472,8 +470,7 @@ sub _get_sigla { } } my $rdg = $graph->add_reading( - { 'collation' => $graph, - 'id' => $xml_id, + { 'id' => $xml_id, 'text' => $word } ); $used_nodeids{$xml_id} = $rdg; diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index 4c1e511..b70285f 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -157,7 +157,6 @@ sub parse { $l = $c->reading( $l_id ); } else { $l = $c->add_reading( { - 'collation' => $c, 'id' => $l_id, 'is_lacuna' => 1, } ); @@ -198,7 +197,6 @@ sub make_nodes { my $ctr = 1; foreach my $w ( keys %unique ) { my $rargs = { - 'collation' => $collation, 'id' => "$index,$ctr", 'rank' => $index, 'text' => $w,