From: Tara L Andrews Date: Fri, 3 Jun 2011 22:39:08 +0000 (+0200) Subject: make relationships work, add some hacks for Boodts collation X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b15511bfc5546fb21b191921506253f89f19465a;p=scpubgit%2Fstemmatology.git make relationships work, add some hacks for Boodts collation --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 82aae47..3d43a84 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -2,8 +2,10 @@ package Text::Tradition::Collation; use Graph::Easy; use IPC::Run qw( run binary ); -use Text::Tradition::Collation::Reading; use Text::Tradition::Collation::Path; +use Text::Tradition::Collation::Reading; +use Text::Tradition::Collation::Relationship; +use Text::Tradition::Collation::Segment; use XML::LibXML; use Moose; @@ -18,6 +20,7 @@ has 'graph' => ( reading => 'node', path => 'edge', readings => 'nodes', + segments => 'nodes', paths => 'edges', relationships => 'edges', }, @@ -119,6 +122,7 @@ around add_path => sub { return; } # Make sure the proposed path does not yet exist + # NOTE 'reading' will currently return readings and segments my( $source, $target, $wit ) = @_; $source = $self->reading( $source ) unless ref( $source ) eq 'Text::Tradition::Collation::Reading'; @@ -138,14 +142,28 @@ around paths => sub { my $orig = shift; my $self = shift; - my @result = grep { $_->class eq 'edge.path' } $self->$orig( @_ ); + my @result = grep { $_->sub_class eq 'path' } $self->$orig( @_ ); return @result; }; around relationships => sub { my $orig = shift; my $self = shift; - my @result = grep { $_->class eq 'edge.relationship' } $self->$orig( @_ ); + my @result = grep { $_->sub_class eq 'relationship' } $self->$orig( @_ ); + return @result; +}; + +around readings => sub { + my $orig = shift; + my $self = shift; + my @result = grep { $_->sub_class ne 'segment' } $self->$orig( @_ ); + return @result; +}; + +around segments => sub { + my $orig = shift; + my $self = shift; + my @result = grep { $_->sub_class eq 'segment' } $self->$orig( @_ ); return @result; }; @@ -165,38 +183,39 @@ sub has_path { my( $self, $source, $target, $label ) = @_; my @paths = $source->edges_to( $target ); my @relevant = grep { $_->label eq $label } @paths; - return scalar @paths; + return scalar @relevant; +} + +## Dealing with groups of readings, i.e. segments. + +sub add_segment { + my( $self, @items ) = @_; + my $segment = Text::Tradition::Collation::Segment->new( 'members' => \@items ); + return $segment; } ## Dealing with relationships between readings. This is a different ## sort of graph edge. sub add_relationship { - my( $self, $type, $source, $target, $global ) = @_; + my( $self, $source, $target, $options ) = @_; # Make sure there is not another relationship between these two - # readings already + # readings or segments already $source = $self->reading( $source ) - unless ref( $source ) eq 'Text::Tradition::Collation::Reading'; + unless ref( $source ) && $source->isa( 'Graph::Easy::Node' ); $target = $self->reading( $target ) - unless ref( $target ) eq 'Text::Tradition::Collation::Reading'; + unless ref( $target ) && $target->isa( 'Graph::Easy::Node' ); foreach my $rel ( $source->edges_to( $target ) ) { - if( $rel->label eq $type && $rel->class eq 'edge.relationship' ) { + if( $rel->label eq $options->{'type'} && $rel->class eq 'edge.relationship' ) { return; } } + $options->{'orig_relation'} = [ $source, $target ]; - my $rel = Text::Tradition::Collation::Relationship->new( - 'sort' => $type, - 'global' => $global, - 'orig_relation' => [ $source, $target ], - ); - print STDERR sprintf( "Setting relationship %s between readings %s (%s)" - . " and %s (%s)\n", $type, - $source->label, $source->name, - $target->label, $target->name ); + my $rel = Text::Tradition::Collation::Relationship->new( %$options ); $self->graph->add_edge( $source, $target, $rel ); - if( $global ) { + if( $options->{'global'} ) { # Look for all readings with the source label, and if there are # colocated readings with the target label, join them too. foreach my $r ( $self->readings() ) { @@ -206,11 +225,7 @@ sub add_relationship { if( @colocated ) { warn "Multiple readings with same label at same position!" if @colocated > 1; - my $dup_rel = Text::Tradition::Collation::Relationship->new( - 'sort' => $type, - 'global' => $global, - 'orig_relation' => [ $source, $target ], - ); + my $dup_rel = Text::Tradition::Collation::Relationship->new( %$options ); $self->graph->add_edge( $r, $colocated[0], $dup_rel ); } } @@ -271,7 +286,10 @@ sub as_dot { 11, "white", "filled", $self->graph->get_attribute( 'node', 'shape' ) ); foreach my $reading ( $self->readings ) { + # Need not output nodes without separate labels next if $reading->name eq $reading->label; + # TODO output readings or segments, but not both + next if $reading->class eq 'node.segment'; $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ]\n", $reading->name, $reading->label ); } @@ -314,11 +332,9 @@ sub as_graphml { $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema ); # Add the data keys for nodes - my @node_data = ( 'name', 'reading', 'identical', 'position' ); - # HACKY HACKY HACK Relationship data my %node_data_keys; my $ndi = 0; - foreach my $datum ( @node_data ) { + foreach my $datum ( qw/ name reading identical position class / ) { $node_data_keys{$datum} = 'dn'.$ndi++; my $key = $root->addNewChild( $graphml_ns, 'key' ); $key->setAttribute( 'attr.name', $datum ); @@ -330,7 +346,7 @@ sub as_graphml { # Add the data keys for edges, i.e. witnesses my $edi = 0; my %edge_data_keys; - foreach my $edge_key( qw/ witness_main witness_ante_corr relationship / ) { + foreach my $edge_key( qw/ witness_main witness_ante_corr relationship class / ) { $edge_data_keys{$edge_key} = 'de'.$edi++; my $key = $root->addNewChild( $graphml_ns, 'key' ); $key->setAttribute( 'attr.name', $edge_key ); @@ -351,33 +367,31 @@ sub as_graphml { my $node_ctr = 0; my %node_hash; + # Add our readings to the graph foreach my $n ( sort { $a->name cmp $b->name } $self->readings ) { - my %this_node_data = (); - foreach my $datum ( @node_data ) { - my $key = $node_data_keys{$datum}; - if( $datum eq 'name' ) { - $this_node_data{$key} = $n->name; - } elsif( $datum eq 'reading' ) { - $this_node_data{$key} = $n->label; - } elsif( $datum eq 'identical' && $n->has_primary ) { - $this_node_data{$key} = $n->primary->name; - } elsif( $datum eq 'position' ) { - $this_node_data{$key} = $n->position; - } - } my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); my $node_xmlid = 'n' . $node_ctr++; $node_hash{ $n->name } = $node_xmlid; $node_el->setAttribute( 'id', $node_xmlid ); - - foreach my $dk ( keys %this_node_data ) { - my $d_el = $node_el->addNewChild( $graphml_ns, 'data' ); - $d_el->setAttribute( 'key', $dk ); - $d_el->appendText( $this_node_data{$dk} ); - } + _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); + _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label ); + _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position ); + _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); + _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name ) + if $n->has_primary; } - # Add the path edges + # Add any segments we have + foreach my $n ( sort { $a->name cmp $b->name } $self->segments ) { + my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); + my $node_xmlid = 'n' . $node_ctr++; + $node_hash{ $n->name } = $node_xmlid; + $node_el->setAttribute( 'id', $node_xmlid ); + _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); + _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); + } + + # Add the path, relationship, and segment edges my $edge_ctr = 0; foreach my $e ( sort { $a->from->name cmp $b->from->name } $self->graph->edges() ) { my( $name, $from, $to ) = ( 'e'.$edge_ctr++, @@ -387,7 +401,9 @@ sub as_graphml { $edge_el->setAttribute( 'source', $from ); $edge_el->setAttribute( 'target', $to ); $edge_el->setAttribute( 'id', $name ); - if( $e->class() eq 'edge.path' ) { + # Add the edge class + _add_graphml_data( $edge_el, $edge_data_keys{'class'}, $e->sub_class ); + if( $e->sub_class eq 'path' ) { # It's a witness path, so add the witness my $base = $e->label; my $key = $edge_data_keys{'witness_main'}; @@ -396,15 +412,11 @@ sub as_graphml { $base = $1; $key = $edge_data_keys{'witness_ante_corr'}; } - my $wit_el = $edge_el->addNewChild( $graphml_ns, 'data' ); - $wit_el->setAttribute( 'key', $key ); - $wit_el->appendText( $base ); - } else { + _add_graphml_data( $edge_el, $key, $base ); + } elsif( $e->sub_class eq 'relationship' ) { # It's a relationship - my $rel_el = $edge_el->addNewChild( $graphml_ns, 'data' ); - $rel_el->setAttribute( 'key', $edge_data_keys{'relationship'} ); - $rel_el->appendText( $e->label() ); - } + _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $e->label ); + } # else a segment, nothing to record but source, target, class } # Return the thing @@ -412,12 +424,12 @@ sub as_graphml { return $graphml->toString(1); } -sub _make_xml_attr { - my $str = shift; - $str =~ s/\s/_/g; - $str =~ s/\W//g; - $str =~ "a$str" if $str =~ /^\d/; - return $str; +sub _add_graphml_data { + my( $el, $key, $value ) = @_; + my $data_el = $el->addNewChild( $el->namespaceURI, 'data' ); + return unless defined $value; + $data_el->setAttribute( 'key', $key ); + $data_el->appendText( $value ); } sub collapse_graph_paths { @@ -434,7 +446,7 @@ sub collapse_graph_paths { my $majority = int( scalar( @{$self->tradition->witnesses} ) / 2 ) + 1; # But don't compress if there are only a few witnesses. $majority = 4 if $majority < 4; - foreach my $node( $self->readings ) { + foreach my $node ( $self->readings ) { my $newlabels = {}; # We will visit each node, so we only look ahead. foreach my $edge ( $node->outgoing() ) { @@ -716,6 +728,7 @@ sub make_witness_paths { @common_readings = _find_common( \@common_readings, $wit->path ); @common_readings = _find_common( \@common_readings, $wit->uncorrected_path ); } + map { $_->make_common } @common_readings; return @common_readings; } @@ -730,7 +743,7 @@ sub make_witness_path { foreach my $idx( 0 .. $#chain-1 ) { my $source = $chain[$idx]; my $target = $chain[$idx+1]; - $self->add_path( $source, $target, "$sig (a.c.)" ) + $self->add_path( $source, $target, $sig.$self->ac_label ) unless $self->has_path( $source, $target, $sig ); } } @@ -757,7 +770,7 @@ sub calculate_positions { print STDERR "Calculating positions in " . $wit->sigil . "\n"; _update_positions_from_path( $wit->path, @ordered_common ); _update_positions_from_path( $wit->uncorrected_path, @ordered_common ) - if $wit->has_uncorrected; + if $wit->has_ante_corr; } # DEBUG diff --git a/lib/Text/Tradition/Collation/Relationship.pm b/lib/Text/Tradition/Collation/Relationship.pm index 7001782..49ac1db 100644 --- a/lib/Text/Tradition/Collation/Relationship.pm +++ b/lib/Text/Tradition/Collation/Relationship.pm @@ -15,8 +15,8 @@ enum 'RelationshipType' => qw( spelling orthographic grammatical repetition lexi subtype 'RelationshipVector', => as 'ArrayRef', => where { @$_ == 2 - && $_->[0]->isa( 'Text::Tradition::Collation::Reading' ) - && $_->[1]->isa( 'Text::Tradition::Collation::Reading' ) + && $_->[0]->isa( 'Graph::Easy::Node' ) + && $_->[1]->isa( 'Graph::Easy::Node' ) }, message { 'Argument should be [ SourceReading, TargetReading ]' }; @@ -50,6 +50,16 @@ has 'global' => ( default => 0, ); +has 'non_correctable' => ( + is => 'rw', + isa => 'Bool', + ); + +has 'non_independent' => ( + is => 'rw', + isa => 'Bool', + ); + sub FOREIGNBUILDARGS { my $class = shift; my %args = @_; diff --git a/lib/Text/Tradition/Collation/Segment.pm b/lib/Text/Tradition/Collation/Segment.pm new file mode 100644 index 0000000..b3a6204 --- /dev/null +++ b/lib/Text/Tradition/Collation/Segment.pm @@ -0,0 +1,78 @@ +package Text::Tradition::Collation::Segment; + +use Moose; +use MooseX::NonMoose; + +extends 'Graph::Easy::Node'; + +# A segment is a special 'invisible' node that is a set of Readings. +# We should never display these, but it is useful to have them +# available for many-to-many relationship mappings. + +has 'members' => ( + is => 'rw', + isa => 'ArrayRef[Text::Tradition::Collation::Reading]', + required => 1, +); + +sub FOREIGNBUILDARGS { + my $class = shift; + my %args = @_; + + # Name the segment after its member elements. + my $nodename = join( ' ', map { $_->name } @{$args{'members'}} ); + return ( 'name', $nodename ); +} + +sub BUILD { + my( $self, $args ) = @_; + $self->set_attribute( 'class', 'segment' ); + + foreach my $r ( @{$self->members} ) { + my $seg_edge = $r->parent->add_edge( $r, $self, 'segment' ); + $seg_edge->set_attribute( 'class', 'segment' ); + } +} + +# For now, a segment has no position in the graph. Eventually it might +# have the position of its first member. +sub has_position { + return undef; +} + +no Moose; +__PACKAGE__->meta->make_immutable; + +1; + +###################################################### +## copied from Graph::Easy::Parser docs +###################################################### +# when overriding nodes, we also need ::Anon + +package Text::Tradition::Collation::Segment::Anon; +use Moose; +use MooseX::NonMoose; +extends 'Text::Tradition::Collation::Segment'; +extends 'Graph::Easy::Node::Anon'; +no Moose; +__PACKAGE__->meta->make_immutable; + +1; +# use base qw/Text::Tradition::Collation::Segment/; +# use base qw/Graph::Easy::Node::Anon/; + +###################################################### +# and :::Empty + +package Text::Tradition::Collation::Segment::Empty; +use Moose; +use MooseX::NonMoose; +extends 'Graph::Easy::Node::Empty'; +no Moose; +__PACKAGE__->meta->make_immutable; + +1; +# use base qw/Text::Tradition::Collation::Segment/; + +###################################################### diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index 4c2af19..ad963cf 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -76,7 +76,7 @@ underscore in its name. =cut -my $SHORTEND = 20; # Debug var - set this to limit the number of lines parsed +my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed my %base_text_index; my $edits_required = {}; @@ -252,19 +252,24 @@ sub merge_base { foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) { print STDERR "Creating witness $w\n"; my $witness_obj = $collation->tradition->add_witness( sigil => $w ); - my $debug = undef; # $w eq 'Vb10'; - my ( $text_seq, $ac ) = apply_edits( $collation, - $edits_required->{$w}, - $edits_required->{$w."_post"}, - $debug ); - - my @repeated = _check_for_repeated( @$text_seq ); - warn "Repeated elements @repeated in $w" + my $debug; # = $w eq 'Vb11'; + my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug ); + my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug ) + if exists( $edits_required->{$w."_post"} ); + + my @repeated = _check_for_repeated( @ante_corr_seq ); + warn "Repeated elements @repeated in $w a.c." + if @repeated; + @repeated = _check_for_repeated( @post_corr_seq ); + warn "Repeated elements @repeated in $w p.c." if @repeated; + # Now save these paths in my witness object - $witness_obj->path( $text_seq ); - if( $ac ) { - $witness_obj->uncorrected( $ac ); + if( @post_corr_seq ) { + $witness_obj->path( \@post_corr_seq ); + $witness_obj->uncorrected_path( \@ante_corr_seq ); + } else { + $witness_obj->path( \@ante_corr_seq ); } } @@ -278,6 +283,32 @@ sub merge_base { $collation->del_reading( $_ ); } + ### HACKY HACKY Do some one-off path corrections here. + if( $collation->linear ) { + # What? + } else { + my $c = $collation; + # Vb5: + my $path = $c->tradition->witness('Vb5')->path; + splice( @$path, 1436, 0, $c->reading('106,14') ); + # Vb11: + $path = $c->tradition->witness('Vb11')->path; + $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) ); + # Vb12 a.c.: + $path = $c->tradition->witness('Vb12')->uncorrected_path; + splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ); + # Vb13: + $path = $c->tradition->witness('Vb13')->path; + splice( @$path, 782, 0, $c->reading( '58,5' ) ); + # Vb20 a.c.: + $path = $c->tradition->witness('Vb20')->uncorrected_path; + splice( @$path, 1251, 1, $c->reading( '94,6' ) ); + # Vb26: + $path = $c->tradition->witness('Vb26')->path; + splice( @$path, 618, 0, $c->reading('46,2') ) + } + # Now walk paths and calculate positions. my @common_readings = $collation->make_witness_paths(); @@ -446,9 +477,9 @@ sub collate_nonlinearly { if( @same ) { foreach my $i ( 0 .. $#same ) { unless( $merged{$same[$i]->name} ) { - print STDERR sprintf( "Merging %s into %s\n", - $vw->name, - $same[$i]->name ); + #print STDERR sprintf( "Merging %s into %s\n", + # $vw->name, + # $same[$i]->name ); $collation->merge_readings( $same[$i], $vw ); $merged{$same[$i]->name} = 1; $matched = $i; @@ -475,13 +506,19 @@ sub set_relationships { my( $collation, $app, $lemma, $variants ) = @_; foreach my $rkey ( keys %$variants ) { my $var = $variants->{$rkey}->{'reading'}; - my $typekey = sprintf( "_%s_type", $rkey ); - my $type = $app->{$typekey}; + my $type = $app->{sprintf( "_%s_type", $rkey )}; + my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )}; + my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )}; + + my %rel_options = (); + $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/; + $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/; if( $type =~ /^(inv|tr|rep)$/i ) { # Transposition or repetition: look for nodes with the # same label but different IDs and mark them. $type = 'repetition' if $type =~ /^rep/i; + $rel_options{'sort'} = $type; my %labels; foreach my $r ( @$lemma ) { $labels{cmp_str( $r )} = $r; @@ -491,46 +528,37 @@ sub set_relationships { $r->name ne $labels{$r->label}->name ) { if( $type eq 'repetition' ) { # Repetition - $collation->add_relationship( $type, $r, $labels{$r->label} ); + $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); } else { # Transposition $r->set_identical( $labels{$r->label} ); } } } - } elsif( $type =~ /^(gr|sp(el)?)$/i ) { - # Grammar/spelling: this can be a one-to-one or one-to-many - # mapping. We should think about merging readings if it is - # one-to-many. + } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) { + + # Grammar/spelling/lexical: this can be a one-to-one or + # one-to-many mapping. We should think about merging + # readings if it is one-to-many. + $type = 'grammatical' if $type =~ /gr/i; $type = 'spelling' if $type =~ /sp/i; $type = 'repetition' if $type =~ /rep/i; + $type = 'lexical' if $type =~ /lex/i; + $rel_options{'sort'} = $type; if( @$lemma == @$var ) { foreach my $i ( 0 .. $#{$lemma} ) { - $collation->add_relationship( $type, $var->[$i], - $lemma->[$i] ); - } - } elsif ( @$lemma > @$var && @$var == 1 ) { - # Merge the lemma readings into one - ## TODO This is a bad solution. We need a real one-to-many - ## mapping. - my $ln1 = shift @$lemma; - foreach my $ln ( @$lemma ) { - $collation->merge_readings( $ln1, $ln, ' ' ); - } - $lemma = [ $ln1 ]; - $collation->add_relationship( $type, $var->[0], $lemma->[0] ); - } elsif ( @$lemma < @$var && @$lemma == 1 ) { - my $vn1 = shift @$var; - foreach my $vn ( @$var ) { - $collation->merge_readings( $vn1, $vn, ' ' ); - } - $var = [ $vn1 ]; - $collation->add_relationship( $type, $var->[0], $lemma->[0] ); + $collation->add_relationship( $var->[$i], $lemma->[$i], + \%rel_options ); + } } else { - warn "Cannot set $type relationship on a many-to-many variant"; + # An uneven many-to-many mapping. Make a segment out of + # whatever we have. + my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0]; + my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0]; + $collation->add_relationship( $varseg, $lemseg, \%rel_options ); } - } elsif( $type !~ /^(lex|add|om)$/i ) { + } elsif( $type !~ /^(add|om)$/i ) { warn "Unrecognized type $type"; } } @@ -539,62 +567,55 @@ sub set_relationships { sub apply_edits { - my( $collation, $edit_sequence, $corrected_edit_sequence, $debug ) = @_; - - # Index the ante- and post-correctione edits that we have, so that - # for each spot in the text we can apply the original witness - # state and then apply its corrected state, if applicable. - my $all_edits = {}; - foreach my $c ( @$edit_sequence ) { - my $lemma_index = $base_text_index{$c->[0]}; - $all_edits->{$lemma_index}->{'ac'} = $c; - # If the text carries no corrections, pc == ac. - $all_edits->{$lemma_index}->{'pc'} = $c - unless $corrected_edit_sequence; - } - foreach my $c ( @$corrected_edit_sequence ) { - my $lemma_index = $base_text_index{$c->[0]}; - $all_edits->{$lemma_index}->{'pc'} = $c; - } - + my( $collation, $edit_sequence, $debug ) = @_; my @lemma_text = $collation->reading_sequence( $collation->start, $collation->reading( '#END#' ) ); my $drift = 0; - my @ac_sequence; - foreach my $lemma_index ( sort keys %$all_edits ) { - my $ac = $all_edits->{$lemma_index}->{'ac'}; - my $pc = $all_edits->{$lemma_index}->{'pc'}; - my $realoffset = $lemma_index + $drift; - if( $ac && $pc && $ac eq $pc ) { - # No correction, just apply the edit - my( $lemma_start, $length, $items ) = @$pc; - splice( @lemma_text, $realoffset, $length, @$items ); - $drift += @$items + $length; - } elsif ( !$pc ) { - # Lemma text is unaltered, save a.c. as an 'uncorrection' - my( $lemma_start, $length, $items ) = @$ac; - push( @ac_sequence, [ $realoffset, $length, $items ] ); - } elsif ( !$ac ) { - # Apply the edit, save lemma text as an 'uncorrection' - my( $lemma_start, $length, $items ) = @$pc; - my @old = splice( @lemma_text, $realoffset, $length, @$items ); - $drift += @$items + $length; - push( @ac_sequence, [ $realoffset, scalar( @$items ), \@old ] ); - } else { - # Apply the p.c. edit, then save the a.c. edit as an - # 'uncorrection' on the p.c. text - my( $lemma_start, $length, $items ) = @$pc; - my @old = splice( @lemma_text, $realoffset, $length, @$items ); - $drift += @$items + $length; - push( @ac_sequence, [ $realoffset, scalar( @$items ), \@old ] ); + foreach my $correction ( @$edit_sequence ) { + my( $lemma_start, $length, $items ) = @$correction; + my $offset = $base_text_index{$lemma_start}; + my $realoffset = $offset + $drift; + if( $debug || + $lemma_text[$realoffset]->name ne $lemma_start ) { + my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; + my @base_phrase; + my $i = $realoffset; + my $l = $collation->reading( $lemma_start ); + while( $i < $realoffset+$length ) { + push( @base_phrase, $l ); + $l = $collation->next_reading( $l ); + $i++; + } + + print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . + "with %s (%s) with drift %d\n", + join( ' ', map {$_->label} @base_phrase ), + join( ' ', map {$_->name} @base_phrase ), + $realoffset, + join( ' ', map {$_->label} @$items ), + join( ' ', map {$_->name} @$items ), + $drift, + ) if $debug; + + if( $lemma_text[$realoffset]->name ne $lemma_start ) { + warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . + "but %s (%s) is there instead", + join( ' ', map {$_->label} @base_phrase ), + join( ' ', map {$_->name} @base_phrase ), + join( ' ', map {$_->label} @$items ), + join( ' ', map {$_->name} @$items ), + join( ' ', map {$_->label} @this_phrase ), + join( ' ', map {$_->name} @this_phrase ), + ) ); + # next; + } } + splice( @lemma_text, $realoffset, $length, @$items ); + $drift += @$items - $length; } - return( \@lemma_text, \@ac_sequence ); + return @lemma_text; } - -# sub _apply_sequence_splice { -# my( $collation, $sequence, $correction - + # Helper function. Given a witness sigil, if it is a post-correctione # sigil,return the base witness. If not, return a false value. diff --git a/lib/Text/Tradition/Parser/CSV.pm b/lib/Text/Tradition/Parser/CSV.pm index 93a321e..003936c 100644 --- a/lib/Text/Tradition/Parser/CSV.pm +++ b/lib/Text/Tradition/Parser/CSV.pm @@ -78,7 +78,7 @@ sub read { $apparatus->{'rdg_0'} = $linehash{'text'} if $linehash{'text'}; $apparatus->{'rdg_' . ++$rdg_ctr} = $linehash{'variant'}; foreach my $attr ( @fields[3..8] ) { - $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{$attr} if $linehash{$attr}; + $apparatus->{"_rdg_${rdg_ctr}_$attr"} = $linehash{$attr} if defined $linehash{$attr}; } foreach my $k ( @fields[10..$#fields] ) { @@ -88,6 +88,9 @@ sub read { $apparatus->{$k} = 'rdg_0' unless exists $apparatus->{$k}; } elsif ( $variant_rdg =~ /^1/ ) { + warn sprintf( "Already found variant reading %s for %s at %s!", + $apparatus->{$k}, $k, $apparatus->{_id} ) + if exists $apparatus->{$k} && $apparatus->{$k} ne 'rdg_0'; $apparatus->{$k} = 'rdg_' . $rdg_ctr; } else { # else for $, we don't list the MS warn "Unparsed variant indicator $variant_rdg for $k in " . diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index 2d1996a..656c185 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -2,19 +2,6 @@ package Text::Tradition::Witness; use Moose; use Moose::Util::TypeConstraints; -subtype 'Correction', - => as 'ArrayRef', - => where { return 0 unless @$_ == 3; - return 0 unless $_->[0] =~ /^\d+$/; - return 0 unless $_->[1] =~ /^\d+$/; - foreach my $x ( @{$_->[2]} ) { - return $0 unless $x->isa( 'Text::Tradition::Collation::Reading' ); - } - return 1; - }, - => message { "Correction must be ref of [ offset, length, replacement_list ]" }; - - # Sigil. Required identifier for a witness. has 'sigil' => ( is => 'ro', @@ -47,12 +34,10 @@ has 'path' => ( predicate => 'has_path', ); -# Uncorrection. This is an array of sets of reading nodes that show -# where the witness was corrected. -has 'uncorrected' => ( +has 'uncorrected_path' => ( is => 'rw', - isa => 'ArrayRef[Correction]', - predicate => 'has_uncorrected', + isa => 'ArrayRef[Text::Tradition::Collation::Reading]', + predicate => 'has_ante_corr', ); @@ -87,15 +72,5 @@ around text => sub { $self->$orig( @_ ); }; -sub uncorrected_path { - my $self = shift; - my @path; - push( @path, @{$self->path} ); - foreach my $corr ( @{$self->uncorrected} ) { - splice( @path, $corr->[0], $corr->[1], @{$corr->[2]} ); - } - return \@path; -} - no Moose; __PACKAGE__->meta->make_immutable; diff --git a/t/graph.t b/t/graph.t index 530d0b0..b6844f9 100644 --- a/t/graph.t +++ b/t/graph.t @@ -32,8 +32,9 @@ my @svg_edges = $svg_xpc->findnodes( '//svg:g[@class="edge"]' ); is( scalar @svg_edges, 32, "Correct number of edges in the graph" ); # Test for the correct common nodes -my @expected_nodes = map { [ $_, 1 ] } qw/ #START# n1 n5 n6 n7 n12 - n16 n19 n20 n27 /; +my @common_nodes = ( '#START#' ); +push( @common_nodes, qw/ n1 n5 n6 n7 n12 n16 n19 n20 n27 / ); +my @expected_nodes = map { [ $_, 1 ] } @common_nodes; foreach my $idx ( qw/2 3 4 8 10 11 13 16 17 18/ ) { splice( @expected_nodes, $idx, 0, [ "node_null", undef ] ); } @@ -69,6 +70,11 @@ sub make_text { return join( ' ', @words ); } +# Test that the common nodes are marked common +foreach my $cn ( @common_nodes ) { + ok( $collation->reading( $cn )->is_common, "Node $cn is marked common" ); +} + # Test the manuscript paths my $wit_a = '# when april with his showers sweet with fruit the drought of march has pierced unto the root #'; my $wit_b = '# when showers sweet with april fruit the march of drought has pierced to the root #'; @@ -177,4 +183,45 @@ subtest 'Turned on another node after singleton switchoff' => \&compare_active; $string = '# when ... ... showers sweet with ... fruit ... march of ... has pierced unto ... ... #'; is( make_text( @active_nodes ), $string, "Got the right text" ); +# Now start testing some position identifiers +# 2. 'april with his' have no colocated +# 3. 'april' 2 has no colocated +# 4. 'teh' and 'the' +# 5. 'drought' & 'march' +# 6. 'march' & 'drought' +# 7. 'unto' 'the' 'root'... +# 'unto can match 'to' or 'teh' +# 'the' can match 'teh' or 'rood' +# 'root' can mach 'rood' + +foreach my $cn ( @common_nodes ) { + my $cnr = $collation->reading( $cn ); + is( scalar( $collation->same_position_as( $cnr ) ), 0, "Node $cn has no colocations" ); +} + +my %expected_colocations = ( + 'n2' => [], # april + 'n3' => [], # with + 'n4' => [], # his + 'n11' => [], # april + 'n8' => [ 'n13' ], # teh -> the + 'n13' => [ 'n8' ], # the -> teh + 'n14' => [ 'n15' ], # drought -> march + 'n18' => [ 'n17' ], # drought -> march + 'n17' => [ 'n18' ], # march -> drought + 'n15' => [ 'n14' ], # march -> drought + 'n21' => [ 'n9', 'n22' ], # unto -> to, teh + 'n22' => [ 'n9', 'n21' ], # to -> unto, teh + 'n9' => [ 'n21', 'n22', 'n23' ], # teh -> unto, to, the + 'n23' => [ 'n9', 'n25' ], # the -> teh, rood + 'n25' => [ 'n9', 'n26' ], # rood -> the, root + 'n26' => [ 'n25' ], # root -> rood +); + +foreach my $n ( keys %expected_colocations ) { + my $nr = $collation->reading( $n ); + my @colocated = sort( map { $_->name } $collation->same_position_as( $nr ) ); + is_deeply( \@colocated, $expected_colocations{$n}, "Colocated nodes for $n correct" ); +} + done_testing();