From: Tara L Andrews Date: Sat, 29 Sep 2012 01:48:22 +0000 (+0200) Subject: we cannot save coderefs, so stop trying; self parser fixes for new relationship regime X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=c7bd27689f556898fbfa99084139779736085d57;p=scpubgit%2Fstemmatology.git we cannot save coderefs, so stop trying; self parser fixes for new relationship regime --- diff --git a/base/lib/Text/Tradition/Collation/RelationshipStore.pm b/base/lib/Text/Tradition/Collation/RelationshipStore.pm index 2834a13..923214a 100644 --- a/base/lib/Text/Tradition/Collation/RelationshipStore.pm +++ b/base/lib/Text/Tradition/Collation/RelationshipStore.pm @@ -146,16 +146,15 @@ has '_equivalence_readings' => ( sub BUILD { my $self = shift; - my $regularize = sub { - return $_[0]->can('regularize') ? $_[0]->regularize : $_[0]->text; }; - my @DEFAULT_TYPES = ( { name => 'collated', bindlevel => 50, is_weak => 1, is_transitive => 0, is_generalizable => 0 }, - { name => 'orthographic', bindlevel => 0 }, - { name => 'spelling', bindlevel => 1, record_sub => $regularize }, - { name => 'punctuation', bindlevel => 2, record_sub => $regularize }, - { name => 'grammatical', bindlevel => 2, record_sub => $regularize }, - { name => 'lexical', bindlevel => 2, record_sub => $regularize }, + { name => 'orthographic', bindlevel => 0, use_regular => 0 }, + { name => 'spelling', bindlevel => 1 }, + { name => 'punctuation', bindlevel => 2 }, + { name => 'grammatical', bindlevel => 2 }, + { name => 'lexical', bindlevel => 2 }, + { name => 'uncertain', bindlevel => 50, is_transitive => 0, is_generalizable => 0 }, + { name => 'other', bindlevel => 50, is_transitive => 0, is_generalizable => 0 }, { name => 'transposition', bindlevel => 50, is_colocation => 0, is_transitive => 0 }, { name => 'repetition', bindlevel => 50, is_colocation => 0, is_transitive => 0 } ); @@ -165,9 +164,6 @@ sub BUILD { } } -sub _regular_form { -} - around add_type => sub { my $orig = shift; my $self = shift; @@ -258,6 +254,7 @@ sub create { $rel = Text::Tradition::Collation::Relationship->new( $options ); my $reltype = $self->type( $rel->type ); + throw( "Unrecognized relationship type " . $rel->type ) unless $reltype; # Validate the options given against the relationship type wanted throw( "Cannot set nonlocal scope on relationship of type " . $reltype->name ) if $rel->nonlocal && !$reltype->is_generalizable; @@ -445,6 +442,11 @@ try { ok( 0, "Failed to add normal transposition complement: " . $e->message ); } +# TODO Test 4: make a global relationship that involves re-ranking a node first, when +# the prior rank has a potential match too +my $t4 = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/globalrel_test.xml' ); + + =end testing =cut @@ -459,8 +461,9 @@ sub add_relationship { if( $sourceobj->is_meta || $targetobj->is_meta ); my $relationship; my $reltype; - my $thispaironly; + my $thispaironly = delete $options->{thispaironly}; my $droppedcolls = []; + $DB::single = 1 if $source eq 'r796.3' && $target eq 'r796.4'; if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) { $relationship = $options; $reltype = $self->type( $relationship->type ); @@ -475,8 +478,8 @@ sub add_relationship { $reltype = $self->type( $options->{type} ); # Try to create the relationship object. - my $rdga = $reltype->record_sub->( $sourceobj ); - my $rdgb = $reltype->record_sub->( $targetobj ); + my $rdga = $reltype->regularize( $sourceobj ); + my $rdgb = $reltype->regularize( $targetobj ); $options->{'orig_a'} = $sourceobj; $options->{'orig_b'} = $targetobj; $options->{'reading_a'} = $rdga; @@ -606,11 +609,11 @@ sub _find_applicable { my $reltype = $self->type( $rel->type ); my @vectors; my @identical_readings; - @identical_readings = grep { $reltype->record_sub->( $_ ) eq $rel->reading_a } + @identical_readings = grep { $reltype->regularize( $_ ) eq $rel->reading_a } $c->readings; foreach my $ir ( @identical_readings ) { my @itarget; - @itarget = grep { $reltype->record_sub->( $_ ) eq $rel->reading_b } + @itarget = grep { $reltype->regularize( $_ ) eq $rel->reading_b } $c->readings_at_rank( $ir->rank ); if( @itarget ) { # Warn if there is more than one hit with no closer link between them. @@ -680,7 +683,8 @@ sub relationship_valid { my $reltype = $self->type( $rel ); ## Assume validity is okay if we are initializing from scratch. return ( 1, "initializing" ) unless $c->tradition->_initialized; - ## TODO Move this block to relationship type definition + ## TODO Move this block to relationship type definition when we can save + ## coderefs if ( $rel eq 'transposition' || $rel eq 'repetition' ) { # Check that the two readings do (for a repetition) or do not (for # a transposition) appear in the same witness. @@ -1007,7 +1011,7 @@ sub test_equivalence { $self->equivalence_graph->delete_edge( $teq, $succ ) if $added_succ{$succ}; } unless( $self->equivalence_graph->eq( $checkstr ) ) { - warn "GRAPH CHANGED after testing"; + throw( "GRAPH CHANGED after testing" ); } # Return our answer return $ret; @@ -1271,6 +1275,14 @@ sub _add_graphml_data { $data_el->appendText( $value ); } +sub dump_segment { + my( $self, $from, $to ) = @_; + open( DUMP, ">debug.svg" ) or die "Could not open debug.svg"; + binmode DUMP, ':utf8'; + print DUMP $self->collation->as_svg({ from => $from, to => $to, nocalc => 1 }); + close DUMP; +} + sub throw { Text::Tradition::Error->throw( 'ident' => 'Relationship error', diff --git a/base/lib/Text/Tradition/Collation/RelationshipType.pm b/base/lib/Text/Tradition/Collation/RelationshipType.pm index 98b17f6..5320ebe 100644 --- a/base/lib/Text/Tradition/Collation/RelationshipType.pm +++ b/base/lib/Text/Tradition/Collation/RelationshipType.pm @@ -37,15 +37,16 @@ for the internal 'collated' relationship, only to be used by parsers. =item * is_colocation - (Default true) Whether this relationship implies that the readings in question have parallel locations. -=item * is_transitive - (Default $self->is_colocation) Whether this -relationship type is transitive - that is, if A is related to B and C this -way, is B necessarily related to C? +=item * is_transitive - (Default 1) Whether this relationship type is +transitive - that is, if A is related to B and C this way, is B necessarily +related to C? -=item * is_generalizable - Whether this relationship can have a non-local -scope. +=item * is_generalizable - (Default is_colocation) Whether this +relationship can have a non-local scope. -=item * record_sub - A subroutine to canonify the reading text before -determining whether individual readings match. Defaults to no canonization. +=item * use_regular - (Default is_generalizable) Whether, when a +relationship has a non-local scope, the search should be made on the +regularized form of the reading. =back @@ -63,9 +64,9 @@ determining whether individual readings match. Defaults to no canonization. =head2 is_generalizable -=head2 record_sub +=head2 use_regular -See the option descriptions above. +See the option descriptions above. All attributes are read-only. =cut @@ -106,14 +107,77 @@ has 'is_generalizable' => ( default => sub { $_[0]->is_colocation } ); -has 'record_sub' => ( +# TODO I really want to make some configurable coderefs... + +has 'use_regular' => ( is => 'ro', - isa => 'CodeRef', - default => sub { sub { $_[0]->text } } + isa => 'Bool', + lazy => 1, + default => sub { $_[0]->is_generalizable } ); -# TODO Define extra validation conditions here +=head1 DEFAULTS + +This package provides the following set of relationships as default: + +=head2 orthographic: bindlevel => 0, use_regular => 0 + +The readings are orthographic variants of each other (e.g. upper vs. lower case letters.) If the Morphology plugin is in use, orthographically related readings should regularize to the same string. + +=head2 spelling: bindlevel => 1 + +The readings are spelling variations of the same word(s), e.g. 'color' vs. 'colour'. + +=head2 punctuation: bindlevel => 2 + +The readings are both punctuation markers. + +=head2 grammatical: bindlevel => 2 + +The readings are morphological variants of the same root word, e.g. 'was' vs. 'were'. + +=head2 lexical: bindlevel => 2 + +The readings have the same morphological function but different root words, e.g. '[they] worked' vs. '[they] played'. + +=head2 uncertain: bindlevel => 50, is_transitive => 0, is_generalizable => 0 + +The readings are (probably) related, but it is impossible to say for sure how. Useful for when one or both of the readings is itself uncertain. + +=head2 transposition: bindlevel => 50, is_colocation => 0 + +The readings are the same (or perhaps close variants), but the position has shifted across witnesses. + +=head2 repetition: bindlevel => 50, is_colocation => 0, is_transitive => 0 + +One of the readings is a repetition of the other, e.g. "pet the cat" vs. "pet the the cat". + +=head2 other: bindlevel => 50, is_transitive => 0, is_generalizable => 0 + +A catch-all relationship for cases not covered by the other relationship types. + +=head2 collated: bindlevel => 50, is_weak => 1, is_generalizable => 0 + +For internal use only. Denotes a parallel pair of variant readings as detected by an automatic collator. + +=head1 METHODS + +=head2 regularize( $reading ) + +Given a Reading object, return the regular form of the reading text that this +relationship type expects. + +=cut +# TODO Define extra validation conditions here when we can store coderefs + +sub regularize { + my( $self, $rdg ) = @_; + if( $self->use_regular && $rdg->can('regularize') ) { + return $rdg->regularize; + } + return $rdg->text; +} no Moose; __PACKAGE__->meta->make_immutable; diff --git a/base/lib/Text/Tradition/Parser/Self.pm b/base/lib/Text/Tradition/Parser/Self.pm index b9af342..2e5425d 100644 --- a/base/lib/Text/Tradition/Parser/Self.pm +++ b/base/lib/Text/Tradition/Parser/Self.pm @@ -265,8 +265,19 @@ sub parse { # Nodes are added via the call to add_reading above. We only need # add the relationships themselves. # TODO check that scoping does trt + $tradition->_init_done( 1 ); # so that relationships get validated $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels - foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) { + # Backward compatibility... + if( $use_version eq '2.0' || $use_version eq '3.0' ) { + foreach my $e ( @{$rel_data->{'edges'}} ) { + delete $e->{'class'}; + $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'}; + } + } + + my $rg = $collation->relations; + foreach my $e ( sort { _apply_relationship_order( $a, $b, $rg ) } + @{$rel_data->{'edges'}} ) { my $sourceid = exists $namechange{$e->{'source'}->{'id'}} ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; my $targetid = exists $namechange{$e->{'target'}->{'id'}} @@ -276,11 +287,6 @@ sub parse { delete $e->{'source'}; delete $e->{'target'}; # The remaining keys are relationship attributes. - # Backward compatibility... - if( $use_version eq '2.0' || $use_version eq '3.0' ) { - delete $e->{'class'}; - $e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'}; - } # Add the specified relationship unless we already have done. my $rel_exists; if( $e->{'scope'} ne 'local' ) { @@ -288,12 +294,16 @@ sub parse { if( $relobj && $relobj->scope eq $e->{'scope'} && $relobj->type eq $e->{'type'} ) { $rel_exists = 1; + } else { + # Don't propagate the relationship; all the propagations are + # already in the XML. + $e->{'thispaironly'} = 1; } } try { $collation->add_relationship( $from, $to, $e ) unless $rel_exists; - } catch( Text::Tradition::Error $e ) { - warn "DROPPING $from -> $to: " . $e->message; + } catch( Text::Tradition::Error $err ) { + warn "DROPPING " . $e->{type} . " rel on $from -> $to: " . $err->message; } } @@ -302,19 +312,19 @@ sub parse { $collation->text_from_paths(); } -## Return the relationship that comes first in priority. -my %LAYERS = ( - 'collated' => 1, - 'orthographic' => 2, - 'spelling' => 3, - ); - -sub _layersort_rel { - my( $a, $b ) = @_; - my $key = exists $a->{'type'} ? 'type' : 'relationship'; - my $at = $LAYERS{$a->{$key}} || 99; - my $bt = $LAYERS{$b->{$key}} || 99; - return $at <=> $bt; +# Helper sort function for applying the saved relationships in a +# sensible order. +sub _apply_relationship_order { + my( $a, $b, $rg ) = @_; + my $at = $rg->type( $a->{type} ); my $bt = $rg->type( $b->{type} ); + # Apply strong relationships before weak + return -1 if $bt->is_weak && !$at->is_weak; + return 1 if $at->is_weak && !$bt->is_weak; + # Apply local before global + return -1 if $a->{scope} eq 'local' && $b->{scope} ne 'local'; + return 1 if $b->{scope} eq 'local' && $a->{scope} ne 'local'; + # Apply more tightly bound relationships first + return $at->bindlevel <=> $bt->bindlevel; } 1; @@ -325,10 +335,6 @@ sub _layersort_rel { =item * Make this into a stream parser with GraphML -=item * Simply field -> attribute correspondence for nodes and edges - -=item * Share key name constants with Collation.pm - =back =head1 LICENSE diff --git a/base/t/data/globalrel_test.xml b/base/t/data/globalrel_test.xml new file mode 100644 index 0000000..255a8e8 --- /dev/null +++ b/base/t/data/globalrel_test.xml @@ -0,0 +1,1387 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + digraph stemma { 12 [ class=hypothetical ]; 15 [ class=hypothetical ]; 17 [ class=hypothetical ]; 19 [ class=hypothetical ]; 2 [ class=hypothetical ]; 20 [ class=hypothetical ]; 22 [ class=hypothetical ]; 23 [ class=hypothetical ]; 24 [ class=hypothetical ]; 27 [ class=hypothetical ]; 3 [ class=hypothetical ]; 30 [ class=hypothetical ]; 4 [ class=hypothetical ]; 6 [ class=hypothetical ]; A [ class=extant ]; Ab [ class=extant ]; Ac [ class=extant ]; Ad [ class=extant ]; Ae [ class=extant ]; B [ class=extant ]; Ba [ class=extant ]; Bb [ class=extant ]; Bd [ class=extant ]; Be [ class=extant ]; C [ class=extant ]; Ca [ class=extant ]; Cb [ class=extant ]; Cc [ class=extant ]; Cd [ class=extant ]; Ce [ class=extant ]; Cf [ class=extant ]; E [ class=extant ]; F [ class=extant ]; G [ class=extant ]; H [ class=extant ]; K [ class=extant ]; L [ class=extant ]; M [ class=extant ]; N [ class=extant ]; O [ class=extant ]; P [ class=extant ]; R [ class=extant ]; S [ class=extant ]; T [ class=extant ]; V [ class=extant ]; W [ class=extant ]; X [ class=extant ]; Z [ class=extant ]; 12 -> Be; 12 -> Ca; 12 -> F; 15 -> 17; 15 -> 20; 15 -> N; 17 -> C; 17 -> Cd; 17 -> E; 19 -> 22; 19 -> Ad; 19 -> Cb; 19 -> Z; 20 -> H; 20 -> X; 2 -> 12; 2 -> 15; 2 -> 19; 22 -> 23; 22 -> 30; 22 -> Ac; 23 -> Cc; 23 -> G; 24 -> A; 24 -> K; 24 -> L; 2 -> 6; 27 -> A; 27 -> M; 30 -> Ab; 30 -> Ce; 30 -> R; 3 -> 4; 3 -> W; 4 -> Ae; 4 -> S; 4 -> T; 6 -> 3; 6 -> Ba; 6 -> Be; 6 -> Ca; 6 -> O; 6 -> P; 6 -> T; 6 -> V; Ac -> 24; Ac -> 27; Ac -> B; Ac -> Cf; Be -> Bb; Be -> Bd;} + Default + 3.2 + Heinrichi artificial + 1 + + #END# + 1 + __END__ + 11 + + + #START# + 1 + __START__ + 0 + + + Default + Sanoi + r455.2 + 1 + + + Default + sanoi + r455.3 + 1 + + + Default + herra + r456.2 + 2 + + + Default + pyhä + r456.3 + 2 + + + Default + Henärickus + r457.10 + 3 + + + Default + Henärickns + r457.11 + 3 + + + Default + heinäricki + r457.12 + 3 + + + Default + heinärickus + r457.13 + 3 + + + Default + Heinrickus + r457.2 + 3 + + + Default + Heinäricki + r457.3 + 3 + + + Default + Heinrich + r457.4 + 3 + + + Default + Heinärikens + r457.5 + 3 + + + Default + Heinrichi + r457.6 + 3 + + + Default + Heinäricille + r457.7 + 3 + + + Default + Henäricki + r457.8 + 3 + + + Default + Heinärickus + r457.9 + 3 + + + Default + Ericuksen + r458.10 + 4 + + + Default + ericuksell + r458.2 + 4 + + + Default + Eijrikillen + r458.3 + 4 + + + Default + erickillen + r458.4 + 4 + + + Default + Erjkellen + r458.5 + 4 + + + Default + erickjllen + r458.6 + 4 + + + Default + Erjkillen + r458.7 + 4 + + + Default + ericillen + r458.8 + 4 + + + Default + Erijkillen + r458.9 + 4 + + + Default + welje + r459.2 + 5 + + + Default + veljelleen + r460.2 + 6 + + + Default + weljellensä + r460.4 + 6 + + + Default + weliellensä + r460.5 + 6 + + + Default + läckömme + r461.1 + 7 + + + Default + Lähkämme + r461.3 + 7 + + + Default + läckäm + r461.4 + 7 + + + Default + läckämme + r461.5 + 7 + + + Default + Läckämme + r461.6 + 7 + + + Default + lähtekämme + r461.7 + 7 + + + Default + Lächkämme + r461.8 + 7 + + + Default + Hämehen + r462.2 + 8 + + + Default + me + r462.3 + 8 + + + Default + hämehen + r463.2 + 8 + + + Default + maallen + r463.3 + 9 + + + Default + Hämehen + r463.4 + 9 + + + Default + tavastjan + r463.5 + 9 + + + Default + Tavastian + r463.6 + 9 + + + Default + maallen + r464.2 + 10 + + + T + + + S + + + Ac + + + Ab + + + Ad + + + Ae + + + B + + + Ba + + + Bb + + + Bd + + + Be + + + C + + + Ca + + + Cb + + + Cc + + + Cd + + + Ce + + + Cf + + + Da + + + E + + + F + + + H + + + I + + + J + + + K + + + L + + + M + + + N + + + O + + + P + + + R + + + G + + + A + + + G + + + Be + + + Cb + + + Ce + + + A + + + Ab + + + Ac + + + Ad + + + Ae + + + B + + + Ba + + + Bb + + + Bd + + + C + + + Ca + + + Cc + + + Cd + + + Cf + + + Da + + + E + + + F + + + H + + + I + + + J + + + K + + + L + + + M + + + N + + + O + + + P + + + R + + + S + + + T + + + Da + + + O + + + Ad + + + Ae + + + Bd + + + Cf + + + K + + + L + + + N + + + R + + + S + + + F + + + Bb + + + G + + + M + + + Ba + + + E + + + P + + + T + + + C + + + A + + + Ab + + + Ac + + + B + + + H + + + I + + + J + + + Cd + + + Ca + + + Cc + + + Be + + + Ce + + + Cb + + + Ba + + + P + + + E + + + T + + + O + + + Cc + + + Ca + + + Cb + + + Bb + + + M + + + A + + + Ab + + + Ac + + + B + + + H + + + I + + + J + + + G + + + C + + + F + + + Cd + + + Da + + + Be + + + N + + + Ae + + + Bd + + + Ce + + + Cf + + + K + + + L + + + R + + + S + + + Ad + + + F + + + E + + + Ad + + + Ca + + + Da + + + Cb + + + A + + + C + + + Ab + + + Ac + + + Ae + + + B + + + Ba + + + Bb + + + Bd + + + Ce + + + Cf + + + G + + + H + + + I + + + J + + + K + + + L + + + M + + + O + + + P + + + R + + + S + + + Cc + + + Be + + + Cd + + + N + + + A + + + F + + + E + + + Cc + + + Bd + + + Ab + + + Ac + + + Ad + + + Ba + + + Bb + + + Be + + + H + + + J + + + K + + + L + + + M + + + N + + + P + + + R + + + S + + + A + + + Ae + + + B + + + Ca + + + Cb + + + Cd + + + Da + + + I + + + O + + + G + + + Cf + + + Ce + + + C + + + Ce + + + Cf + + + Cc + + + A + + + Ae + + + B + + + Cd + + + Da + + + E + + + I + + + O + + + Ca + + + Cb + + + Ab + + + Ac + + + Ad + + + Ba + + + Bb + + + Be + + + C + + + H + + + J + + + K + + + L + + + M + + + N + + + P + + + R + + + S + + + T + + + G + + + F + + + Bd + + + Ab + + + A + + + Ca + + + Cb + + + Cc + + + Ce + + + A + + + Ab + + + Ac + + + Ad + + + Ae + + + B + + + Ba + + + Bb + + + Bd + + + Be + + + C + + + Cd + + + Cf + + + Da + + + E + + + H + + + I + + + J + + + K + + + L + + + M + + + N + + + O + + + P + + + R + + + S + + + T + + + F + + + G + + + L + + + M + + + N + + + P + + + R + + + A + + + Ab + + + Ac + + + B + + + Bd + + + Be + + + Ca + + + Cc + + + Cd + + + Ce + + + K + + + Ae + + + Ba + + + Bb + + + C + + + Cb + + + Cf + + + Da + + + E + + + F + + + G + + + H + + + I + + + J + + + S + + + Ad + + + O + + + T + + + + + __END__ + + + __START__ + + + r457.2 + + + r457.3 + + + r457.4 + + + r457.5 + + + r457.6 + + + r457.7 + + + r457.8 + + + r457.9 + + + r458.10 + + + r458.2 + + + r455.2 + + + r458.3 + + + r458.4 + + + r458.5 + + + r458.6 + + + r458.7 + + + r458.8 + + + r458.9 + + + r459.2 + + + r460.2 + + + r460.4 + + + r455.3 + + + r460.5 + + + r461.1 + + + r461.3 + + + r461.4 + + + r461.5 + + + r461.6 + + + r461.7 + + + r461.8 + + + r462.2 + + + r462.3 + + + r456.2 + + + r463.2 + + + r463.3 + + + r463.4 + + + r463.5 + + + r463.6 + + + r464.2 + + + r456.3 + + + r457.10 + + + r457.11 + + + r457.12 + + + r457.13 + + + veljelleen + weljellensä + 0 + collated + local + + + weljellensä + weliellensä + 0 + collated + local + + + maallen + Hämehen + 0 + collated + local + + + Hämehen + tavastjan + 0 + collated + local + + + Hämehen + Tavastian + 0 + collated + local + + +