From: Tara L Andrews Date: Fri, 20 Jan 2012 23:51:20 +0000 (+0100) Subject: start using witness->text and ->layertext for consistency checking X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=b0b4421ad99abdb67e413f16219e932576212e50;hp=ea4e683f30880670fb4f892cacf7d846e8f52636;p=scpubgit%2Fstemmatology.git start using witness->text and ->layertext for consistency checking --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index ee8e9da..10d215e 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -558,9 +558,13 @@ sub as_dot { my $label = $self->path_display_label( $self->path_witnesses( $edge ) ); my $variables = { %edge_attrs, 'label' => $label }; # Account for the rank gap if necessary - my $rankgap = $self->reading( $edge->[1] )->rank + if( $self->reading( $edge->[1] )->has_rank + && $self->reading( $edge->[0] )->has_rank + && $self->reading( $edge->[1] )->rank + - $self->reading( $edge->[0] )->rank > 1 ) { + $variables->{'minlen'} = $self->reading( $edge->[1] )->rank - $self->reading( $edge->[0] )->rank; - $variables->{'minlen'} = $rankgap if $rankgap > 1; + } my $varopts = _dot_attr_string( $variables ); $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n", $edge->[0], $edge->[1], $varopts ); @@ -954,6 +958,7 @@ used wherever no path exists for $sigil or $backup. =cut # TODO Think about returning some lazy-eval iterator. +# TODO Get rid of backup; we should know from what witness is whether we need it. sub reading_sequence { my( $self, $start, $end, $witness, $backup ) = @_; @@ -1072,8 +1077,25 @@ sub _witnesses_of_label { my $regex = $self->wit_list_separator; my @answer = split( /\Q$regex\E/, $label ); return @answer; -} +} + +=head2 path_text( $sigil, $mainsigil [, $start, $end ] ) + +Returns the text of a witness (plus its backup, if we are using a layer) +as stored in the collation. The text is returned as a string, where the +individual readings are joined with spaces and the meta-readings (e.g. +lacunae) are omitted. Optional specification of $start and $end allows +the generation of a subset of the witness text. +=cut + +sub path_text { + my( $self, $wit, $backup, $start, $end ) = @_; + $start = $self->start unless $start; + $end = $self->end unless $end; + my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit, $backup ); + return join( ' ', map { $_->text } @path ); +} =head1 INITIALIZATION METHODS diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 5dfa552..746fd5f 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -122,20 +122,7 @@ around BUILDARGS => sub { } else { $args = { @_ }; } - - # Did we get a JSON token to parse into a reading? If so, massage it. - if( exists $args->{'json'} ) { - my $j = delete $args->{'json'}; - - # If we have separated punctuation, restore it. - if( exists $j->{'punctuation'} ) { - $args->{'text'} = _restore_punct( $j->{'t'}, $j->{'punctuation'} ); - } else { - $args->{'text'} = $j->{'t'}; - # we don't use comparison or canonical forms yet - } - } - + # If one of our special booleans is set, we change the text and the # ID to match. if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) { @@ -154,15 +141,6 @@ around BUILDARGS => sub { $class->$orig( $args ); }; -# Utility function for parsing JSON from nCritic -sub _restore_punct { - my( $word, @punct ) = @_; - foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @punct ) { - substr( $word, $p->{pos}, 0, $p->{char} ); - } - return $word; -} - =head2 is_meta A meta attribute (ha ha), which should be true if any of our 'special' diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index a5ab34a..5744542 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -47,7 +47,7 @@ sub parse { load( $format_mod ); # TODO Handle a string someday if we ever have a format other than KUL my @apparatus_entries = $format_mod->can('read')->( $opts ); - merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries ); + merge_base( $tradition->collation, $opts, @apparatus_entries ); } =item B @@ -85,8 +85,8 @@ my $edits_required = {}; # edits_required -> wit -> [ { start_idx, end_idx, items } ] sub merge_base { - my( $collation, $base_file, @app_entries ) = @_; - my @base_line_starts = read_base( $base_file, $collation ); + my( $collation, $opts, @app_entries ) = @_; + my @base_line_starts = read_base( $opts->{'base'}, $collation ); my %all_witnesses; foreach my $app ( @app_entries ) { @@ -168,7 +168,7 @@ sub merge_base { my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end ); my @reading_sets = [ @lemma_set ]; - + # For each reading that is not rdg_0, we create the variant # reading nodes, and store the range as an edit operation on # the base text. @@ -277,9 +277,11 @@ sub merge_base { } ### HACKY HACKY Do some one-off path corrections here. - require( 'data/boodts/s158.HACK' ); - KUL::HACK::pre_path_hack( $collation ); - + if( $opts->{'input'} eq 'KUL' ) { + require 'data/boodts/s158.HACK'; + KUL::HACK::pre_path_hack( $collation ); + } + # Now walk paths and calculate positional rank. $collation->make_witness_paths(); # Now delete any orphaned readings. @@ -289,7 +291,7 @@ sub merge_base { $collation->del_reading( $r ); } - KUL::HACK::post_path_hack( $collation ); + KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL'; # Have to check relationship validity at this point, because before that # we had no paths. # foreach my $rel ( $collation->relationships ) { @@ -299,7 +301,7 @@ sub merge_base { # $rel->type, $rel->from->id, $rel->to->id ); # } # } - $collation->calculate_ranks(); + # $collation->calculate_ranks(); } =item B diff --git a/lib/Text/Tradition/Parser/JSON.pm b/lib/Text/Tradition/Parser/JSON.pm index 913a21b..3e30651 100644 --- a/lib/Text/Tradition/Parser/JSON.pm +++ b/lib/Text/Tradition/Parser/JSON.pm @@ -76,6 +76,24 @@ if( $t ) { is( scalar $t->witnesses, 3, "Collation has all witnesses" ); } +my %seen_wits; +map { $seen_wits{$_} = 0 } qw/ A B C /; +# Check that we have the right witnesses +foreach my $wit ( $t->witnesses ) { + $seen_wits{$wit->sigil} = 1; +} +is( scalar keys %seen_wits, 3, "No extra witnesses were made" ); +foreach my $k ( keys %seen_wits ) { + ok( $seen_wits{$k}, "Witness $k still exists" ); +} + +# Check that the witnesses have the right texts +foreach my $wit ( $t->witnesses ) { + my $origtext = join( ' ', @{$wit->text} ); + my $graphtext = $t->collation->path_text( $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); +} + =end testing =cut @@ -95,9 +113,22 @@ sub parse { push( @witnesses, $wit ); my $aclabel = $c->ac_label; if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { - $ac_wits{$1} = $wit; + $ac_wits{$sigil} = $1; } } + + # Save the original witness text for consistency checking. We do this + # in a separate loop to make sure we have all base witnesses defined, + # and to make sure that our munging and comparing later doesn't affect + # the original text. + foreach my $intext ( @{$table->{'alignment'}} ) { + my $rs = $intext->{'witness'}; + my $is_layer = exists $ac_wits{$rs}; + my $wit = $tradition->witness( $is_layer ? $ac_wits{$rs} : $rs ); + my @tokens = grep { $_ && $_->{'t'} !~ /^\#.*\#$/ } @{$intext->{'tokens'}}; + my @words = map { _restore_punct( $_ ) } @tokens; + $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); + } # Create the readings in each row my $length = exists $table->{'length'} @@ -138,9 +169,9 @@ sub parse { # Fold any a.c. witnesses into their main witness objects, and # delete the independent a.c. versions. foreach my $a ( keys %ac_wits ) { - my $main_wit = $tradition->witness( $a ); + my $ac_wit = $tradition->witness( $a ); + my $main_wit = $tradition->witness( $ac_wits{$a} ); next unless $main_wit; - my $ac_wit = $ac_wits{$a}; $main_wit->uncorrected_path( $ac_wit->path ); $tradition->del_witness( $ac_wit ); } @@ -166,17 +197,17 @@ sub make_nodes { my @readings; foreach my $j ( 0 .. $#tokens ) { if( $tokens[$j] ) { - my $t = $tokens[$j]; + my $word = _restore_punct( $tokens[$j] ); my $rdg; - if( exists( $unique{$t->{'t'}} ) ) { - $rdg = $unique{$t->{'t'}}; + if( exists( $unique{$word} ) ) { + $rdg = $unique{$word}; } else { my %args = ( 'id' => join( ',', $idx, $j+1 ), - 'json' => $t, + 'text' => $word, 'collation' => $c ); - $args{'is_lacuna'} = 1 if $t->{'t'} eq '#LACUNA#'; + $args{'is_lacuna'} = 1 if $word eq '#LACUNA#'; $rdg = Text::Tradition::Collation::Reading->new( %args ); - $unique{$t->{'t'}} = $rdg; + $unique{$word} = $rdg; } push( @readings, $rdg ); } else { @@ -187,6 +218,17 @@ sub make_nodes { return @readings; } +# Utility function for parsing JSON from nCritic +sub _restore_punct { + my( $token ) = @_; + my $word = $token->{'t'}; + return $word unless exists $token->{'punctuation'}; + foreach my $p ( sort { $a->{pos} <=> $b->{pos} } @{$token->{'punctuation'}} ) { + substr( $word, $p->{pos}, 0, $p->{char} ); + } + return $word; +} + 1; =head1 LICENSE diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index f81ec36..4b53e3c 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -73,6 +73,41 @@ if( $t ) { is( scalar $t->witnesses, 13, "Collation has all witnesses" ); } +# Check that we have the right witnesses +my %seen_wits; +map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; +foreach my $wit ( $t->witnesses ) { + $seen_wits{$wit->sigil} = 1; +} +is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); +foreach my $k ( keys %seen_wits ) { + ok( $seen_wits{$k}, "Witness $k still exists" ); +} + +# Check that the witnesses have the right texts +foreach my $wit ( $t->witnesses ) { + my $origtext = join( ' ', @{$wit->text} ); + my $graphtext = $t->collation->path_text( $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); +} + +# Check that the a.c. witnesses have the right text +map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; +foreach my $k ( keys %seen_wits ) { + my $wit = $t->witness( $k ); + if( $seen_wits{$k} ) { + ok( $wit->is_layered, "Witness $k got marked as layered" ); + ok( $wit->has_layertext, "Witness $k has an a.c. version" ); + my $origtext = join( ' ', @{$wit->layertext} ); + my $acsig = $wit->sigil . $t->collation->ac_label; + my $graphtext = $t->collation->path_text( $acsig, $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); + } else { + ok( !$wit->is_layered, "Witness $k not marked as layered" ); + ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); + } +} + =end testing =cut @@ -111,17 +146,29 @@ sub parse { # Set up the witnesses we find in the first line my @witnesses; - my %ac_wits; # Track these for later removal + my %ac_wits; # Track layered witness -> main witness mapping foreach my $sigil ( @{$alignment_table->[0]} ) { my $wit = $tradition->add_witness( 'sigil' => $sigil ); $wit->path( [ $c->start ] ); push( @witnesses, $wit ); my $aclabel = $c->ac_label; if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { - $ac_wits{$1} = $wit; + $ac_wits{$sigil} = $1; } } + # Save the original witness text sequences. Have to loop back through + # the witness columns after we have identified all the a.c. witnesses. + foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) { + my @sequence = map { $_->[$idx] } @{$alignment_table}; + my $sigil = shift @sequence; + my $is_layer = exists( $ac_wits{$sigil} ); + my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil ); + # Now get rid of gaps and meta-readings like #LACUNA# + my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence; + $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); + } + # Now for the next rows, make nodes as necessary, assign their ranks, and # add them to the witness paths. foreach my $idx ( 1 .. $#{$alignment_table} ) { @@ -159,9 +206,9 @@ sub parse { # Fold any a.c. witnesses into their main witness objects, and # delete the independent a.c. versions. foreach my $a ( keys %ac_wits ) { - my $main_wit = $tradition->witness( $a ); + my $ac_wit = $tradition->witness( $a ); + my $main_wit = $tradition->witness( $ac_wits{$a} ); next unless $main_wit; - my $ac_wit = $ac_wits{$a}; $main_wit->uncorrected_path( $ac_wit->path ); $tradition->del_witness( $ac_wit ); } diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index 551d3ea..6ae078d 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -104,6 +104,12 @@ has 'text' => ( isa => 'ArrayRef[Str]', predicate => 'has_text', ); + +has 'layertext' => ( + is => 'rw', + isa => 'ArrayRef[Str]', + predicate => 'has_layertext', + ); # Source. This is where we read in the witness, if not from a # pre-prepared collation. It is probably a filename. diff --git a/t/text_tradition_collation_relationshipstore.t b/t/text_tradition_collation_relationshipstore.t new file mode 100644 index 0000000..f09e245 --- /dev/null +++ b/t/text_tradition_collation_relationshipstore.t @@ -0,0 +1,19 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; + +use_ok( 'Text::Tradition::Collation::RelationshipStore' ); +} + + + + +1; diff --git a/t/text_tradition_parser_json.t b/t/text_tradition_parser_json.t index 658c585..0de0488 100644 --- a/t/text_tradition_parser_json.t +++ b/t/text_tradition_parser_json.t @@ -32,6 +32,24 @@ if( $t ) { is( scalar $t->collation->paths, 32, "Collation has all paths" ); is( scalar $t->witnesses, 3, "Collation has all witnesses" ); } + +my %seen_wits; +map { $seen_wits{$_} = 0 } qw/ A B C /; +# Check that we have the right witnesses +foreach my $wit ( $t->witnesses ) { + $seen_wits{$wit->sigil} = 1; +} +is( scalar keys %seen_wits, 3, "No extra witnesses were made" ); +foreach my $k ( keys %seen_wits ) { + ok( $seen_wits{$k}, "Witness $k still exists" ); +} + +# Check that the witnesses have the right texts +foreach my $wit ( $t->witnesses ) { + my $origtext = join( ' ', @{$wit->text} ); + my $graphtext = $t->collation->path_text( $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); +} } diff --git a/t/text_tradition_parser_tabular.t b/t/text_tradition_parser_tabular.t index 5426a76..c67e4f1 100644 --- a/t/text_tradition_parser_tabular.t +++ b/t/text_tradition_parser_tabular.t @@ -29,6 +29,41 @@ if( $t ) { is( scalar $t->collation->paths, 361, "Collation has all paths" ); is( scalar $t->witnesses, 13, "Collation has all witnesses" ); } + +# Check that we have the right witnesses +my %seen_wits; +map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; +foreach my $wit ( $t->witnesses ) { + $seen_wits{$wit->sigil} = 1; +} +is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); +foreach my $k ( keys %seen_wits ) { + ok( $seen_wits{$k}, "Witness $k still exists" ); +} + +# Check that the witnesses have the right texts +foreach my $wit ( $t->witnesses ) { + my $origtext = join( ' ', @{$wit->text} ); + my $graphtext = $t->collation->path_text( $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); +} + +# Check that the a.c. witnesses have the right text +map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; +foreach my $k ( keys %seen_wits ) { + my $wit = $t->witness( $k ); + if( $seen_wits{$k} ) { + ok( $wit->is_layered, "Witness $k got marked as layered" ); + ok( $wit->has_layertext, "Witness $k has an a.c. version" ); + my $origtext = join( ' ', @{$wit->layertext} ); + my $acsig = $wit->sigil . $t->collation->ac_label; + my $graphtext = $t->collation->path_text( $acsig, $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); + } else { + ok( !$wit->is_layered, "Witness $k not marked as layered" ); + ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); + } +} }