migrate lexemes and normal forms when readings are combined; script to merge readings...
Tara L Andrews [Wed, 11 Jul 2012 14:25:44 +0000 (16:25 +0200)]
lib/Text/Tradition/Collation.pm
script/join_readings.pl [new file with mode: 0644]

index d3a8567..d4395a7 100644 (file)
@@ -409,12 +409,19 @@ sub merge_readings {
        
        # Do the deletion deed.
        if( $combine ) {
+               # Combine the text of the readings
                my $joinstr = $combine_char;
                unless( defined $joinstr ) {
                        $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior;
                        $joinstr = $self->wordsep unless defined $joinstr;
                }
                $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) );
+               $kept_obj->normal_form( 
+                       join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) );
+               # Combine the lexemes present in the readings
+               if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) {
+                       $kept_obj->add_lexeme( $del_obj->lexemes );
+               }
        }
        $self->del_reading( $deleted );
 }
diff --git a/script/join_readings.pl b/script/join_readings.pl
new file mode 100644 (file)
index 0000000..4105a7c
--- /dev/null
@@ -0,0 +1,80 @@
+#!/usr/bin/env perl
+
+use lib 'lib';
+use feature 'say';
+use strict;
+use warnings;
+use Getopt::Long;
+use Lingua::Features::Structure;
+use Text::Tradition::Directory;
+use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /;
+use TryCatch;
+
+binmode STDOUT, ':utf8';
+binmode STDERR, ':utf8';
+eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
+
+my( $dbuser, $dbpass );
+my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db';
+my $testrun;
+
+GetOptions( 
+       'dsn=s'    => \$dsn,
+       'u|user=s' => \$dbuser,
+       'p|pass=s' => \$dbpass,
+       'n|test'   => \$testrun,
+       );
+
+my $dbopts = { dsn => $dsn };
+$dbopts->{extra_args}->{user} = $dbuser if $dbuser;
+$dbopts->{extra_args}->{password} = $dbpass if $dbpass;
+
+my $dir = Text::Tradition::Directory->new( $dbopts );
+
+my $scope = $dir->new_scope();
+my $lookfor = $ARGV[0] || '';
+foreach my $tinfo ( $dir->traditionlist() ) {
+       next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
+       my $tradition = $dir->lookup( $tinfo->{'id'} );
+       my $c = $tradition->collation;
+
+       # Anywhere in the graph that there is a reading that joins only to a single
+       # successor, and neither of these have any relationships, just join the two
+       # readings.
+       my %gobbled;
+       foreach my $rdg ( sort { $a->rank <=> $b->rank } $c->readings ) {
+               next if $rdg->is_meta;
+               next if $gobbled{$rdg->id};
+               next if $rdg->grammar_invalid || $rdg->is_nonsense;
+               next if $rdg->related_readings();
+               my %seen;
+               while( $c->sequence->successors( $rdg ) == 1 ) {
+                       my( $next ) = $c->reading( $c->sequence->successors( $rdg ) );
+                       die "Infinite loop" if $seen{$next->id};
+                       $seen{$next->id} = 1;
+                       last if $c->sequence->predecessors( $next ) > 1;
+                       last if $next->is_meta;
+                       last if $next->grammar_invalid || $next->is_nonsense;
+                       last if $next->related_readings();
+                       say "Joining readings $rdg and $next";
+                       $c->merge_readings( $rdg, $next, 1 );
+               }
+       }
+       # Make sure we haven't screwed anything up
+       foreach my $wit ( $tradition->witnesses ) {
+               my $pathtext = $c->path_text( $wit->sigil );
+               my $origtext = join( ' ', @{$wit->text} );
+               die "Text differs for witness " . $wit->sigil 
+                       unless $pathtext eq $origtext;
+               if( $wit->is_layered ) {
+                       $pathtext = $c->path_text( $wit->sigil.$c->ac_label );
+                       $origtext = join( ' ', @{$wit->layertext} );
+                       die "Ante-corr text differs for witness " . $wit->sigil
+                               unless $pathtext eq $origtext;
+               }
+       }
+
+       $c->relations->rebuild_equivalence();
+       $c->calculate_ranks();
+       $dir->save( $tradition );
+}
\ No newline at end of file