From: Tara L Andrews Date: Wed, 11 Jul 2012 14:25:44 +0000 (+0200) Subject: migrate lexemes and normal forms when readings are combined; script to merge readings... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=869a1ada82eb48bc46f2298823fa1ef6f417c671;p=scpubgit%2Fstemmatology.git migrate lexemes and normal forms when readings are combined; script to merge readings where possible --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index d3a8567..d4395a7 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -409,12 +409,19 @@ sub merge_readings { # Do the deletion deed. if( $combine ) { + # Combine the text of the readings my $joinstr = $combine_char; unless( defined $joinstr ) { $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior; $joinstr = $self->wordsep unless defined $joinstr; } $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) ); + $kept_obj->normal_form( + join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) ); + # Combine the lexemes present in the readings + if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) { + $kept_obj->add_lexeme( $del_obj->lexemes ); + } } $self->del_reading( $deleted ); } diff --git a/script/join_readings.pl b/script/join_readings.pl new file mode 100644 index 0000000..4105a7c --- /dev/null +++ b/script/join_readings.pl @@ -0,0 +1,80 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use feature 'say'; +use strict; +use warnings; +use Getopt::Long; +use Lingua::Features::Structure; +use Text::Tradition::Directory; +use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my( $dbuser, $dbpass ); +my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; +my $testrun; + +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + 'n|test' => \$testrun, + ); + +my $dbopts = { dsn => $dsn }; +$dbopts->{extra_args}->{user} = $dbuser if $dbuser; +$dbopts->{extra_args}->{password} = $dbpass if $dbpass; + +my $dir = Text::Tradition::Directory->new( $dbopts ); + +my $scope = $dir->new_scope(); +my $lookfor = $ARGV[0] || ''; +foreach my $tinfo ( $dir->traditionlist() ) { + next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; + my $tradition = $dir->lookup( $tinfo->{'id'} ); + my $c = $tradition->collation; + + # Anywhere in the graph that there is a reading that joins only to a single + # successor, and neither of these have any relationships, just join the two + # readings. + my %gobbled; + foreach my $rdg ( sort { $a->rank <=> $b->rank } $c->readings ) { + next if $rdg->is_meta; + next if $gobbled{$rdg->id}; + next if $rdg->grammar_invalid || $rdg->is_nonsense; + next if $rdg->related_readings(); + my %seen; + while( $c->sequence->successors( $rdg ) == 1 ) { + my( $next ) = $c->reading( $c->sequence->successors( $rdg ) ); + die "Infinite loop" if $seen{$next->id}; + $seen{$next->id} = 1; + last if $c->sequence->predecessors( $next ) > 1; + last if $next->is_meta; + last if $next->grammar_invalid || $next->is_nonsense; + last if $next->related_readings(); + say "Joining readings $rdg and $next"; + $c->merge_readings( $rdg, $next, 1 ); + } + } + # Make sure we haven't screwed anything up + foreach my $wit ( $tradition->witnesses ) { + my $pathtext = $c->path_text( $wit->sigil ); + my $origtext = join( ' ', @{$wit->text} ); + die "Text differs for witness " . $wit->sigil + unless $pathtext eq $origtext; + if( $wit->is_layered ) { + $pathtext = $c->path_text( $wit->sigil.$c->ac_label ); + $origtext = join( ' ', @{$wit->layertext} ); + die "Ante-corr text differs for witness " . $wit->sigil + unless $pathtext eq $origtext; + } + } + + $c->relations->rebuild_equivalence(); + $c->calculate_ranks(); + $dir->save( $tradition ); +} \ No newline at end of file