script to link readings based on morphological tags
Tara L Andrews [Tue, 26 Jun 2012 03:41:59 +0000 (05:41 +0200)]
script/poslink.pl [new file with mode: 0755]

diff --git a/script/poslink.pl b/script/poslink.pl
new file mode 100755 (executable)
index 0000000..0f4be38
--- /dev/null
@@ -0,0 +1,170 @@
+#!/usr/bin/env perl
+
+use lib 'lib';
+use feature 'say';
+use strict;
+use warnings;
+use Getopt::Long;
+use Lingua::Features::Structure;
+use Text::Tradition::Directory;
+use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /;
+use TryCatch;
+
+binmode STDOUT, ':utf8';
+binmode STDERR, ':utf8';
+eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
+
+my( $dbuser, $dbpass );
+my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db';
+
+GetOptions( 
+       'dsn=s'    => \$dsn,
+       'u|user=s' => \$dbuser,
+       'p|pass=s' => \$dbpass,
+       );
+
+my $dbopts = { dsn => $dsn };
+$dbopts->{extra_args}->{user} = $dbuser if $dbuser;
+$dbopts->{extra_args}->{password} = $dbpass if $dbpass;
+
+my $dir = Text::Tradition::Directory->new( $dbopts );
+
+my $scope = $dir->new_scope();
+my $lookfor = $ARGV[0] || '';
+foreach my $tinfo ( $dir->traditionlist() ) {
+       next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
+       my $tradition = $dir->lookup( $tinfo->{'id'} );
+       say STDERR "Found " . $tradition->name;
+       my $c = $tradition->collation;
+       $c->_set_tradition( $tradition );
+       
+       # Propagate lexeme forms across transposition links
+       foreach my $rel ( $c->relationships ) {
+               next unless $c->get_relationship( $rel )->type eq 'transposition';
+               my $rdg_a = $c->reading( $rel->[0] );
+               my $rdg_b = $c->reading( $rel->[1] );
+               if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) {
+                       propagate_lexemes( $rdg_a, $rdg_b );
+               } elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) {
+                       propagate_lexemes( $rdg_b, $rdg_a );
+               } elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) {
+                       say STDERR "Transposition link with nothing disambiguated: @$rel";
+               }
+       }
+               
+       
+       # Make the changes
+       foreach my $rank ( 1 .. $c->end->rank - 1 ) {
+               my @rankrdgs = $c->readings_at_rank( $rank );
+               # Propagate lexemes and normal forms across spelling / orthographic links
+               my %propagated;
+               foreach my $r ( @rankrdgs ) {
+                       next if $propagated{$r->id};
+                       my @samewords = $c->related_readings( $r, 
+                               sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
+                       push( @samewords, $r );
+                       map { $propagated{$_->id} = 1 } @samewords;
+                       next if @samewords == 1;
+                       
+                       my( @haslex, @needslex );
+                       foreach my $w ( @samewords ) {
+                               if( $w->disambiguated ) {
+                                       push( @haslex, $w );
+                               } else {
+                                       push( @needslex, $w );
+                               }
+                       }
+                       # Check that the lexeme forms match for the readings in @haslex
+                       unless( @haslex ) {
+                               say STDERR "Multiple same word readings with no disambiguation at rank $rank";
+                               next;
+                       }
+                       my $form;
+                       my $consistent = 1;
+                       foreach my $w ( @haslex ) {
+                               my $wf = join( '//', map { $_->form->to_string } $w->lexemes );
+                               $form = $wf unless $form;
+                               unless( $wf eq $form ) {
+                                       warn "Conflicting lexeme on $w at rank $rank";
+                                       $consistent = 0;
+                               }
+                       }
+                       if( $consistent && @haslex ) {
+                               my $ref = shift @haslex;
+                               foreach my $w ( @needslex ) {
+                                       propagate_lexemes( $ref, $w );
+                               }
+                       }
+               }
+                       
+               while( @rankrdgs ) {
+                       my $r = shift @rankrdgs;
+                       next if $r->is_meta;
+                       next if $r->is_nonsense;
+                       next unless $r->has_lexemes;
+                       next if grep { !$_->is_disambiguated } $r->lexemes;
+                       my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes );
+                       my @rpos = map { $_->form->morphstr } $r->lexemes;
+                       foreach my $rdg ( @rankrdgs ) {
+                               next if $r eq $rdg;
+                               next if $rdg->is_nonsense;
+                               next unless $rdg->has_lexemes;
+                               next if grep { !$_->is_disambiguated } $rdg->lexemes;
+                               next if is_sameword( $c, $r, $rdg );
+                               my $gram;
+                               if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem
+                                       && $rlem !~ /\<unknown\>/ ) {
+                                       say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel",
+                                               $r, $r->text, $rdg, $rdg->text );
+                                       $c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } );
+                                       $gram = 1;
+                               }
+                               my @rdgpos = map { $_->form->morphstr } $rdg->lexemes;
+                               next unless @rpos == @rdgpos;
+                               my $lex = 1;
+                               foreach my $i ( 0 .. $#rpos ) {
+                                       my $rst = Lingua::Features::Structure->from_string( $rpos[$i] );
+                                       my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] );
+                                       unless( $rst && $rdgst ) {
+                                               warn "Did not get morph structure from " . 
+                                                       $rst ? $rdgpos[$i] : $rpos[$i];
+                                               next;
+                                       }
+                                       unless( $rst->is_compatible( $rdgst ) ) {
+                                               $lex = 0;
+                                       }
+                               }
+                               if( $lex ) {
+                                       if( $gram ) {
+                                               warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)",
+                                                       $r, $r->text, $rdg, $rdg->text );
+                                       } else {
+                                               say sprintf( "Linking %s (%s) and %s (%s) with lexical rel",
+                                                       $r, $r->text, $rdg, $rdg->text );
+                                               $c->add_relationship( $r, $rdg, { 'type' => 'lexical' } );
+                                       }
+                               }
+                       }
+               }
+       }
+
+       # Save the lot
+       # print $c->as_svg( { nocalc => 1 } );
+       $dir->save( $tradition );
+}
+
+sub is_sameword {
+       my( $c, $rdg1, $rdg2 ) = @_;
+       my @samewords = $c->related_readings( $rdg1, 
+               sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
+       my @in_set = grep { $_ eq $rdg2 } @samewords;
+       return scalar @in_set;
+}
+
+sub propagate_lexemes {
+       my( $from, $to ) = @_;
+       say sprintf( "Copying lexical form from %s (%s) to %s (%s)",
+               $from, $from->text, $to, $to->text );
+       $to->normal_form( $from->normal_form );
+       $to->_deserialize_lexemes( $from->_serialize_lexemes );
+}
\ No newline at end of file