From: Tara L Andrews Date: Tue, 26 Jun 2012 03:41:59 +0000 (+0200) Subject: script to link readings based on morphological tags X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=9328f93d4563e7529095249bdd7c1e1531e4042e;p=scpubgit%2Fstemmatology.git script to link readings based on morphological tags --- diff --git a/script/poslink.pl b/script/poslink.pl new file mode 100755 index 0000000..0f4be38 --- /dev/null +++ b/script/poslink.pl @@ -0,0 +1,170 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use feature 'say'; +use strict; +use warnings; +use Getopt::Long; +use Lingua::Features::Structure; +use Text::Tradition::Directory; +use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my( $dbuser, $dbpass ); +my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; + +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + ); + +my $dbopts = { dsn => $dsn }; +$dbopts->{extra_args}->{user} = $dbuser if $dbuser; +$dbopts->{extra_args}->{password} = $dbpass if $dbpass; + +my $dir = Text::Tradition::Directory->new( $dbopts ); + +my $scope = $dir->new_scope(); +my $lookfor = $ARGV[0] || ''; +foreach my $tinfo ( $dir->traditionlist() ) { + next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; + my $tradition = $dir->lookup( $tinfo->{'id'} ); + say STDERR "Found " . $tradition->name; + my $c = $tradition->collation; + $c->_set_tradition( $tradition ); + + # Propagate lexeme forms across transposition links + foreach my $rel ( $c->relationships ) { + next unless $c->get_relationship( $rel )->type eq 'transposition'; + my $rdg_a = $c->reading( $rel->[0] ); + my $rdg_b = $c->reading( $rel->[1] ); + if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) { + propagate_lexemes( $rdg_a, $rdg_b ); + } elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) { + propagate_lexemes( $rdg_b, $rdg_a ); + } elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) { + say STDERR "Transposition link with nothing disambiguated: @$rel"; + } + } + + + # Make the changes + foreach my $rank ( 1 .. $c->end->rank - 1 ) { + my @rankrdgs = $c->readings_at_rank( $rank ); + # Propagate lexemes and normal forms across spelling / orthographic links + my %propagated; + foreach my $r ( @rankrdgs ) { + next if $propagated{$r->id}; + my @samewords = $c->related_readings( $r, + sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } ); + push( @samewords, $r ); + map { $propagated{$_->id} = 1 } @samewords; + next if @samewords == 1; + + my( @haslex, @needslex ); + foreach my $w ( @samewords ) { + if( $w->disambiguated ) { + push( @haslex, $w ); + } else { + push( @needslex, $w ); + } + } + # Check that the lexeme forms match for the readings in @haslex + unless( @haslex ) { + say STDERR "Multiple same word readings with no disambiguation at rank $rank"; + next; + } + my $form; + my $consistent = 1; + foreach my $w ( @haslex ) { + my $wf = join( '//', map { $_->form->to_string } $w->lexemes ); + $form = $wf unless $form; + unless( $wf eq $form ) { + warn "Conflicting lexeme on $w at rank $rank"; + $consistent = 0; + } + } + if( $consistent && @haslex ) { + my $ref = shift @haslex; + foreach my $w ( @needslex ) { + propagate_lexemes( $ref, $w ); + } + } + } + + while( @rankrdgs ) { + my $r = shift @rankrdgs; + next if $r->is_meta; + next if $r->is_nonsense; + next unless $r->has_lexemes; + next if grep { !$_->is_disambiguated } $r->lexemes; + my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes ); + my @rpos = map { $_->form->morphstr } $r->lexemes; + foreach my $rdg ( @rankrdgs ) { + next if $r eq $rdg; + next if $rdg->is_nonsense; + next unless $rdg->has_lexemes; + next if grep { !$_->is_disambiguated } $rdg->lexemes; + next if is_sameword( $c, $r, $rdg ); + my $gram; + if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem + && $rlem !~ /\/ ) { + say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel", + $r, $r->text, $rdg, $rdg->text ); + $c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } ); + $gram = 1; + } + my @rdgpos = map { $_->form->morphstr } $rdg->lexemes; + next unless @rpos == @rdgpos; + my $lex = 1; + foreach my $i ( 0 .. $#rpos ) { + my $rst = Lingua::Features::Structure->from_string( $rpos[$i] ); + my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] ); + unless( $rst && $rdgst ) { + warn "Did not get morph structure from " . + $rst ? $rdgpos[$i] : $rpos[$i]; + next; + } + unless( $rst->is_compatible( $rdgst ) ) { + $lex = 0; + } + } + if( $lex ) { + if( $gram ) { + warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)", + $r, $r->text, $rdg, $rdg->text ); + } else { + say sprintf( "Linking %s (%s) and %s (%s) with lexical rel", + $r, $r->text, $rdg, $rdg->text ); + $c->add_relationship( $r, $rdg, { 'type' => 'lexical' } ); + } + } + } + } + } + + # Save the lot + # print $c->as_svg( { nocalc => 1 } ); + $dir->save( $tradition ); +} + +sub is_sameword { + my( $c, $rdg1, $rdg2 ) = @_; + my @samewords = $c->related_readings( $rdg1, + sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } ); + my @in_set = grep { $_ eq $rdg2 } @samewords; + return scalar @in_set; +} + +sub propagate_lexemes { + my( $from, $to ) = @_; + say sprintf( "Copying lexical form from %s (%s) to %s (%s)", + $from, $from->text, $to, $to->text ); + $to->normal_form( $from->normal_form ); + $to->_deserialize_lexemes( $from->_serialize_lexemes ); +} \ No newline at end of file