stemmaweb bugfixes and style fixes; add 'punctuation' relationship type
[scpubgit/stemmatology.git] / script / poslink.pl
CommitLineData
9328f93d 1#!/usr/bin/env perl
2
3use lib 'lib';
4use feature 'say';
5use strict;
6use warnings;
7use Getopt::Long;
8use Lingua::Features::Structure;
9use Text::Tradition::Directory;
10use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /;
11use TryCatch;
12
13binmode STDOUT, ':utf8';
14binmode STDERR, ':utf8';
15eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
16
17my( $dbuser, $dbpass );
18my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db';
19
20GetOptions(
21 'dsn=s' => \$dsn,
22 'u|user=s' => \$dbuser,
23 'p|pass=s' => \$dbpass,
24 );
25
26my $dbopts = { dsn => $dsn };
27$dbopts->{extra_args}->{user} = $dbuser if $dbuser;
28$dbopts->{extra_args}->{password} = $dbpass if $dbpass;
29
30my $dir = Text::Tradition::Directory->new( $dbopts );
31
32my $scope = $dir->new_scope();
33my $lookfor = $ARGV[0] || '';
34foreach my $tinfo ( $dir->traditionlist() ) {
35 next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
36 my $tradition = $dir->lookup( $tinfo->{'id'} );
37 say STDERR "Found " . $tradition->name;
38 my $c = $tradition->collation;
39 $c->_set_tradition( $tradition );
40
41 # Propagate lexeme forms across transposition links
42 foreach my $rel ( $c->relationships ) {
43 next unless $c->get_relationship( $rel )->type eq 'transposition';
44 my $rdg_a = $c->reading( $rel->[0] );
45 my $rdg_b = $c->reading( $rel->[1] );
46 if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) {
47 propagate_lexemes( $rdg_a, $rdg_b );
48 } elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) {
49 propagate_lexemes( $rdg_b, $rdg_a );
50 } elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) {
51 say STDERR "Transposition link with nothing disambiguated: @$rel";
52 }
53 }
54
55
56 # Make the changes
57 foreach my $rank ( 1 .. $c->end->rank - 1 ) {
58 my @rankrdgs = $c->readings_at_rank( $rank );
59 # Propagate lexemes and normal forms across spelling / orthographic links
60 my %propagated;
61 foreach my $r ( @rankrdgs ) {
62 next if $propagated{$r->id};
63 my @samewords = $c->related_readings( $r,
64 sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
65 push( @samewords, $r );
66 map { $propagated{$_->id} = 1 } @samewords;
67 next if @samewords == 1;
68
69 my( @haslex, @needslex );
70 foreach my $w ( @samewords ) {
71 if( $w->disambiguated ) {
72 push( @haslex, $w );
73 } else {
74 push( @needslex, $w );
75 }
76 }
77 # Check that the lexeme forms match for the readings in @haslex
78 unless( @haslex ) {
79 say STDERR "Multiple same word readings with no disambiguation at rank $rank";
80 next;
81 }
82 my $form;
83 my $consistent = 1;
84 foreach my $w ( @haslex ) {
85 my $wf = join( '//', map { $_->form->to_string } $w->lexemes );
86 $form = $wf unless $form;
87 unless( $wf eq $form ) {
88 warn "Conflicting lexeme on $w at rank $rank";
89 $consistent = 0;
90 }
91 }
92 if( $consistent && @haslex ) {
93 my $ref = shift @haslex;
94 foreach my $w ( @needslex ) {
95 propagate_lexemes( $ref, $w );
96 }
97 }
98 }
99
100 while( @rankrdgs ) {
101 my $r = shift @rankrdgs;
102 next if $r->is_meta;
103 next if $r->is_nonsense;
104 next unless $r->has_lexemes;
105 next if grep { !$_->is_disambiguated } $r->lexemes;
106 my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes );
107 my @rpos = map { $_->form->morphstr } $r->lexemes;
108 foreach my $rdg ( @rankrdgs ) {
109 next if $r eq $rdg;
110 next if $rdg->is_nonsense;
111 next unless $rdg->has_lexemes;
112 next if grep { !$_->is_disambiguated } $rdg->lexemes;
113 next if is_sameword( $c, $r, $rdg );
7b7abf10 114 # Do the grammatical link if applicable
9328f93d 115 my $gram;
116 if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem
117 && $rlem !~ /\<unknown\>/ ) {
118 say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel",
119 $r, $r->text, $rdg, $rdg->text );
120 $c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } );
121 $gram = 1;
122 }
7b7abf10 123
124 # Do a punctuation link (instead of a lexical link) if applicable
125 my $punct;
126 if( $rdg->text =~ /^[[:punct:]]$/ && $r->text =~ /^[[:punct:]]$/ ) {
127 say sprintf( "Linking %s (%s) and %s (%s) with punctuation rel",
128 $r, $r->text, $rdg, $rdg->text );
129 $c->add_relationship( $r, $rdg, { 'type' => 'punctuation' } );
130 $punct = 1;
131 }
132
133 # Do the lexical link if applicable
9328f93d 134 my @rdgpos = map { $_->form->morphstr } $rdg->lexemes;
135 next unless @rpos == @rdgpos;
136 my $lex = 1;
137 foreach my $i ( 0 .. $#rpos ) {
138 my $rst = Lingua::Features::Structure->from_string( $rpos[$i] );
139 my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] );
140 unless( $rst && $rdgst ) {
141 warn "Did not get morph structure from " .
142 $rst ? $rdgpos[$i] : $rpos[$i];
143 next;
144 }
145 unless( $rst->is_compatible( $rdgst ) ) {
146 $lex = 0;
147 }
148 }
7b7abf10 149 if( $lex && !$punct ) {
9328f93d 150 if( $gram ) {
151 warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)",
152 $r, $r->text, $rdg, $rdg->text );
153 } else {
154 say sprintf( "Linking %s (%s) and %s (%s) with lexical rel",
155 $r, $r->text, $rdg, $rdg->text );
156 $c->add_relationship( $r, $rdg, { 'type' => 'lexical' } );
157 }
158 }
159 }
160 }
161 }
162
163 # Save the lot
164 # print $c->as_svg( { nocalc => 1 } );
165 $dir->save( $tradition );
166}
167
168sub is_sameword {
169 my( $c, $rdg1, $rdg2 ) = @_;
170 my @samewords = $c->related_readings( $rdg1,
171 sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
172 my @in_set = grep { $_ eq $rdg2 } @samewords;
173 return scalar @in_set;
174}
175
176sub propagate_lexemes {
177 my( $from, $to ) = @_;
178 say sprintf( "Copying lexical form from %s (%s) to %s (%s)",
179 $from, $from->text, $to, $to->text );
180 $to->normal_form( $from->normal_form );
181 $to->_deserialize_lexemes( $from->_serialize_lexemes );
182}