Commit | Line | Data |
9328f93d |
1 | #!/usr/bin/env perl |
2 | |
3 | use lib 'lib'; |
4 | use feature 'say'; |
5 | use strict; |
6 | use warnings; |
7 | use Getopt::Long; |
8 | use Lingua::Features::Structure; |
9 | use Text::Tradition::Directory; |
10 | use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /; |
11 | use TryCatch; |
12 | |
13 | binmode STDOUT, ':utf8'; |
14 | binmode STDERR, ':utf8'; |
15 | eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; |
16 | |
17 | my( $dbuser, $dbpass ); |
18 | my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; |
19 | |
20 | GetOptions( |
21 | 'dsn=s' => \$dsn, |
22 | 'u|user=s' => \$dbuser, |
23 | 'p|pass=s' => \$dbpass, |
24 | ); |
25 | |
26 | my $dbopts = { dsn => $dsn }; |
27 | $dbopts->{extra_args}->{user} = $dbuser if $dbuser; |
28 | $dbopts->{extra_args}->{password} = $dbpass if $dbpass; |
29 | |
30 | my $dir = Text::Tradition::Directory->new( $dbopts ); |
31 | |
32 | my $scope = $dir->new_scope(); |
33 | my $lookfor = $ARGV[0] || ''; |
34 | foreach my $tinfo ( $dir->traditionlist() ) { |
35 | next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; |
36 | my $tradition = $dir->lookup( $tinfo->{'id'} ); |
37 | say STDERR "Found " . $tradition->name; |
38 | my $c = $tradition->collation; |
39 | $c->_set_tradition( $tradition ); |
40 | |
41 | # Propagate lexeme forms across transposition links |
42 | foreach my $rel ( $c->relationships ) { |
43 | next unless $c->get_relationship( $rel )->type eq 'transposition'; |
44 | my $rdg_a = $c->reading( $rel->[0] ); |
45 | my $rdg_b = $c->reading( $rel->[1] ); |
46 | if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) { |
47 | propagate_lexemes( $rdg_a, $rdg_b ); |
48 | } elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) { |
49 | propagate_lexemes( $rdg_b, $rdg_a ); |
50 | } elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) { |
51 | say STDERR "Transposition link with nothing disambiguated: @$rel"; |
52 | } |
53 | } |
54 | |
55 | |
56 | # Make the changes |
57 | foreach my $rank ( 1 .. $c->end->rank - 1 ) { |
58 | my @rankrdgs = $c->readings_at_rank( $rank ); |
59 | # Propagate lexemes and normal forms across spelling / orthographic links |
60 | my %propagated; |
61 | foreach my $r ( @rankrdgs ) { |
62 | next if $propagated{$r->id}; |
63 | my @samewords = $c->related_readings( $r, |
64 | sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } ); |
65 | push( @samewords, $r ); |
66 | map { $propagated{$_->id} = 1 } @samewords; |
67 | next if @samewords == 1; |
68 | |
69 | my( @haslex, @needslex ); |
70 | foreach my $w ( @samewords ) { |
71 | if( $w->disambiguated ) { |
72 | push( @haslex, $w ); |
73 | } else { |
74 | push( @needslex, $w ); |
75 | } |
76 | } |
77 | # Check that the lexeme forms match for the readings in @haslex |
78 | unless( @haslex ) { |
79 | say STDERR "Multiple same word readings with no disambiguation at rank $rank"; |
80 | next; |
81 | } |
82 | my $form; |
83 | my $consistent = 1; |
84 | foreach my $w ( @haslex ) { |
85 | my $wf = join( '//', map { $_->form->to_string } $w->lexemes ); |
86 | $form = $wf unless $form; |
87 | unless( $wf eq $form ) { |
88 | warn "Conflicting lexeme on $w at rank $rank"; |
89 | $consistent = 0; |
90 | } |
91 | } |
92 | if( $consistent && @haslex ) { |
93 | my $ref = shift @haslex; |
94 | foreach my $w ( @needslex ) { |
95 | propagate_lexemes( $ref, $w ); |
96 | } |
97 | } |
98 | } |
99 | |
100 | while( @rankrdgs ) { |
101 | my $r = shift @rankrdgs; |
102 | next if $r->is_meta; |
103 | next if $r->is_nonsense; |
104 | next unless $r->has_lexemes; |
105 | next if grep { !$_->is_disambiguated } $r->lexemes; |
106 | my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes ); |
107 | my @rpos = map { $_->form->morphstr } $r->lexemes; |
108 | foreach my $rdg ( @rankrdgs ) { |
109 | next if $r eq $rdg; |
110 | next if $rdg->is_nonsense; |
111 | next unless $rdg->has_lexemes; |
112 | next if grep { !$_->is_disambiguated } $rdg->lexemes; |
113 | next if is_sameword( $c, $r, $rdg ); |
7b7abf10 |
114 | # Do the grammatical link if applicable |
9328f93d |
115 | my $gram; |
116 | if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem |
117 | && $rlem !~ /\<unknown\>/ ) { |
118 | say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel", |
119 | $r, $r->text, $rdg, $rdg->text ); |
120 | $c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } ); |
121 | $gram = 1; |
122 | } |
7b7abf10 |
123 | |
124 | # Do a punctuation link (instead of a lexical link) if applicable |
125 | my $punct; |
126 | if( $rdg->text =~ /^[[:punct:]]$/ && $r->text =~ /^[[:punct:]]$/ ) { |
127 | say sprintf( "Linking %s (%s) and %s (%s) with punctuation rel", |
128 | $r, $r->text, $rdg, $rdg->text ); |
129 | $c->add_relationship( $r, $rdg, { 'type' => 'punctuation' } ); |
130 | $punct = 1; |
131 | } |
132 | |
133 | # Do the lexical link if applicable |
9328f93d |
134 | my @rdgpos = map { $_->form->morphstr } $rdg->lexemes; |
135 | next unless @rpos == @rdgpos; |
136 | my $lex = 1; |
137 | foreach my $i ( 0 .. $#rpos ) { |
138 | my $rst = Lingua::Features::Structure->from_string( $rpos[$i] ); |
139 | my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] ); |
140 | unless( $rst && $rdgst ) { |
141 | warn "Did not get morph structure from " . |
142 | $rst ? $rdgpos[$i] : $rpos[$i]; |
143 | next; |
144 | } |
145 | unless( $rst->is_compatible( $rdgst ) ) { |
146 | $lex = 0; |
147 | } |
148 | } |
7b7abf10 |
149 | if( $lex && !$punct ) { |
9328f93d |
150 | if( $gram ) { |
151 | warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)", |
152 | $r, $r->text, $rdg, $rdg->text ); |
153 | } else { |
154 | say sprintf( "Linking %s (%s) and %s (%s) with lexical rel", |
155 | $r, $r->text, $rdg, $rdg->text ); |
156 | $c->add_relationship( $r, $rdg, { 'type' => 'lexical' } ); |
157 | } |
158 | } |
159 | } |
160 | } |
161 | } |
162 | |
163 | # Save the lot |
164 | # print $c->as_svg( { nocalc => 1 } ); |
165 | $dir->save( $tradition ); |
166 | } |
167 | |
168 | sub is_sameword { |
169 | my( $c, $rdg1, $rdg2 ) = @_; |
170 | my @samewords = $c->related_readings( $rdg1, |
171 | sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } ); |
172 | my @in_set = grep { $_ eq $rdg2 } @samewords; |
173 | return scalar @in_set; |
174 | } |
175 | |
176 | sub propagate_lexemes { |
177 | my( $from, $to ) = @_; |
178 | say sprintf( "Copying lexical form from %s (%s) to %s (%s)", |
179 | $from, $from->text, $to, $to->text ); |
180 | $to->normal_form( $from->normal_form ); |
181 | $to->_deserialize_lexemes( $from->_serialize_lexemes ); |
182 | } |