8 use Lingua::Features::Structure;
9 use Text::Tradition::Directory;
10 use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /;
13 binmode STDOUT, ':utf8';
14 binmode STDERR, ':utf8';
15 eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
17 my( $dbuser, $dbpass );
18 my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db';
22 'u|user=s' => \$dbuser,
23 'p|pass=s' => \$dbpass,
26 my $dbopts = { dsn => $dsn };
27 $dbopts->{extra_args}->{user} = $dbuser if $dbuser;
28 $dbopts->{extra_args}->{password} = $dbpass if $dbpass;
30 my $dir = Text::Tradition::Directory->new( $dbopts );
32 my $scope = $dir->new_scope();
33 my $lookfor = $ARGV[0] || '';
34 foreach my $tinfo ( $dir->traditionlist() ) {
35 next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
36 my $tradition = $dir->lookup( $tinfo->{'id'} );
37 say STDERR "Found " . $tradition->name;
38 my $c = $tradition->collation;
39 $c->_set_tradition( $tradition );
41 # Propagate lexeme forms across transposition links
42 foreach my $rel ( $c->relationships ) {
43 next unless $c->get_relationship( $rel )->type eq 'transposition';
44 my $rdg_a = $c->reading( $rel->[0] );
45 my $rdg_b = $c->reading( $rel->[1] );
46 if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) {
47 propagate_lexemes( $rdg_a, $rdg_b );
48 } elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) {
49 propagate_lexemes( $rdg_b, $rdg_a );
50 } elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) {
51 say STDERR "Transposition link with nothing disambiguated: @$rel";
57 foreach my $rank ( 1 .. $c->end->rank - 1 ) {
58 my @rankrdgs = $c->readings_at_rank( $rank );
59 # Propagate lexemes and normal forms across spelling / orthographic links
61 foreach my $r ( @rankrdgs ) {
62 next if $propagated{$r->id};
63 my @samewords = $c->related_readings( $r,
64 sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
65 push( @samewords, $r );
66 map { $propagated{$_->id} = 1 } @samewords;
67 next if @samewords == 1;
69 my( @haslex, @needslex );
70 foreach my $w ( @samewords ) {
71 if( $w->disambiguated ) {
74 push( @needslex, $w );
77 # Check that the lexeme forms match for the readings in @haslex
79 say STDERR "Multiple same word readings with no disambiguation at rank $rank";
84 foreach my $w ( @haslex ) {
85 my $wf = join( '//', map { $_->form->to_string } $w->lexemes );
86 $form = $wf unless $form;
87 unless( $wf eq $form ) {
88 warn "Conflicting lexeme on $w at rank $rank";
92 if( $consistent && @haslex ) {
93 my $ref = shift @haslex;
94 foreach my $w ( @needslex ) {
95 propagate_lexemes( $ref, $w );
101 my $r = shift @rankrdgs;
103 next if $r->is_nonsense;
104 next unless $r->has_lexemes;
105 next if grep { !$_->is_disambiguated } $r->lexemes;
106 my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes );
107 my @rpos = map { $_->form->morphstr } $r->lexemes;
108 foreach my $rdg ( @rankrdgs ) {
110 next if $rdg->is_nonsense;
111 next unless $rdg->has_lexemes;
112 next if grep { !$_->is_disambiguated } $rdg->lexemes;
113 next if is_sameword( $c, $r, $rdg );
114 # Do the grammatical link if applicable
116 if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem
117 && $rlem !~ /\<unknown\>/ ) {
118 say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel",
119 $r, $r->text, $rdg, $rdg->text );
120 $c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } );
124 # Do a punctuation link (instead of a lexical link) if applicable
126 if( $rdg->text =~ /^[[:punct:]]$/ && $r->text =~ /^[[:punct:]]$/ ) {
127 say sprintf( "Linking %s (%s) and %s (%s) with punctuation rel",
128 $r, $r->text, $rdg, $rdg->text );
129 $c->add_relationship( $r, $rdg, { 'type' => 'punctuation' } );
133 # Do the lexical link if applicable
134 my @rdgpos = map { $_->form->morphstr } $rdg->lexemes;
135 next unless @rpos == @rdgpos;
137 foreach my $i ( 0 .. $#rpos ) {
138 my $rst = Lingua::Features::Structure->from_string( $rpos[$i] );
139 my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] );
140 unless( $rst && $rdgst ) {
141 warn "Did not get morph structure from " .
142 $rst ? $rdgpos[$i] : $rpos[$i];
145 unless( $rst->is_compatible( $rdgst ) ) {
149 if( $lex && !$punct ) {
151 warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)",
152 $r, $r->text, $rdg, $rdg->text );
154 say sprintf( "Linking %s (%s) and %s (%s) with lexical rel",
155 $r, $r->text, $rdg, $rdg->text );
156 $c->add_relationship( $r, $rdg, { 'type' => 'lexical' } );
164 # print $c->as_svg( { nocalc => 1 } );
165 $dir->save( $tradition );
169 my( $c, $rdg1, $rdg2 ) = @_;
170 my @samewords = $c->related_readings( $rdg1,
171 sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
172 my @in_set = grep { $_ eq $rdg2 } @samewords;
173 return scalar @in_set;
176 sub propagate_lexemes {
177 my( $from, $to ) = @_;
178 say sprintf( "Copying lexical form from %s (%s) to %s (%s)",
179 $from, $from->text, $to, $to->text );
180 $to->normal_form( $from->normal_form );
181 $to->_deserialize_lexemes( $from->_serialize_lexemes );