[scpubgit/stemmatology.git] / script / poslink.pl

#!/usr/bin/env perl

use lib 'lib';
use feature 'say';
use strict;
use warnings;
use Getopt::Long;
use Lingua::Features::Structure;
use Text::Tradition::Directory;
use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /;
use TryCatch;

# Emit UTF-8 on both standard streams.
foreach my $stream ( \*STDOUT, \*STDERR ) {
	binmode $stream, ':utf8';
}
# Best-effort debugger tweaks: set the debugger output handle to UTF-8 and
# set $DB::deep to 1000.  Any failure is swallowed by the eval.
eval {
	no warnings;
	binmode $DB::OUT, ':utf8';
	$DB::deep = 1000;
};

# Connection settings; the DSN has a default, the credentials are optional.
my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db';
my $dbuser;
my $dbpass;

GetOptions(
	'dsn=s'    => \$dsn,
	'u|user=s' => \$dbuser,
	'p|pass=s' => \$dbpass,
);

# Only include extra_args when at least one credential was supplied.
my %extra_args;
$extra_args{user}     = $dbuser if $dbuser;
$extra_args{password} = $dbpass if $dbpass;

my $dbopts = { dsn => $dsn };
$dbopts->{extra_args} = \%extra_args if %extra_args;

my $dir = Text::Tradition::Directory->new( $dbopts );

# The scope object is held in a file-level lexical for the rest of the run.
# NOTE(review): presumably it must stay alive while traditions are loaded
# and saved -- confirm against Text::Tradition::Directory.
my $scope = $dir->new_scope();
# Main loop.  Every tradition whose name matches the pattern given as the
# first command-line argument, or whose id equals it, is processed; with no
# argument every tradition is processed.  For each selected tradition:
#   1. lexeme data is copied across transposition links;
#   2. rank by rank, lexeme data is copied across spelling / orthographic
#      links, and then grammatical, punctuation and lexical relationships
#      are added between the disambiguated readings at that rank;
#   3. the tradition is saved back to the directory.
# Progress goes to STDOUT, diagnostics to STDERR.
my $lookfor = $ARGV[0] || '';
foreach my $tinfo ( $dir->traditionlist() ) {
	# Test for the empty filter explicitly: Perl special-cases an empty match
	# pattern (it may reuse the last successfully matched regex), so do not
	# rely on /$lookfor/ meaning "match everything".
	next unless $lookfor eq ''
		|| $tinfo->{'name'} =~ /$lookfor/
		|| $tinfo->{'id'} eq $lookfor;
	my $tradition = $dir->lookup( $tinfo->{'id'} );
	say STDERR "Found " . $tradition->name;
	my $c = $tradition->collation;
	# NOTE(review): presumably restores the collation's back-reference to its
	# tradition after loading -- confirm against Text::Tradition::Collation.
	$c->_set_tradition( $tradition );
	
	# Propagate lexeme forms across transposition links
	foreach my $rel ( $c->relationships ) {
		next unless $c->get_relationship( $rel )->type eq 'transposition';
		my $rdg_a = $c->reading( $rel->[0] );
		my $rdg_b = $c->reading( $rel->[1] );
		if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) {
			propagate_lexemes( $rdg_a, $rdg_b );
		} elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) {
			propagate_lexemes( $rdg_b, $rdg_a );
		} elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) {
			say STDERR "Transposition link with nothing disambiguated: @$rel";
		}
		# If both ends are already disambiguated there is nothing to copy.
	}
	
	# Make the changes
	foreach my $rank ( 1 .. $c->end->rank - 1 ) {
		my @rankrdgs = $c->readings_at_rank( $rank );
		# Propagate lexemes and normal forms across spelling / orthographic links
		my %propagated;
		foreach my $r ( @rankrdgs ) {
			next if $propagated{$r->id};
			my @samewords = $c->related_readings( $r, 
				sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } );
			push( @samewords, $r );
			# Mark the whole group so it is handled only once.
			$propagated{$_->id} = 1 foreach @samewords;
			next if @samewords == 1;
			
			# Split the group into readings that can donate lexemes and
			# readings that still need them.
			my( @haslex, @needslex );
			foreach my $w ( @samewords ) {
				if( $w->disambiguated ) {
					push( @haslex, $w );
				} else {
					push( @needslex, $w );
				}
			}
			# Check that the lexeme forms match for the readings in @haslex
			unless( @haslex ) {
				say STDERR "Multiple same word readings with no disambiguation at rank $rank";
				next;
			}
			my $form;
			my $consistent = 1;
			foreach my $w ( @haslex ) {
				my $wf = join( '//', map { $_->form->to_string } $w->lexemes );
				# Use definedness, not truth, so that an empty or '0' form
				# string from the first reading still counts as the reference.
				$form //= $wf;
				unless( $wf eq $form ) {
					warn "Conflicting lexeme on $w at rank $rank";
					$consistent = 0;
				}
			}
			# @haslex is known to be non-empty here; its first member serves
			# as the donor for every reading that lacks lexemes.
			if( $consistent ) {
				my $ref = $haslex[0];
				foreach my $w ( @needslex ) {
					propagate_lexemes( $ref, $w );
				}
			}
		}
			
		# Compare each reading with every later reading at the same rank;
		# shifting the list as we go means each pair is visited once.
		while( @rankrdgs ) {
			my $r = shift @rankrdgs;
			next if $r->is_meta;
			next if $r->is_nonsense;
			next unless $r->has_lexemes;
			next if grep { !$_->is_disambiguated } $r->lexemes;
			my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes );
			my @rpos = map { $_->form->morphstr } $r->lexemes;
			foreach my $rdg ( @rankrdgs ) {
				next if $r eq $rdg;
				next if $rdg->is_nonsense;
				next unless $rdg->has_lexemes;
				next if grep { !$_->is_disambiguated } $rdg->lexemes;
				next if is_sameword( $c, $r, $rdg );
				# Do the grammatical link if applicable
				my $gram;
				if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem
					&& $rlem !~ /\<unknown\>/ ) {
					say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel",
						$r, $r->text, $rdg, $rdg->text );
					$c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } );
					$gram = 1;
				}
				
				# Do a punctuation link (instead of a lexical link) if applicable
				my $punct;
				if( $rdg->text =~ /^[[:punct:]]$/ && $r->text =~ /^[[:punct:]]$/ ) {
					say sprintf( "Linking %s (%s) and %s (%s) with punctuation rel",
						$r, $r->text, $rdg, $rdg->text );
					$c->add_relationship( $r, $rdg, { 'type' => 'punctuation' } );
					$punct = 1;
				}
				
				# Do the lexical link if applicable
				my @rdgpos = map { $_->form->morphstr } $rdg->lexemes;
				next unless @rpos == @rdgpos;
				my $lex = 1;
				foreach my $i ( 0 .. $#rpos ) {
					my $rst = Lingua::Features::Structure->from_string( $rpos[$i] );
					my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] );
					unless( $rst && $rdgst ) {
						# Parenthesise the ternary: '.' binds tighter than
						# '?:', so without the parentheses the message prefix
						# became the condition and was never printed.
						warn "Did not get morph structure from " . 
							( $rst ? $rdgpos[$i] : $rpos[$i] );
						next;
					}
					unless( $rst->is_compatible( $rdgst ) ) {
						$lex = 0;
					}
				}
				if( $lex && !$punct ) {
					if( $gram ) {
						warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)",
							$r, $r->text, $rdg, $rdg->text );
					} else {
						say sprintf( "Linking %s (%s) and %s (%s) with lexical rel",
							$r, $r->text, $rdg, $rdg->text );
						$c->add_relationship( $r, $rdg, { 'type' => 'lexical' } );
					}
				}
			}
		}
	}

	# Save the lot
	# print $c->as_svg( { nocalc => 1 } );
	$dir->save( $tradition );
}

# Report whether $rdg2 is among the readings related to $rdg1 through
# 'spelling' or 'orthographic' relationships in collation $c.
# Returns the number of matching related readings (0 when there are none).
sub is_sameword {
	my $c    = shift;
	my $rdg1 = shift;
	my $rdg2 = shift;
	my $is_same_type = sub {
		my $rel_type = $_[0]->type;
		return 1 if $rel_type eq 'spelling';
		return $_[0]->type eq 'orthographic';
	};
	my $hits = 0;
	foreach my $candidate ( $c->related_readings( $rdg1, $is_same_type ) ) {
		$hits++ if $candidate eq $rdg2;
	}
	return $hits;
}

# Copy the normal form and the lexemes of reading $from onto reading $to,
# announcing the copy on the currently selected output handle.
# Returns whatever $to->_deserialize_lexemes returns.
sub propagate_lexemes {
	my $from = shift;
	my $to   = shift;
	my $notice = sprintf( "Copying lexical form from %s (%s) to %s (%s)",
		$from, $from->text, $to, $to->text );
	say $notice;
	# The accessor calls stay inline so they run in the same (list) context
	# as arguments to the setter calls.
	$to->normal_form( $from->normal_form );
	return $to->_deserialize_lexemes( $from->_serialize_lexemes );
}
Commit	Line	Data
9328f93d	1	#!/usr/bin/env perl
	2
	3	use lib 'lib';
	4	use feature 'say';
	5	use strict;
	6	use warnings;
	7	use Getopt::Long;
	8	use Lingua::Features::Structure;
	9	use Text::Tradition::Directory;
	10	use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /;
	11	use TryCatch;
	12
	13	binmode STDOUT, ':utf8';
	14	binmode STDERR, ':utf8';
	15	eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 };
	16
	17	my( $dbuser, $dbpass );
	18	my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db';
	19
	20	GetOptions(
	21	'dsn=s' => \$dsn,
	22	'u\|user=s' => \$dbuser,
	23	'p\|pass=s' => \$dbpass,
	24	);
	25
	26	my $dbopts = { dsn => $dsn };
	27	$dbopts->{extra_args}->{user} = $dbuser if $dbuser;
	28	$dbopts->{extra_args}->{password} = $dbpass if $dbpass;
	29
	30	my $dir = Text::Tradition::Directory->new( $dbopts );
	31
	32	my $scope = $dir->new_scope();
	33	my $lookfor = $ARGV[0] \|\| '';
	34	foreach my $tinfo ( $dir->traditionlist() ) {
	35	next unless $tinfo->{'name'} =~ /$lookfor/ \|\| $tinfo->{'id'} eq $lookfor;
	36	my $tradition = $dir->lookup( $tinfo->{'id'} );
	37	say STDERR "Found " . $tradition->name;
	38	my $c = $tradition->collation;
	39	$c->_set_tradition( $tradition );
	40
	41	# Propagate lexeme forms across transposition links
	42	foreach my $rel ( $c->relationships ) {
	43	next unless $c->get_relationship( $rel )->type eq 'transposition';
	44	my $rdg_a = $c->reading( $rel->[0] );
	45	my $rdg_b = $c->reading( $rel->[1] );
	46	if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) {
	47	propagate_lexemes( $rdg_a, $rdg_b );
	48	} elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) {
	49	propagate_lexemes( $rdg_b, $rdg_a );
	50	} elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) {
	51	say STDERR "Transposition link with nothing disambiguated: @$rel";
	52	}
	53	}
	54
	55
	56	# Make the changes
	57	foreach my $rank ( 1 .. $c->end->rank - 1 ) {
	58	my @rankrdgs = $c->readings_at_rank( $rank );
	59	# Propagate lexemes and normal forms across spelling / orthographic links
	60	my %propagated;
	61	foreach my $r ( @rankrdgs ) {
	62	next if $propagated{$r->id};
	63	my @samewords = $c->related_readings( $r,
	64	sub { $_[0]->type eq 'spelling' \|\| $_[0]->type eq 'orthographic' } );
65	push( @samewords, $r );
66	map { $propagated{$_->id} = 1 } @samewords;
67	next if @samewords == 1;
68
69	my( @haslex, @needslex );
70	foreach my $w ( @samewords ) {
71	if( $w->disambiguated ) {
72	push( @haslex, $w );
73	} else {
74	push( @needslex, $w );
75	}
76	}
77	# Check that the lexeme forms match for the readings in @haslex
78	unless( @haslex ) {
79	say STDERR "Multiple same word readings with no disambiguation at rank $rank";
80	next;
81	}
82	my $form;
83	my $consistent = 1;
84	foreach my $w ( @haslex ) {
85	my $wf = join( '//', map { $_->form->to_string } $w->lexemes );
86	$form = $wf unless $form;
87	unless( $wf eq $form ) {
88	warn "Conflicting lexeme on $w at rank $rank";
89	$consistent = 0;
90	}
91	}
92	if( $consistent && @haslex ) {
93	my $ref = shift @haslex;
94	foreach my $w ( @needslex ) {
95	propagate_lexemes( $ref, $w );
96	}
97	}
98	}
99
100	while( @rankrdgs ) {
101	my $r = shift @rankrdgs;
102	next if $r->is_meta;
103	next if $r->is_nonsense;
104	next unless $r->has_lexemes;
105	next if grep { !$_->is_disambiguated } $r->lexemes;
106	my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes );
107	my @rpos = map { $_->form->morphstr } $r->lexemes;
108	foreach my $rdg ( @rankrdgs ) {
109	next if $r eq $rdg;
110	next if $rdg->is_nonsense;
111	next unless $rdg->has_lexemes;
112	next if grep { !$_->is_disambiguated } $rdg->lexemes;
113	next if is_sameword( $c, $r, $rdg );
7b7abf10	114	# Do the grammatical link if applicable
9328f93d	115	my $gram;
	116	if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem
	117	&& $rlem !~ /\<unknown\>/ ) {
	118	say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel",
	119	$r, $r->text, $rdg, $rdg->text );
	120	$c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } );
	121	$gram = 1;
	122	}
7b7abf10	123
	124	# Do a punctuation link (instead of a lexical link) if applicable
	125	my $punct;
	126	if( $rdg->text =~ /^[[:punct:]]$/ && $r->text =~ /^[[:punct:]]$/ ) {
	127	say sprintf( "Linking %s (%s) and %s (%s) with punctuation rel",
	128	$r, $r->text, $rdg, $rdg->text );
	129	$c->add_relationship( $r, $rdg, { 'type' => 'punctuation' } );
	130	$punct = 1;
	131	}
	132
	133	# Do the lexical link if applicable
9328f93d	134	my @rdgpos = map { $_->form->morphstr } $rdg->lexemes;
	135	next unless @rpos == @rdgpos;
	136	my $lex = 1;
	137	foreach my $i ( 0 .. $#rpos ) {
	138	my $rst = Lingua::Features::Structure->from_string( $rpos[$i] );
	139	my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] );
	140	unless( $rst && $rdgst ) {
	141	warn "Did not get morph structure from " .
	142	$rst ? $rdgpos[$i] : $rpos[$i];
	143	next;
	144	}
	145	unless( $rst->is_compatible( $rdgst ) ) {
	146	$lex = 0;
	147	}
	148	}
7b7abf10	149	if( $lex && !$punct ) {
9328f93d	150	if( $gram ) {
	151	warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)",
	152	$r, $r->text, $rdg, $rdg->text );
	153	} else {
	154	say sprintf( "Linking %s (%s) and %s (%s) with lexical rel",
	155	$r, $r->text, $rdg, $rdg->text );
	156	$c->add_relationship( $r, $rdg, { 'type' => 'lexical' } );
	157	}
	158	}
	159	}
	160	}
	161	}
	162
	163	# Save the lot
	164	# print $c->as_svg( { nocalc => 1 } );
	165	$dir->save( $tradition );
	166	}
	167
	168	sub is_sameword {
	169	my( $c, $rdg1, $rdg2 ) = @_;
	170	my @samewords = $c->related_readings( $rdg1,
	171	sub { $_[0]->type eq 'spelling' \|\| $_[0]->type eq 'orthographic' } );
	172	my @in_set = grep { $_ eq $rdg2 } @samewords;
	173	return scalar @in_set;
	174	}
	175
	176	sub propagate_lexemes {
	177	my( $from, $to ) = @_;
	178	say sprintf( "Copying lexical form from %s (%s) to %s (%s)",
	179	$from, $from->text, $to, $to->text );
	180	$to->normal_form( $from->normal_form );
	181	$to->_deserialize_lexemes( $from->_serialize_lexemes );
	182	}