requires( 'KiokuDB::TypeMap' );
requires( 'KiokuDB::TypeMap::Entry::Naive' );
requires( 'KiokuX::Model' );
+requires( 'KiokuX::User::Util' );
requires( 'Module::Load' );
requires( 'Moose' );
requires( 'Moose::Util::TypeConstraints' );
use strict;
use warnings;
+use Algorithm::Diff; # for word similarity measure
use Benchmark;
use Encode qw/ encode_utf8 /;
use Exporter 'import';
use Graph;
use JSON qw/ encode_json decode_json /;
use LWP::UserAgent;
-use Text::LevenshteinXS qw/ distance /;
use Text::Tradition;
use Text::Tradition::Stemma;
use TryCatch;
$location->{'missing'} = [ keys %lmiss ];
# Run the extra analysis we need.
+ ## TODO We run through all the variants in this call, so
+ ## why not add the reading data there instead of here below?
analyze_location( $tradition, $stemma, $location, \%lmiss );
my @layerwits;
}
}
}
-
# Get the actual graph we should work with
my $graph;
try {
# reading's evident parent(s).
foreach my $rdghash ( @{$variant_row->{'readings'}} ) {
my $rid = $rdghash->{'readingid'};
+ my $rdg = $c->reading( $rid );
# Get the subgraph
my $part = $subgraph->{$rid};
my @next;
foreach my $wparent( @check ) {
my $preading = $contig->{$wparent};
- if( $preading ) {
+ # IDP assigns all nodes, hypothetical included, to a reading
+ # in the case of genealogical sets. We prune non-necessary
+ # hypothetical readings, but they are still in $contig, so
+ # we account for that here.
+ if( $preading && $preading ne $rid ) {
$rdgparents->{$preading} = 1;
} else {
push( @next, $graph->predecessors( $wparent ) );
if( $pobj ) {
my $rel = $c->get_relationship( $p, $rdghash->{readingid} );
if( $rel ) {
- $phash->{relation} = { type => $rel->type };
- if( $rel->has_annotation ) {
- $phash->{relation}->{'annotation'} = $rel->annotation;
+ _add_to_hash( $rel, $phash );
+ } elsif( $rdg ) {
+ # First check for a transposed relationship
+ if( $rdg->rank != $pobj->rank ) {
+ foreach my $ti ( $rdg->related_readings( 'transposition' ) ) {
+ next unless $ti->text eq $rdg->text;
+ $rel = $c->get_relationship( $ti, $pobj );
+ if( $rel ) {
+ _add_to_hash( $rel, $phash, 1 );
+ last;
+ }
+ }
+ unless( $rel ) {
+ foreach my $ti ( $pobj->related_readings( 'transposition' ) ) {
+ next unless $ti->text eq $pobj->text;
+ $rel = $c->get_relationship( $ti, $rdg );
+ if( $rel ) {
+ _add_to_hash( $rel, $phash, 1 );
+ last;
+ }
+ }
+ }
}
- } elsif( $rdghash->{readingid} eq '(omitted)' ) {
- $phash->{relation} = { type => 'deletion' };
- } elsif( $rdghash->{text} ) {
- # Check for sheer word similarity.
- my $rtext = $rdghash->{text};
- my $ptext = $pobj->text;
- my $min = length( $rtext ) > length( $ptext )
- ? length( $ptext ) : length( $rtext );
- my $distance = distance( $rtext, $ptext );
- if( $distance < $min ) {
- $phash->{relation} = { type => 'wordsimilar' };
+ unless( $rel ) {
+ # and then check for sheer word similarity.
+ my $rtext = $rdg->text;
+ my $ptext = $pobj->text;
+ if( similar( $rtext, $ptext ) ) {
+ # say STDERR "Words $rtext and $ptext judged similar";
+ $phash->{relation} = { type => 'wordsimilar' };
+ }
}
+ } else {
+ $phash->{relation} = { type => 'deletion' };
}
# Get the attributes of the parent object while we are here
$phash->{'text'} = $pobj->text if $pobj;
}
}
+sub _add_to_hash {
+ my( $rel, $phash, $is_transposed ) = @_;
+ $phash->{relation} = { type => $rel->type };
+ $phash->{relation}->{transposed} = 1 if $is_transposed;
+ $phash->{relation}->{annotation} = $rel->annotation
+ if $rel->has_annotation;
+}
+
+=head2 similar( $word1, $word2 )
+
+Use Algorithm::Diff to get a sense of how close the words are to each other.
+This will hopefully handle substitutions a bit more nicely than Levenshtein.
+
+=cut
+
+#!/usr/bin/env perl
+
+sub similar {
+ my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_;
+ my @let1 = split( '', lc( $word1 ) );
+ my @let2 = split( '', lc( $word2 ) );
+ my $diff = Algorithm::Diff->new( \@let1, \@let2 );
+ my $mag = 0;
+ while( $diff->Next ) {
+ if( $diff->Same ) {
+ # Take off points for longer strings
+ my $cs = $diff->Range(1) - 2;
+ $cs = 0 if $cs < 0;
+ $mag -= $cs;
+ } elsif( !$diff->Items(1) ) {
+ $mag += $diff->Range(2);
+ } elsif( !$diff->Items(2) ) {
+ $mag += $diff->Range(1);
+ } else {
+ # Split the difference for substitutions
+ my $c1 = $diff->Range(1) || 1;
+ my $c2 = $diff->Range(2) || 1;
+ my $cd = ( $c1 + $c2 ) / 2;
+ $mag += $cd;
+ }
+ }
+ return ( $mag <= length( $word1 ) / 2 );
+}
+
+
=head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types )
use Text::Tradition::Collation::RelationshipStore;
use Text::Tradition::Error;
use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx );
-use XML::LibXML;
-use XML::LibXML::XPathContext;
use Moose;
has 'sequence' => (
# readings.
my %gobbled;
foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) {
+ # While we are here, get rid of any extra wordforms from a disambiguated
+ # reading.
+ if( $rdg->disambiguated ) {
+ foreach my $lex ( $rdg->lexemes ) {
+ $lex->clear_matching_forms();
+ $lex->add_matching_form( $lex->form );
+ }
+ }
+ # Now look for readings that can be joined to their successors.
next if $rdg->is_meta;
next if $gobbled{$rdg->id};
next if $rdg->grammar_invalid || $rdg->is_nonsense;
'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
# Create the document and root node
+ require XML::LibXML;
my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
my $root = $graphml->createElementNS( $graphml_ns, 'graphml' );
$graphml->setDocumentElement( $root );
sub scoped_relationship {
my( $self, $rdga, $rdgb ) = @_;
my( $first, $second ) = sort( $rdga, $rdgb );
+ my( $lcfirst, $lcsecond ) = sort( lc( $rdga ), lc( $rdgb ) );
if( exists $self->scopedrels->{$first}->{$second} ) {
return $self->scopedrels->{$first}->{$second};
+ } elsif( exists $self->scopedrels->{$lcfirst}->{$lcsecond} ) {
+ my $rel = $self->scopedrels->{$lcfirst}->{$lcsecond};
+ return $rel->type ne 'orthographic' ? $rel : undef;
} else {
return undef;
}
if( $options->{'scope'} ne 'local' ) {
# Is there a relationship with this a & b already?
# Case-insensitive for non-orthographics.
- my $rdga = $options->{'type'} eq 'orthographic'
- ? $options->{'reading_a'} : lc( $options->{'reading_a'} );
- my $rdgb = $options->{'type'} eq 'orthographic'
- ? $options->{'reading_b'} : lc( $options->{'reading_b'} );
+ my $rdga = $options->{'reading_a'};
+ my $rdgb = $options->{'reading_b'};
my $otherrel = $self->scoped_relationship( $rdga, $rdgb );
if( $otherrel && $otherrel->type eq $options->{type}
&& $otherrel->scope eq $options->{scope} ) {
}
- # Find all the pairs for which we need to set the relationship.
- my @vectors;
- if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
- push( @vectors, $self->_find_applicable( $relationship ) );
- }
-
# Now set the relationship(s).
my @pairs_set;
my $rel = $self->get_relationship( $source, $target );
$self->_set_relationship( $relationship, $source, $target ) unless $skip;
push( @pairs_set, [ $source, $target ] );
- # Set any additional relationships that might be in @vectors.
- foreach my $v ( @vectors ) {
- next if $v->[0] eq $source && $v->[1] eq $target;
- next if $v->[1] eq $source && $v->[0] eq $target;
- my @added = $self->add_relationship( @$v, $relationship );
- push( @pairs_set, @added );
+ # Find all the pairs for which we need to set the relationship.
+ if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) {
+ push( @pairs_set, $self->add_global_relationship( $relationship ) );
}
-
# Finally, restore whatever collations we can, and return.
$self->_restore_collations( @$droppedcolls );
return @pairs_set;
}
+=head2 add_global_relationship( $options, $skipvector )
+
+Adds the relationship specified wherever the relevant readings appear together
+in the graph. Options as in add_relationship above.
+
+=cut
+
+sub add_global_relationship {
+ my( $self, $options ) = @_;
+ # First see if we are dealing with a relationship object already
+ my $relationship;
+ if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) {
+ $relationship = $options;
+ } else {
+ # Then see if a scoped relationship already applies for the words.
+ my $scopedrel = $self->scoped_relationship(
+ $options->{reading_a}, $options->{reading_b} );
+ $relationship = $scopedrel ? $scopedrel
+ : $self->create( $options );
+ }
+ # Sanity checking
+ throw( "Relationship passed to add_global is not global" )
+ unless $relationship->nonlocal;
+ throw( "Relationship passed to add_global is not a valid global type" )
+ unless $relationship->colocated && $relationship->type ne 'collated';
+
+ # Apply the relationship wherever it is valid
+ my @pairs_set;
+ foreach my $v ( $self->_find_applicable( $relationship ) ) {
+ my $exists = $self->get_relationship( @$v );
+ if( $exists && $exists->type ne 'collated' ) {
+ throw( "Found conflicting relationship at @$v" )
+ unless $exists->type eq $relationship->type
+ && $exists->scope eq $relationship->scope;
+ } else {
+ my @added = $self->add_relationship( @$v, $relationship );
+ push( @pairs_set, @added );
+ }
+ }
+ return @pairs_set;
+}
+
+
=head2 del_scoped_relationship( $reading_a, $reading_b )
Returns the general (document-level or global) relationship that has been defined
use IPC::Run qw/ run binary /;
use Text::Tradition::Error;
use Text::Tradition::StemmaUtil qw/ character_input phylip_pars parse_newick /;
-use XML::LibXML;
use Moose;
=head1 NAME
run( \@cmd, ">", binary(), \$svg );
# HACK: Parse the SVG and change the dimensions.
# Get rid of width and height attributes to allow scaling.
- my $parser = XML::LibXML->new();
- my $svgdoc = $parser->parse_string( decode_utf8( $svg ) );
if( $opts->{'size'} ) {
+ require XML::LibXML;
+ my $parser = XML::LibXML->new();
+ my $svgdoc = $parser->parse_string( decode_utf8( $svg ) );
my( $ew, $eh ) = @{$opts->{'size'}};
# If the graph is wider than it is tall, set width to ew and remove height.
# Otherwise set height to eh and remove width.
}
$svgdoc->documentElement->removeAttribute( $remove );
$svgdoc->documentElement->setAttribute( $keep, $val );
+ $svg = $svgdoc->toString();
}
# Return the result
- return decode_utf8( $svgdoc->toString );
+ return decode_utf8( $svg );
}
=head2 witnesses
}
$xmlobj = $self->object;
} else {
+ require XML::LibXML;
my $parser = XML::LibXML->new();
my $parsersub = $self->has_file ? 'parse_file' : 'parse_string';
try {
my $scope = $dir->new_scope();
my $lookfor = $ARGV[0] || '';
foreach my $tinfo ( $dir->traditionlist() ) {
+ next if $tinfo->{'name'} eq 'xxxxx';
next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
my $tradition = $dir->lookup( $tinfo->{'id'} );
+ say "Looking at tradition " . $tradition->name;
my $c = $tradition->collation;
# Anywhere in the graph that there is a reading that joins only to a single
my $scope = $dir->new_scope();
my $lookfor = $ARGV[0] || '';
foreach my $tinfo ( $dir->traditionlist() ) {
+ next if $tinfo->{'name'} eq 'xxxxx';
next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor;
my $tradition = $dir->lookup( $tinfo->{'id'} );
+ say "Looking at tradition " . $tradition->name;
my $c = $tradition->collation;
my $represented_by = {};
# the same word.
foreach my $rel ( $c->relationships ) {
my $relobj = $c->get_relationship( $rel );
- next unless $relobj->type =~ /^(grammatical|lexical)$/;
- my $r1pool = $represented_by->{$representative->{$rel->[0]}};
- my $r2pool = $represented_by->{$representative->{$rel->[1]}};
- # Error check
- if( check_distinct( $r1pool, $r2pool ) ) {
- map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool;
- } else {
- warn "Pools not distinct for " . join( ' and ', @$rel );
+ if( $relobj->type =~ /^(grammatical|lexical)$/ ) {
+ my $r1pool = $represented_by->{$representative->{$rel->[0]}};
+ my $r2pool = $represented_by->{$representative->{$rel->[1]}};
+ # Error check
+ if( check_distinct( $r1pool, $r2pool ) ) {
+ map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool;
+ } else {
+ warn "Pools not distinct for " . join( ' and ', @$rel );
+ }
+ } elsif( $relobj->type eq 'transposition' ) {
+ # We also need to propagate transposition links, but rather more strictly.
+ propagate_rel( $c, 'transposition', map { $c->reading( $_ ) } @$rel );
}
}
+
+
$dir->save( $tradition ) unless $testrun;
}
while( @list ) {
foreach my $r ( @list ) {
next if $curr eq $r;
- my $hasrel = $c->get_relationship( $curr, $r );
- if( !$hasrel || $hasrel->type eq 'collated' ) {
- say STDERR "Propagating $type relationship $curr -> $r";
- $c->add_relationship( $curr, $r, { type => $type } );
- } elsif( $hasrel->type ne $type ) {
- warn "Found relationship conflict at $curr / $r: "
- . $hasrel->type . " instead of $type"
- unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} );
+ # Check that the given relationship type exists between $curr and $r.
+ # Also check that the given relationship type exists between $curr and
+ # the same-type relationships of $r.
+ my @candidates = ( $r );
+ foreach my $rrel ( $r->related_readings() ) {
+ next if $curr eq $rrel;
+ my $rrelobj = $c->get_relationship( $r, $rrel );
+ if( $rrelobj && $rrelobj->type eq $type ) {
+ my $hasrrel = $c->get_relationship( $curr, $rrel );
+ push( @candidates, $rrel ) unless $hasrrel;
+ }
+ }
+ foreach my $cand ( @candidates ) {
+ my $hasrel = $c->get_relationship( $curr, $cand );
+ if( !$hasrel || $hasrel->type eq 'collated' ) {
+ say STDERR "Propagating $type relationship $curr -> $cand";
+ $c->add_relationship( $curr, $cand, { type => $type } );
+ } elsif( $hasrel->type ne $type ) {
+ warn "Found relationship conflict at $curr / $cand: "
+ . $hasrel->type . " instead of $type"
+ unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} );
+ }
}
}
$curr = shift @list;
my %is_parent;
my @has_no_parent;
foreach my $rdg ( @{$row->{'readings'}} ) {
- my $parents = $rdg->{'reading_parents'} || [];
- foreach my $p ( @$parents ) {
+ my $parents = $rdg->{'reading_parents'} || {};
+ foreach my $p ( keys %$parents ) {
push( @{$is_parent{$p}}, $rdg->{'readingid'} );
}
- push( @has_no_parent, $rdg->{'readingid'} ) unless @$parents;
+ push( @has_no_parent, $rdg->{'readingid'} ) unless keys %$parents;
}
# Test some stuff
foreach my $rdg ( @{$row->{'readings'}} ) {
- is( $rdg->{'independent_occurrence'}, 1,
+ is( @{$rdg->{'independent_occurrence'}}, 1,
"Genealogical reading originates exactly once" );
}
is( @has_no_parent, 1, "Only one genealogical reading lacks a parent" );