From: Tara L Andrews Date: Sun, 15 Jul 2012 10:06:32 +0000 (+0200) Subject: load XML::LibXML only when required; handle global relationships more correctly;... X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=428bcf0bc79f77a7857b21ef881708faa792e33a load XML::LibXML only when required; handle global relationships more correctly; analysis logic updates --- diff --git a/Makefile.PL b/Makefile.PL index 8d23a73..33b8642 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -19,6 +19,7 @@ requires( 'KiokuDB::GC::Naive' ); requires( 'KiokuDB::TypeMap' ); requires( 'KiokuDB::TypeMap::Entry::Naive' ); requires( 'KiokuX::Model' ); +requires( 'KiokuX::User::Util' ); requires( 'Module::Load' ); requires( 'Moose' ); requires( 'Moose::Util::TypeConstraints' ); diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm index c51f3fd..1899952 100644 --- a/lib/Text/Tradition/Analysis.pm +++ b/lib/Text/Tradition/Analysis.pm @@ -2,13 +2,13 @@ package Text::Tradition::Analysis; use strict; use warnings; +use Algorithm::Diff; # for word similarity measure use Benchmark; use Encode qw/ encode_utf8 /; use Exporter 'import'; use Graph; use JSON qw/ encode_json decode_json /; use LWP::UserAgent; -use Text::LevenshteinXS qw/ distance /; use Text::Tradition; use Text::Tradition::Stemma; use TryCatch; @@ -202,6 +202,8 @@ sub run_analysis { $location->{'missing'} = [ keys %lmiss ]; # Run the extra analysis we need. + ## TODO We run through all the variants in this call, so + ## why not add the reading data there instead of here below? analyze_location( $tradition, $stemma, $location, \%lmiss ); my @layerwits; @@ -680,7 +682,6 @@ sub analyze_location { } } } - # Get the actual graph we should work with my $graph; try { @@ -715,6 +716,7 @@ sub analyze_location { # reading's evident parent(s). foreach my $rdghash ( @{$variant_row->{'readings'}} ) { my $rid = $rdghash->{'readingid'}; + my $rdg = $c->reading( $rid ); # Get the subgraph my $part = $subgraph->{$rid}; @@ -732,7 +734,11 @@ sub analyze_location { my @next; foreach my $wparent( @check ) { my $preading = $contig->{$wparent}; - if( $preading ) { + # IDP assigns all nodes, hypothetical included, to a reading + # in the case of genealogical sets. We prune non-necessary + # hypothetical readings, but they are still in $contig, so + # we account for that here. + if( $preading && $preading ne $rid ) { $rdgparents->{$preading} = 1; } else { push( @next, $graph->predecessors( $wparent ) ); @@ -750,22 +756,40 @@ sub analyze_location { if( $pobj ) { my $rel = $c->get_relationship( $p, $rdghash->{readingid} ); if( $rel ) { - $phash->{relation} = { type => $rel->type }; - if( $rel->has_annotation ) { - $phash->{relation}->{'annotation'} = $rel->annotation; + _add_to_hash( $rel, $phash ); + } elsif( $rdg ) { + # First check for a transposed relationship + if( $rdg->rank != $pobj->rank ) { + foreach my $ti ( $rdg->related_readings( 'transposition' ) ) { + next unless $ti->text eq $rdg->text; + $rel = $c->get_relationship( $ti, $pobj ); + if( $rel ) { + _add_to_hash( $rel, $phash, 1 ); + last; + } + } + unless( $rel ) { + foreach my $ti ( $pobj->related_readings( 'transposition' ) ) { + next unless $ti->text eq $pobj->text; + $rel = $c->get_relationship( $ti, $rdg ); + if( $rel ) { + _add_to_hash( $rel, $phash, 1 ); + last; + } + } + } } - } elsif( $rdghash->{readingid} eq '(omitted)' ) { - $phash->{relation} = { type => 'deletion' }; - } elsif( $rdghash->{text} ) { - # Check for sheer word similarity. - my $rtext = $rdghash->{text}; - my $ptext = $pobj->text; - my $min = length( $rtext ) > length( $ptext ) - ? length( $ptext ) : length( $rtext ); - my $distance = distance( $rtext, $ptext ); - if( $distance < $min ) { - $phash->{relation} = { type => 'wordsimilar' }; + unless( $rel ) { + # and then check for sheer word similarity. + my $rtext = $rdg->text; + my $ptext = $pobj->text; + if( similar( $rtext, $ptext ) ) { + # say STDERR "Words $rtext and $ptext judged similar"; + $phash->{relation} = { type => 'wordsimilar' }; + } } + } else { + $phash->{relation} = { type => 'deletion' }; } # Get the attributes of the parent object while we are here $phash->{'text'} = $pobj->text if $pobj; @@ -807,6 +831,51 @@ sub analyze_location { } } +sub _add_to_hash { + my( $rel, $phash, $is_transposed ) = @_; + $phash->{relation} = { type => $rel->type }; + $phash->{relation}->{transposed} = 1 if $is_transposed; + $phash->{relation}->{annotation} = $rel->annotation + if $rel->has_annotation; +} + +=head2 similar( $word1, $word2 ) + +Use Algorithm::Diff to get a sense of how close the words are to each other. +This will hopefully handle substitutions a bit more nicely than Levenshtein. + +=cut + +#!/usr/bin/env perl + +sub similar { + my( $word1, $word2 ) = sort { length($a) <=> length($b) } @_; + my @let1 = split( '', lc( $word1 ) ); + my @let2 = split( '', lc( $word2 ) ); + my $diff = Algorithm::Diff->new( \@let1, \@let2 ); + my $mag = 0; + while( $diff->Next ) { + if( $diff->Same ) { + # Take off points for longer strings + my $cs = $diff->Range(1) - 2; + $cs = 0 if $cs < 0; + $mag -= $cs; + } elsif( !$diff->Items(1) ) { + $mag += $diff->Range(2); + } elsif( !$diff->Items(2) ) { + $mag += $diff->Range(1); + } else { + # Split the difference for substitutions + my $c1 = $diff->Range(1) || 1; + my $c2 = $diff->Range(2) || 1; + my $cd = ( $c1 + $c2 ) / 2; + $mag += $cd; + } + } + return ( $mag <= length( $word1 ) / 2 ); +} + + =head2 perl_solver( $tradition, $rank, $stemma_id, @merge_relationship_types ) diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 5b3cd96..dda8485 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -11,8 +11,6 @@ use Text::Tradition::Collation::Reading; use Text::Tradition::Collation::RelationshipStore; use Text::Tradition::Error; use XML::Easy::Syntax qw( $xml10_namestartchar_rx $xml10_namechar_rx ); -use XML::LibXML; -use XML::LibXML::XPathContext; use Moose; has 'sequence' => ( @@ -448,6 +446,15 @@ sub compress_readings { # readings. my %gobbled; foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) { + # While we are here, get rid of any extra wordforms from a disambiguated + # reading. + if( $rdg->disambiguated ) { + foreach my $lex ( $rdg->lexemes ) { + $lex->clear_matching_forms(); + $lex->add_matching_form( $lex->form ); + } + } + # Now look for readings that can be joined to their successors. next if $rdg->is_meta; next if $gobbled{$rdg->id}; next if $rdg->grammar_invalid || $rdg->is_nonsense; @@ -1021,6 +1028,7 @@ sub as_graphml { 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'; # Create the document and root node + require XML::LibXML; my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" ); my $root = $graphml->createElementNS( $graphml_ns, 'graphml' ); $graphml->setDocumentElement( $root ); diff --git a/lib/Text/Tradition/Collation/RelationshipStore.pm b/lib/Text/Tradition/Collation/RelationshipStore.pm index a5a9529..4342bd9 100644 --- a/lib/Text/Tradition/Collation/RelationshipStore.pm +++ b/lib/Text/Tradition/Collation/RelationshipStore.pm @@ -232,8 +232,12 @@ between the two reading strings. Returns undef if there is no general relationsh sub scoped_relationship { my( $self, $rdga, $rdgb ) = @_; my( $first, $second ) = sort( $rdga, $rdgb ); + my( $lcfirst, $lcsecond ) = sort( lc( $rdga ), lc( $rdgb ) ); if( exists $self->scopedrels->{$first}->{$second} ) { return $self->scopedrels->{$first}->{$second}; + } elsif( exists $self->scopedrels->{$lcfirst}->{$lcsecond} ) { + my $rel = $self->scopedrels->{$lcfirst}->{$lcsecond}; + return $rel->type ne 'orthographic' ? $rel : undef; } else { return undef; } @@ -419,10 +423,8 @@ sub add_relationship { if( $options->{'scope'} ne 'local' ) { # Is there a relationship with this a & b already? # Case-insensitive for non-orthographics. - my $rdga = $options->{'type'} eq 'orthographic' - ? $options->{'reading_a'} : lc( $options->{'reading_a'} ); - my $rdgb = $options->{'type'} eq 'orthographic' - ? $options->{'reading_b'} : lc( $options->{'reading_b'} ); + my $rdga = $options->{'reading_a'}; + my $rdgb = $options->{'reading_b'}; my $otherrel = $self->scoped_relationship( $rdga, $rdgb ); if( $otherrel && $otherrel->type eq $options->{type} && $otherrel->scope eq $options->{scope} ) { @@ -436,12 +438,6 @@ sub add_relationship { } - # Find all the pairs for which we need to set the relationship. - my @vectors; - if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) { - push( @vectors, $self->_find_applicable( $relationship ) ); - } - # Now set the relationship(s). my @pairs_set; my $rel = $self->get_relationship( $source, $target ); @@ -464,19 +460,58 @@ sub add_relationship { $self->_set_relationship( $relationship, $source, $target ) unless $skip; push( @pairs_set, [ $source, $target ] ); - # Set any additional relationships that might be in @vectors. - foreach my $v ( @vectors ) { - next if $v->[0] eq $source && $v->[1] eq $target; - next if $v->[1] eq $source && $v->[0] eq $target; - my @added = $self->add_relationship( @$v, $relationship ); - push( @pairs_set, @added ); + # Find all the pairs for which we need to set the relationship. + if( $relationship->colocated && $relationship->nonlocal && !$thispaironly ) { + push( @pairs_set, $self->add_global_relationship( $relationship ) ); } - # Finally, restore whatever collations we can, and return. $self->_restore_collations( @$droppedcolls ); return @pairs_set; } +=head2 add_global_relationship( $options, $skipvector ) + +Adds the relationship specified wherever the relevant readings appear together +in the graph. Options as in add_relationship above. + +=cut + +sub add_global_relationship { + my( $self, $options ) = @_; + # First see if we are dealing with a relationship object already + my $relationship; + if( ref( $options ) eq 'Text::Tradition::Collation::Relationship' ) { + $relationship = $options; + } else { + # Then see if a scoped relationship already applies for the words. + my $scopedrel = $self->scoped_relationship( + $options->{reading_a}, $options->{reading_b} ); + $relationship = $scopedrel ? $scopedrel + : $self->create( $options ); + } + # Sanity checking + throw( "Relationship passed to add_global is not global" ) + unless $relationship->nonlocal; + throw( "Relationship passed to add_global is not a valid global type" ) + unless $relationship->colocated && $relationship->type ne 'collated'; + + # Apply the relationship wherever it is valid + my @pairs_set; + foreach my $v ( $self->_find_applicable( $relationship ) ) { + my $exists = $self->get_relationship( @$v ); + if( $exists && $exists->type ne 'collated' ) { + throw( "Found conflicting relationship at @$v" ) + unless $exists->type eq $relationship->type + && $exists->scope eq $relationship->scope; + } else { + my @added = $self->add_relationship( @$v, $relationship ); + push( @pairs_set, @added ); + } + } + return @pairs_set; +} + + =head2 del_scoped_relationship( $reading_a, $reading_b ) Returns the general (document-level or global) relationship that has been defined diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index d6f9646..661c205 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -8,7 +8,6 @@ use Graph::Reader::Dot; use IPC::Run qw/ run binary /; use Text::Tradition::Error; use Text::Tradition::StemmaUtil qw/ character_input phylip_pars parse_newick /; -use XML::LibXML; use Moose; =head1 NAME @@ -347,9 +346,10 @@ sub as_svg { run( \@cmd, ">", binary(), \$svg ); # HACK: Parse the SVG and change the dimensions. # Get rid of width and height attributes to allow scaling. - my $parser = XML::LibXML->new(); - my $svgdoc = $parser->parse_string( decode_utf8( $svg ) ); if( $opts->{'size'} ) { + require XML::LibXML; + my $parser = XML::LibXML->new(); + my $svgdoc = $parser->parse_string( decode_utf8( $svg ) ); my( $ew, $eh ) = @{$opts->{'size'}}; # If the graph is wider than it is tall, set width to ew and remove height. # Otherwise set height to eh and remove width. @@ -369,9 +369,10 @@ sub as_svg { } $svgdoc->documentElement->removeAttribute( $remove ); $svgdoc->documentElement->setAttribute( $keep, $val ); + $svg = $svgdoc->toString(); } # Return the result - return decode_utf8( $svgdoc->toString ); + return decode_utf8( $svg ); } =head2 witnesses diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index 3113e54..ab5b2f4 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -348,6 +348,7 @@ sub _init_from_xmldesc { } $xmlobj = $self->object; } else { + require XML::LibXML; my $parser = XML::LibXML->new(); my $parsersub = $self->has_file ? 'parse_file' : 'parse_string'; try { diff --git a/script/join_readings.pl b/script/join_readings.pl index a5ef593..31f780a 100755 --- a/script/join_readings.pl +++ b/script/join_readings.pl @@ -32,8 +32,10 @@ my $dir = Text::Tradition::Directory->new( $dbopts ); my $scope = $dir->new_scope(); my $lookfor = $ARGV[0] || ''; foreach my $tinfo ( $dir->traditionlist() ) { + next if $tinfo->{'name'} eq 'xxxxx'; next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; my $tradition = $dir->lookup( $tinfo->{'id'} ); + say "Looking at tradition " . $tradition->name; my $c = $tradition->collation; # Anywhere in the graph that there is a reading that joins only to a single diff --git a/script/propagate_transitive.pl b/script/propagate_transitive.pl index 2d49467..22cfee8 100755 --- a/script/propagate_transitive.pl +++ b/script/propagate_transitive.pl @@ -40,8 +40,10 @@ my $dir = Text::Tradition::Directory->new( $dbopts ); my $scope = $dir->new_scope(); my $lookfor = $ARGV[0] || ''; foreach my $tinfo ( $dir->traditionlist() ) { + next if $tinfo->{'name'} eq 'xxxxx'; next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; my $tradition = $dir->lookup( $tinfo->{'id'} ); + say "Looking at tradition " . $tradition->name; my $c = $tradition->collation; my $represented_by = {}; @@ -58,16 +60,22 @@ foreach my $tinfo ( $dir->traditionlist() ) { # the same word. foreach my $rel ( $c->relationships ) { my $relobj = $c->get_relationship( $rel ); - next unless $relobj->type =~ /^(grammatical|lexical)$/; - my $r1pool = $represented_by->{$representative->{$rel->[0]}}; - my $r2pool = $represented_by->{$representative->{$rel->[1]}}; - # Error check - if( check_distinct( $r1pool, $r2pool ) ) { - map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool; - } else { - warn "Pools not distinct for " . join( ' and ', @$rel ); + if( $relobj->type =~ /^(grammatical|lexical)$/ ) { + my $r1pool = $represented_by->{$representative->{$rel->[0]}}; + my $r2pool = $represented_by->{$representative->{$rel->[1]}}; + # Error check + if( check_distinct( $r1pool, $r2pool ) ) { + map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool; + } else { + warn "Pools not distinct for " . join( ' and ', @$rel ); + } + } elsif( $relobj->type eq 'transposition' ) { + # We also need to propagate transposition links, but rather more strictly. + propagate_rel( $c, 'transposition', map { $c->reading( $_ ) } @$rel ); } } + + $dir->save( $tradition ) unless $testrun; } @@ -77,14 +85,28 @@ sub propagate_rel { while( @list ) { foreach my $r ( @list ) { next if $curr eq $r; - my $hasrel = $c->get_relationship( $curr, $r ); - if( !$hasrel || $hasrel->type eq 'collated' ) { - say STDERR "Propagating $type relationship $curr -> $r"; - $c->add_relationship( $curr, $r, { type => $type } ); - } elsif( $hasrel->type ne $type ) { - warn "Found relationship conflict at $curr / $r: " - . $hasrel->type . " instead of $type" - unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} ); + # Check that the given relationship type exists between $curr and $r. + # Also check that the given relationship type exists between $curr and + # the same-type relationships of $r. + my @candidates = ( $r ); + foreach my $rrel ( $r->related_readings() ) { + next if $curr eq $rrel; + my $rrelobj = $c->get_relationship( $r, $rrel ); + if( $rrelobj && $rrelobj->type eq $type ) { + my $hasrrel = $c->get_relationship( $curr, $rrel ); + push( @candidates, $rrel ) unless $hasrrel; + } + } + foreach my $cand ( @candidates ) { + my $hasrel = $c->get_relationship( $curr, $cand ); + if( !$hasrel || $hasrel->type eq 'collated' ) { + say STDERR "Propagating $type relationship $curr -> $cand"; + $c->add_relationship( $curr, $cand, { type => $type } ); + } elsif( $hasrel->type ne $type ) { + warn "Found relationship conflict at $curr / $cand: " + . $hasrel->type . " instead of $type" + unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} ); + } } } $curr = shift @list; diff --git a/t/analysis.t b/t/analysis.t index 30c3852..b9e9d83 100755 --- a/t/analysis.t +++ b/t/analysis.t @@ -130,15 +130,15 @@ foreach my $row ( @{$results->{'variants'}} ) { my %is_parent; my @has_no_parent; foreach my $rdg ( @{$row->{'readings'}} ) { - my $parents = $rdg->{'reading_parents'} || []; - foreach my $p ( @$parents ) { + my $parents = $rdg->{'reading_parents'} || {}; + foreach my $p ( keys %$parents ) { push( @{$is_parent{$p}}, $rdg->{'readingid'} ); } - push( @has_no_parent, $rdg->{'readingid'} ) unless @$parents; + push( @has_no_parent, $rdg->{'readingid'} ) unless keys %$parents; } # Test some stuff foreach my $rdg ( @{$row->{'readings'}} ) { - is( $rdg->{'independent_occurrence'}, 1, + is( @{$rdg->{'independent_occurrence'}}, 1, "Genealogical reading originates exactly once" ); } is( @has_no_parent, 1, "Only one genealogical reading lacks a parent" );