From: Tara L Andrews Date: Sat, 14 Jul 2012 17:52:13 +0000 (+0200) Subject: Merge branch 'authentication' of github.com:tla/stemmatology X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=9931b0081b6caf73044b6694be070092045456ee;hp=d5033c7aa090d66ca7ac82bd81fb2be11a3c80c6 Merge branch 'authentication' of github.com:tla/stemmatology --- diff --git a/.gitignore b/.gitignore index 7e8c749..484a79c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *~ *.bbprojectd/ +t/var data !/t/data Makefile diff --git a/Makefile.PL b/Makefile.PL index ba2ecf5..8d23a73 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -30,5 +30,16 @@ requires( 'TryCatch' ); requires( 'XML::Easy::Syntax' ); requires( 'XML::LibXML' ); requires( 'XML::LibXML::XPathContext' ); +requires( 'YAML::XS' ); +requires( 'namespace::clean' ); +# For the morphology stuff +requires( 'Lingua::TagSet::Multext' ); +requires( 'Lingua::TagSet::TreeTagger::French' ); +requires( 'Lingua::TagSet::TreeTagger::English' ); +requires( 'Lingua::Features::Structure' ); +build_requires( 'Data::Dump' ); build_requires( 'Test::Warn' ); +# Modules needed for morphology but not trivially CPANnable +recommends( 'Lingua::TreeTagger' ); +recommends( 'Flemm' ); &WriteAll; diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index e135da9..ce64e1d 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -54,6 +54,13 @@ has 'stemmata' => ( }, default => sub { [] }, ); + +has '_initialized' => ( + is => 'ro', + isa => 'Bool', + default => undef, + writer => '_init_done', + ); has 'user' => ( is => 'rw', @@ -304,6 +311,7 @@ sub BUILD { $mod->can('parse')->( $self, $init_args ); } } + $self->_init_done( 1 ); return $self; } diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm index 7777c6c..c51f3fd 100644 --- a/lib/Text/Tradition/Analysis.pm +++ b/lib/Text/Tradition/Analysis.pm @@ -8,8 +8,10 @@ use Exporter 'import'; use Graph; use JSON qw/ encode_json decode_json /; use LWP::UserAgent; +use Text::LevenshteinXS qw/ distance /; use Text::Tradition; use Text::Tradition::Stemma; +use TryCatch; use vars qw/ @EXPORT_OK /; @EXPORT_OK = qw/ run_analysis group_variants analyze_variant_location wit_stringify /; @@ -166,9 +168,13 @@ sub run_analysis { my @groups; my @use_ranks; my %lacunae; + my $moved = {}; foreach my $rank ( @ranks ) { my $missing = [ @lacunose ]; - my $rankgroup = group_variants( $tradition, $rank, $missing, \@collapse ); + my $rankgroup = group_variants( $tradition, $rank, $missing, $moved, \@collapse ); + # Filter out any empty rankgroups + # (e.g. from the later rank for a transposition) + next unless keys %$rankgroup; if( $opts{'exclude_type1'} ) { # Check to see whether this is a "useful" group. my( $rdgs, $grps ) = _useful_variant( $rankgroup, @@ -188,47 +194,42 @@ sub run_analysis { foreach my $idx ( 0 .. $#use_ranks ) { my $location = $answer->{'variants'}->[$idx]; # Add the rank back in - $location->{'id'} = $use_ranks[$idx]; + my $rank = $use_ranks[$idx]; + $location->{'id'} = $rank; # Note what our lacunae are my %lmiss; map { $lmiss{$_} = 1 } @{$lacunae{$use_ranks[$idx]}}; - # Run through the reading groups and add as 'lacunae' any redundant - # a.c. witnesses (yes, we have to do this before the analysis, thus - # identical loops before and after. Boo.) - # TODO Consider making these callbacks to analyze_location - foreach my $rdghash ( @{$location->{'readings'}} ) { - my %rwits; - map { $rwits{$_} = 1 } @{$rdghash->{'group'}}; - foreach my $rw ( keys %rwits ) { - if( $rw =~ /^(.*)\Q$aclabel\E$/ ) { - if( exists $rwits{$1} ) { - $lmiss{$rw} = 1; - delete $rwits{$rw}; - } - } - } - $rdghash->{'group'} = [ keys %rwits ]; - } $location->{'missing'} = [ keys %lmiss ]; # Run the extra analysis we need. - analyze_location( $tradition, $stemma->graph, $location ); + analyze_location( $tradition, $stemma, $location, \%lmiss ); + my @layerwits; # Do the final post-analysis tidying up of the data. foreach my $rdghash ( @{$location->{'readings'}} ) { $conflict_count++ if exists $rdghash->{'conflict'} && $rdghash->{'conflict'}; - # Add the reading text back in + # Add the reading text back in, setting display value as needed my $rdg = $c->reading( $rdghash->{'readingid'} ); - $rdghash->{'text'} = $rdg ? $rdg->text : $rdghash->{'readingid'}; + if( $rdg ) { + $rdghash->{'text'} = $rdg->text . + ( $rdg->rank == $rank ? '' : ' [' . $rdg->rank . ']' ); + $rdghash->{'is_ungrammatical'} = $rdg->grammar_invalid; + $rdghash->{'is_nonsense'} = $rdg->is_nonsense; + } # Remove lacunose witnesses from this reading's list now that the # analysis is done my @realgroup; map { push( @realgroup, $_ ) unless $lmiss{$_} } @{$rdghash->{'group'}}; $rdghash->{'group'} = \@realgroup; - # TODO Record hypotheticals used to create group, if we end up - # needing it + # Note any layered witnesses that appear in this group + foreach( @realgroup ) { + if( $_ =~ /^(.*)\Q$aclabel\E$/ ) { + push( @layerwits, $1 ); + } + } } + $location->{'layerwits'} = \@layerwits if @layerwits; } $answer->{'conflict_count'} = $conflict_count; @@ -240,7 +241,8 @@ sub run_analysis { Groups the variants at the given $rank of the collation, treating any relationships in @merge_relationship_types as equivalent. $lacunose should be a reference to an array, to which the sigla of lacunose witnesses at this -rank will be appended. +rank will be appended; $transposed should be a reference to a hash, wherein +the identities of transposed readings and their relatives will be stored. Returns a hash $group_readings where $rdg is attested by the witnesses listed in $group_readings->{$rdg}. @@ -249,61 +251,109 @@ in $group_readings->{$rdg}. # Return group_readings, groups, lacunose sub group_variants { - my( $tradition, $rank, $lacunose, $collapse ) = @_; + my( $tradition, $rank, $lacunose, $transposed, $collapse ) = @_; my $c = $tradition->collation; my $aclabel = $c->ac_label; - + my $table = $c->alignment_table; # Get the alignment table readings my %readings_at_rank; - my %is_lacunose; # lookup table for $lacunose - map { $is_lacunose{$_} = 1 } @$lacunose; - my @gap_wits; - foreach my $tablewit ( @{$c->alignment_table->{'alignment'}} ) { + my %is_lacunose; # lookup table for witnesses not in stemma + map { $is_lacunose{$_} = 1; $is_lacunose{$_.$aclabel} = 1 } @$lacunose; + my @check_for_gaps; + my %moved_wits; + my $has_transposition; + foreach my $tablewit ( @{$table->{'alignment'}} ) { my $rdg = $tablewit->{'tokens'}->[$rank-1]; my $wit = $tablewit->{'witness'}; # Exclude the witness if it is "lacunose" which if we got here # means "not in the stemma". next if $is_lacunose{$wit}; + # Note if the witness is actually in a lacuna if( $rdg && $rdg->{'t'}->is_lacuna ) { _add_to_witlist( $wit, $lacunose, $aclabel ); + # Otherwise the witness either has a positive reading... } elsif( $rdg ) { - $readings_at_rank{$rdg->{'t'}->text} = $rdg->{'t'}; + # If the reading has been counted elsewhere as a transposition, ignore it. + if( $transposed->{$rdg->{'t'}->id} ) { + # TODO Does this cope with three-way transpositions? + map { $moved_wits{$_} = 1 } @{$transposed->{$rdg->{'t'}->id}}; + next; + } + # Otherwise, record it... + $readings_at_rank{$rdg->{'t'}->id} = $rdg->{'t'}; + # ...and grab any transpositions, and their relations. + my @transp = grep { $_->rank != $rank } $rdg->{'t'}->related_readings(); + foreach my $trdg ( @transp ) { + next if exists $readings_at_rank{$trdg->id}; + $has_transposition = 1; + my @affected_wits = _table_witnesses( + $table, $trdg, \%is_lacunose, $aclabel ); + next unless @affected_wits; + map { $moved_wits{$_} = 1 } @affected_wits; + $transposed->{$trdg->id} = + [ _table_witnesses( $table, $rdg->{'t'}, \%is_lacunose, $aclabel ) ]; + $readings_at_rank{$trdg->id} = $trdg; + } + # ...or it is empty, ergo a gap. } else { - _add_to_witlist( $wit, \@gap_wits, $aclabel ); + _add_to_witlist( $wit, \@check_for_gaps, $aclabel ); } } - + my @gap_wits; + map { _add_to_witlist( $_, \@gap_wits, $aclabel ) + unless $moved_wits{$_} } @check_for_gaps; # Group the readings, collapsing groups by relationship if needed - my %grouped_readings; + my $grouped_readings = {}; foreach my $rdg ( values %readings_at_rank ) { # Skip readings that have been collapsed into others. - next if exists $grouped_readings{$rdg->id} && !$grouped_readings{$rdg->id}; + next if exists $grouped_readings->{$rdg->id} + && $grouped_readings->{$rdg->id} eq 'COLLAPSE'; # Get the witness list, including from readings collapsed into this one. - my @wits = $rdg->witnesses; - if( $collapse ) { + my @wits = _table_witnesses( $table, $rdg, \%is_lacunose, $aclabel ); + if( $collapse && @$collapse ) { my $filter = sub { my $r = $_[0]; grep { $_ eq $r->type } @$collapse; }; foreach my $other ( $rdg->related_readings( $filter ) ) { - my @otherwits = $other->witnesses; + my @otherwits = _table_witnesses( + $table, $other, \%is_lacunose, $aclabel ); push( @wits, @otherwits ); - $grouped_readings{$other->id} = 0; + $grouped_readings->{$other->id} = 'COLLAPSE'; } } - # Filter the group to those witnesses in the stemma - my @use_wits; - foreach my $wit ( @wits ) { - next if $is_lacunose{$wit}; - push( @use_wits, $wit ); - } - $grouped_readings{$rdg->id} = \@use_wits; + $grouped_readings->{$rdg->id} = \@wits; } - $grouped_readings{'(omitted)'} = \@gap_wits if @gap_wits; + $grouped_readings->{'(omitted)'} = \@gap_wits if @gap_wits; # Get rid of our collapsed readings - map { delete $grouped_readings{$_} unless $grouped_readings{$_} } - keys %grouped_readings + map { delete $grouped_readings->{$_} if $grouped_readings->{$_} eq 'COLLAPSE' } + keys %$grouped_readings if $collapse; + + # If something was transposed, check the groups for doubled-up readings + if( $has_transposition ) { + # print STDERR "Group for rank $rank:\n"; + # map { print STDERR "\t$_: " . join( ' ' , @{$grouped_readings->{$_}} ) . "\n" } + # keys %$grouped_readings; + _check_transposed_consistency( $c, $rank, $transposed, $grouped_readings ); + } # Return the result - return \%grouped_readings; + return $grouped_readings; +} + +# Helper function to query the alignment table for all witnesses (a.c. included) +# that have a given reading at its rank. +sub _table_witnesses { + my( $table, $trdg, $lacunose, $aclabel ) = @_; + my $tableidx = $trdg->rank - 1; + my @has_reading; + foreach my $row ( @{$table->{'alignment'}} ) { + my $wit = $row->{'witness'}; + next if $lacunose->{$wit}; + my $rdg = $row->{'tokens'}->[$tableidx]; + next unless exists $rdg->{'t'} && defined $rdg->{'t'}; + _add_to_witlist( $wit, \@has_reading, $aclabel ) + if $rdg->{'t'}->id eq $trdg->id; + } + return @has_reading; } # Helper function to ensure that X and X a.c. never appear in the same list. @@ -328,6 +378,60 @@ sub _add_to_witlist { } } +sub _check_transposed_consistency { + my( $c, $rank, $transposed, $groupings ) = @_; + my %seen_wits; + my %thisrank; + # Note which readings are actually at this rank, and which witnesses + # belong to which reading. + foreach my $rdg ( keys %$groupings ) { + my $rdgobj = $c->reading( $rdg ); + # Count '(omitted)' as a reading at this rank + $thisrank{$rdg} = 1 if !$rdgobj || $rdgobj->rank == $rank; + map { push( @{$seen_wits{$_}}, $rdg ) } @{$groupings->{$rdg}}; + } + # Our work is done if we have no witness belonging to more than one + # reading. + my @doubled = grep { scalar @{$seen_wits{$_}} > 1 } keys %seen_wits; + return unless @doubled; + # If we have a symmetric related transposition, drop the non-rank readings. + if( @doubled == scalar keys %seen_wits ) { + foreach my $rdg ( keys %$groupings ) { + if( !$thisrank{$rdg} ) { + my $groupstr = wit_stringify( $groupings->{$rdg} ); + my ( $matched ) = grep { $groupstr eq wit_stringify( $groupings->{$_} ) } + keys %thisrank; + delete $groupings->{$rdg}; + # If we found a group match, assume there is a symmetry happening. + # TODO think more about this + # print STDERR "*** Deleting symmetric reading $rdg\n"; + unless( $matched ) { + delete $transposed->{$rdg}; + warn "Found problem in evident symmetry with reading $rdg"; + } + } + } + # Otherwise 'unhook' the transposed reading(s) that have duplicates. + } else { + foreach my $dup ( @doubled ) { + foreach my $rdg ( @{$seen_wits{$dup}} ) { + next if $thisrank{$rdg}; + next unless exists $groupings->{$rdg}; + # print STDERR "*** Deleting asymmetric doubled-up reading $rdg\n"; + delete $groupings->{$rdg}; + delete $transposed->{$rdg}; + } + } + # and put any now-orphaned readings into an 'omitted' reading. + foreach my $wit ( keys %seen_wits ) { + unless( grep { exists $groupings->{$_} } @{$seen_wits{$wit}} ) { + $groupings->{'(omitted)'} = [] unless exists $groupings->{'(omitted)'}; + _add_to_witlist( $wit, $groupings->{'(omitted)'}, $c->ac_label ); + } + } + } +} + =head2 solve_variants( $graph, @groups ) Sends the set of groups to the external graph solver service and returns @@ -379,7 +483,12 @@ sub solve_variants { # Finally, add the group to the list to be calculated for this graph. map { s/\Q$aclabel\E$// } @acwits; - my $graph = $stemma->extend_graph( \@acwits ); + my $graph; + try { + $graph = $stemma->extend_graph( \@acwits ); + } catch { + die "Unable to extend graph with @acwits"; + } unless( exists $graph_problems->{"$graph"} ) { $graph_problems->{"$graph"} = { 'object' => $graph, 'groups' => [] }; } @@ -387,7 +496,6 @@ sub solve_variants { } ## For each distinct graph, send its groups to the solver. - $DB::single = 1; my $solver_url = 'http://byzantini.st/cgi-bin/graphcalc.cgi'; my $ua = LWP::UserAgent->new(); ## Witness map is a HACK to get around limitations in node names from IDP @@ -401,6 +509,7 @@ sub solve_variants { my $json = encode_json( _safe_wit_strings( $graph, $stemma->collation, $groupings, $witness_map ) ); # Send it off and get the result + #print STDERR "Sending request: $json\n"; my $resp = $ua->post( $solver_url, 'Content-Type' => 'application/json', 'Content' => $json ); my $answer; @@ -552,22 +661,51 @@ conflict, reading_parents, independent_occurrence, followed, not_followed, and f =cut sub analyze_location { - my ( $tradition, $graph, $variant_row ) = @_; + my ( $tradition, $stemma, $variant_row, $lacunose ) = @_; + my $c = $tradition->collation; # Make a hash of all known node memberships, and make the subgraphs. my $contig = {}; my $reading_roots = {}; my $subgraph = {}; + my $acstr = $c->ac_label; + my @acwits; + # Note which witnesses positively belong to which group foreach my $rdghash ( @{$variant_row->{'readings'}} ) { my $rid = $rdghash->{'readingid'}; - map { $contig->{$_} = $rid } @{$rdghash->{'group'}}; - + foreach my $wit ( @{$rdghash->{'group'}} ) { + $contig->{$wit} = $rid; + if( $wit =~ /^(.*)\Q$acstr\E$/ ) { + push( @acwits, $1 ); + } + } + } + + # Get the actual graph we should work with + my $graph; + try { + $graph = @acwits ? $stemma->extend_graph( \@acwits ) : $stemma->graph; + } catch { + die "Could not extend graph with a.c. witnesses @acwits"; + } + + # Now, armed with that knowledge, make a subgraph for each reading + # and note the root(s) of each subgraph. + foreach my $rdghash( @{$variant_row->{'readings'}} ) { + my $rid = $rdghash->{'readingid'}; + my %rdgwits; # Make the subgraph. my $part = $graph->copy; - my %these_vertices; - map { $these_vertices{$_} = 1 } @{$rdghash->{'group'}}; - $part->delete_vertices( grep { !$these_vertices{$_} } $part->vertices ); + my @todelete = grep { exists $contig->{$_} && $contig->{$_} ne $rid } + keys %$contig; + $part->delete_vertices( @todelete ); + _prune_subtree( $part, $lacunose ); $subgraph->{$rid} = $part; + # Record the remaining lacunose nodes as part of this group, if + # we are dealing with a non-genealogical reading. + unless( $variant_row->{'genealogical'} ) { + map { $contig->{$_} = $rid } $part->vertices; + } # Get the reading roots. map { $reading_roots->{$_} = $rid } $part->predecessorless_vertices; } @@ -576,18 +714,16 @@ sub analyze_location { # non-followed/unknown values for each reading. Also figure out the # reading's evident parent(s). foreach my $rdghash ( @{$variant_row->{'readings'}} ) { - # Group string key - TODO do we need this? - my $gst = wit_stringify( $rdghash->{'group'} ); my $rid = $rdghash->{'readingid'}; # Get the subgraph my $part = $subgraph->{$rid}; # Start figuring things out. - my @roots = $part->predecessorless_vertices; - $rdghash->{'independent_occurrence'} = scalar @roots; + my @roots = grep { $reading_roots->{$_} eq $rid } keys %$reading_roots; + $rdghash->{'independent_occurrence'} = \@roots; $rdghash->{'followed'} = scalar( $part->vertices ) - scalar( @roots ); # Find the parent readings, if any, of this reading. - my %rdgparents; + my $rdgparents = {}; foreach my $wit ( @roots ) { # Look in the main stemma to find this witness's extant or known-reading # immediate ancestor(s), and look up the reading that each ancestor olds. @@ -597,7 +733,7 @@ sub analyze_location { foreach my $wparent( @check ) { my $preading = $contig->{$wparent}; if( $preading ) { - $rdgparents{$preading} = 1; + $rdgparents->{$preading} = 1; } else { push( @next, $graph->predecessors( $wparent ) ); } @@ -605,7 +741,44 @@ sub analyze_location { @check = @next; } } - $rdghash->{'reading_parents'} = [ keys %rdgparents ]; + foreach my $p ( keys %$rdgparents ) { + # Resolve the relationship of the parent to the reading, and + # save it in our hash. + my $pobj = $c->reading( $p ); + my $prep = $pobj ? $pobj->id . ' (' . $pobj->text . ')' : $p; + my $phash = { 'label' => $prep }; + if( $pobj ) { + my $rel = $c->get_relationship( $p, $rdghash->{readingid} ); + if( $rel ) { + $phash->{relation} = { type => $rel->type }; + if( $rel->has_annotation ) { + $phash->{relation}->{'annotation'} = $rel->annotation; + } + } elsif( $rdghash->{readingid} eq '(omitted)' ) { + $phash->{relation} = { type => 'deletion' }; + } elsif( $rdghash->{text} ) { + # Check for sheer word similarity. + my $rtext = $rdghash->{text}; + my $ptext = $pobj->text; + my $min = length( $rtext ) > length( $ptext ) + ? length( $ptext ) : length( $rtext ); + my $distance = distance( $rtext, $ptext ); + if( $distance < $min ) { + $phash->{relation} = { type => 'wordsimilar' }; + } + } + # Get the attributes of the parent object while we are here + $phash->{'text'} = $pobj->text if $pobj; + $phash->{'is_nonsense'} = $pobj->is_nonsense; + $phash->{'is_ungrammatical'} = $pobj->grammar_invalid; + } elsif( $p eq '(omitted)' ) { + $phash->{relation} = { type => 'addition' }; + } + # Save it + $rdgparents->{$p} = $phash; + } + + $rdghash->{'reading_parents'} = $rdgparents; # Find the number of times this reading was altered, and the number of # times we're not sure. @@ -702,7 +875,7 @@ sub _solve_variant_location { # that implicitly later. foreach my $root ( @roots ) { # Prune the tree to get rid of extraneous hypotheticals. - $root = _prune_subtree( $part, $root, $contig ); + $root = _prune_subtree_old( $part, $root, $contig ); next unless $root; # Save this root for our group. push( @group_roots, $root ); @@ -836,23 +1009,46 @@ sub _solve_variant_location { sub _prune_group { my( $group, $stemma, $graph ) = @_; - # Get these into a form prune_subtree will recognize. Make a "contighash" - my $hypohash = {}; - map { $hypohash->{$_} = 1 } @$group; - # ...with reference values for hypotheticals. - map { $hypohash->{$_} = [] } $stemma->hypotheticals; + my $lacunose = {}; + map { $lacunose->{$_} = 1 } $stemma->hypotheticals; + map { $lacunose->{$_} = 0 } @$group; # Make our subgraph my $subgraph = $graph->copy; - map { $subgraph->delete_vertex( $_ ) unless exists $hypohash->{$_} } + map { $subgraph->delete_vertex( $_ ) unless exists $lacunose->{$_} } $subgraph->vertices; # ...and find the root. - my( $root ) = $subgraph->predecessorless_vertices; # Now prune and return the remaining vertices. - _prune_subtree( $subgraph, $root, $hypohash ); + _prune_subtree( $subgraph, $lacunose ); return $subgraph->vertices; } sub _prune_subtree { + my( $tree, $lacunose ) = @_; + + # Delete lacunose witnesses that have no successors + my @orphan_hypotheticals; + my $ctr = 0; + do { + die "Infinite loop on leaves" if $ctr > 100; + @orphan_hypotheticals = grep { $lacunose->{$_} } + $tree->successorless_vertices; + $tree->delete_vertices( @orphan_hypotheticals ); + $ctr++; + } while( @orphan_hypotheticals ); + + # Delete lacunose roots that have a single successor + my @redundant_root; + $ctr = 0; + do { + die "Infinite loop on roots" if $ctr > 100; + @redundant_root = grep { $lacunose->{$_} && $tree->successors( $_ ) == 1 } + $tree->predecessorless_vertices; + $tree->delete_vertices( @redundant_root ); + $ctr++; + } while( @redundant_root ); +} + +sub _prune_subtree_old { my( $tree, $root, $contighash ) = @_; # First, delete hypothetical leaves / orphans until there are none left. my @orphan_hypotheticals = grep { ref( $contighash->{$_} ) } diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index b246c7e..465ef1e 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -1,5 +1,6 @@ package Text::Tradition::Collation; +use feature 'say'; use Encode qw( decode_utf8 ); use File::Temp; use File::Which; @@ -274,8 +275,10 @@ See L for the available options. sub BUILD { my $self = shift; $self->_set_relations( Text::Tradition::Collation::RelationshipStore->new( 'collation' => $self ) ); - $self->_set_start( $self->add_reading( { 'collation' => $self, 'is_start' => 1 } ) ); - $self->_set_end( $self->add_reading( { 'collation' => $self, 'is_end' => 1 } ) ); + $self->_set_start( $self->add_reading( + { 'collation' => $self, 'is_start' => 1, 'init' => 1 } ) ); + $self->_set_end( $self->add_reading( + { 'collation' => $self, 'is_end' => 1, 'init' => 1 } ) ); } ### Reading construct/destruct functions @@ -284,6 +287,13 @@ sub add_reading { my( $self, $reading ) = @_; unless( ref( $reading ) eq 'Text::Tradition::Collation::Reading' ) { my %args = %$reading; + if( $args{'init'} ) { + # If we are initializing an empty collation, don't assume that we + # have set a tradition. + delete $args{'init'}; + } elsif( $self->tradition->has_language && !exists $args{'language'} ) { + $args{'language'} = $self->tradition->language; + } $reading = Text::Tradition::Collation::Reading->new( 'collation' => $self, %args ); @@ -366,11 +376,23 @@ is( $c->reading('n21p0')->text, 'unto', "Reading n21p0 merged correctly" ); sub merge_readings { my $self = shift; + # Sanity check + my( $kept_obj, $del_obj, $combine, $combine_char ) = $self->_objectify_args( @_ ); + my $mergemeta = $kept_obj->is_meta; + throw( "Cannot merge meta and non-meta reading" ) + unless ( $mergemeta && $del_obj->is_meta ) + || ( !$mergemeta && !$del_obj->is_meta ); + if( $mergemeta ) { + throw( "Cannot merge with start or end node" ) + if( $kept_obj eq $self->start || $kept_obj eq $self->end + || $del_obj eq $self->start || $del_obj eq $self->end ); + } # We only need the IDs for adding paths to the graph, not the reading # objects themselves. - my( $kept, $deleted, $combine, $combine_char ) = $self->_stringify_args( @_ ); + my $kept = $kept_obj->id; + my $deleted = $del_obj->id; $self->_graphcalc_done(0); - + # The kept reading should inherit the paths and the relationships # of the deleted reading. foreach my $path ( $self->sequence->edges_at( $deleted ) ) { @@ -384,22 +406,82 @@ sub merge_readings { @wits{keys %$fwits} = values %$fwits; $self->sequence->set_edge_attributes( @vector, \%wits ); } - $self->relations->merge_readings( $kept, $deleted, $combine_char ); + $self->relations->merge_readings( $kept, $deleted, $combine ); # Do the deletion deed. if( $combine ) { - my $kept_obj = $self->reading( $kept ); - my $del_obj = $self->reading( $deleted ); + # Combine the text of the readings my $joinstr = $combine_char; unless( defined $joinstr ) { $joinstr = '' if $kept_obj->join_next || $del_obj->join_prior; $joinstr = $self->wordsep unless defined $joinstr; } $kept_obj->alter_text( join( $joinstr, $kept_obj->text, $del_obj->text ) ); + # Change this reading to a joining one if necessary + $kept_obj->_set_join_next( $del_obj->join_next ); + $kept_obj->normal_form( + join( $joinstr, $kept_obj->normal_form, $del_obj->normal_form ) ); + # Combine the lexemes present in the readings + if( $kept_obj->has_lexemes && $del_obj->has_lexemes ) { + $kept_obj->add_lexeme( $del_obj->lexemes ); + } } $self->del_reading( $deleted ); } +=head2 compress_readings + +Where possible in the graph, compresses plain sequences of readings into a +single reading. The sequences must consist of readings with no +relationships to other readings, with only a single witness path between +them and no other witness paths from either that would skip the other. The +readings must also not be marked as nonsense or bad grammar. + +WARNING: This operation cannot be undone. + +=cut + +sub compress_readings { + my $self = shift; + # Anywhere in the graph that there is a reading that joins only to a single + # successor, and neither of these have any relationships, just join the two + # readings. + my %gobbled; + foreach my $rdg ( sort { $a->rank <=> $b->rank } $self->readings ) { + next if $rdg->is_meta; + next if $gobbled{$rdg->id}; + next if $rdg->grammar_invalid || $rdg->is_nonsense; + next if $rdg->related_readings(); + my %seen; + while( $self->sequence->successors( $rdg ) == 1 ) { + my( $next ) = $self->reading( $self->sequence->successors( $rdg ) ); + throw( "Infinite loop" ) if $seen{$next->id}; + $seen{$next->id} = 1; + last if $self->sequence->predecessors( $next ) > 1; + last if $next->is_meta; + last if $next->grammar_invalid || $next->is_nonsense; + last if $next->related_readings(); + say "Joining readings $rdg and $next"; + $self->merge_readings( $rdg, $next, 1 ); + } + } + # Make sure we haven't screwed anything up + foreach my $wit ( $self->tradition->witnesses ) { + my $pathtext = $self->path_text( $wit->sigil ); + my $origtext = join( ' ', @{$wit->text} ); + throw( "Text differs for witness " . $wit->sigil ) + unless $pathtext eq $origtext; + if( $wit->is_layered ) { + $pathtext = $self->path_text( $wit->sigil.$self->ac_label ); + $origtext = join( ' ', @{$wit->layertext} ); + throw( "Ante-corr text differs for witness " . $wit->sigil ) + unless $pathtext eq $origtext; + } + } + + $self->relations->rebuild_equivalence(); + $self->calculate_ranks(); +} # Helper function for manipulating the graph. sub _stringify_args { @@ -557,7 +639,8 @@ sub as_svg { throw( "Need GraphViz installed to output SVG" ) unless File::Which::which( 'dot' ); my $want_subgraph = exists $opts->{'from'} || exists $opts->{'to'}; - $self->calculate_ranks() unless( $self->_graphcalc_done || $opts->{'nocalc'} ); + $self->calculate_ranks() + unless( $self->_graphcalc_done || $opts->{'nocalc'} || !$self->linear ); if( !$self->has_cached_svg || $opts->{'recalc'} || $want_subgraph ) { my @cmd = qw/dot -Tsvg/; my( $svg, $err ); @@ -639,15 +722,15 @@ sub as_dot { # Output substitute start/end readings if necessary if( $startrank ) { - $dot .= "\t\"#SUBSTART#\" [ label=\"...\" ];\n"; + $dot .= "\t\"__SUBSTART__\" [ label=\"...\",id=\"__START__\" ];\n"; } if( $endrank ) { - $dot .= "\t\"#SUBEND#\" [ label=\"...\" ];\n"; + $dot .= "\t\"__SUBEND__\" [ label=\"...\",id=\"__END__\" ];\n"; } if( $STRAIGHTENHACK ) { ## HACK part 1 - my $startlabel = $startrank ? 'SUBSTART' : 'START'; - $dot .= "\tsubgraph { rank=same \"#$startlabel#\" \"#SILENT#\" }\n"; + my $startlabel = $startrank ? '__SUBSTART__' : '__START__'; + $dot .= "\tsubgraph { rank=same \"$startlabel\" \"#SILENT#\" }\n"; $dot .= "\t\"#SILENT#\" [ shape=diamond,color=white,penwidth=0,label=\"\" ];" } my %used; # Keep track of the readings that actually appear in the graph @@ -669,6 +752,7 @@ sub as_dot { $label = "-$label" if $reading->join_prior; $label =~ s/\"/\\\"/g; $rattrs->{'label'} = $label; + $rattrs->{'id'} = $reading->id; $rattrs->{'fillcolor'} = '#b3f36d' if $reading->is_common && $color_common; $dot .= sprintf( "\t\"%s\" %s;\n", $reading->id, _dot_attr_string( $rattrs ) ); } @@ -708,28 +792,33 @@ sub as_dot { $dot .= sprintf( "\t\"%s\" -> \"%s\" %s;\n", $edge->[0], $edge->[1], $varopts ); } elsif( $used{$edge->[0]} ) { - $subend{$edge->[0]} = 1; + $subend{$edge->[0]} = $edge->[1]; } elsif( $used{$edge->[1]} ) { - $substart{$edge->[1]} = 1; + $substart{$edge->[1]} = $edge->[0]; } } # Add substitute start and end edges if necessary foreach my $node ( keys %substart ) { - my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) ); + my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) ); my $variables = { %edge_attrs, 'label' => $witstr }; + my $nrdg = $self->reading( $node ); + if( $nrdg->has_rank && $nrdg->rank > $startrank ) { + # Substart is actually one lower than $startrank + $variables->{'minlen'} = $nrdg->rank - ( $startrank - 1 ); + } my $varopts = _dot_attr_string( $variables ); - $dot .= "\t\"#SUBSTART#\" -> \"$node\" $varopts;"; + $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n"; } foreach my $node ( keys %subend ) { - my $witstr = $self->_path_display_label ( $self->reading_witnesses( $self->reading( $node ) ) ); + my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) ); my $variables = { %edge_attrs, 'label' => $witstr }; my $varopts = _dot_attr_string( $variables ); - $dot .= "\t\"$node\" -> \"#SUBEND#\" $varopts;"; + $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n"; } # HACK part 2 if( $STRAIGHTENHACK ) { - my $endlabel = $endrank ? 'SUBEND' : 'END'; - $dot .= "\t\"#$endlabel#\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n"; + my $endlabel = $endrank ? '__SUBEND__' : '__END__'; + $dot .= "\t\"$endlabel\" -> \"#SILENT#\" [ color=white,penwidth=0 ];\n"; } $dot .= "}\n"; @@ -813,6 +902,7 @@ sub _path_display_label { # See if we are in a majority situation. my $maj = scalar( $self->tradition->witnesses ) * 0.6; + $maj = $maj > 5 ? $maj : 5; if( scalar keys %wits > $maj ) { unshift( @disp_ac, 'majority' ); return join( ', ', @disp_ac ); @@ -883,6 +973,13 @@ is( scalar $st->collation->readings, $READINGS, "Reparsed collation has all read is( scalar $st->collation->paths, $PATHS, "Reparsed collation has all paths" ); is( scalar $st->collation->relationships, 3, "Reparsed collation has new relationships" ); +# Now add a stemma, write to GraphML, and parse again. +my $stemma = $tradition->add_stemma( 'dotfile' => 't/data/florilegium.dot' ); +is( ref( $stemma ), 'Text::Tradition::Stemma', "Parsed dotfile into stemma" ); +is( $tradition->stemmata, 1, "Tradition now has the stemma" ); +$graphml = $c->as_graphml; +like( $graphml, qr/digraph/, "Digraph declaration exists in GraphML" ); + =end testing =cut @@ -926,6 +1023,7 @@ sub as_graphml { 'Str' => 'string', 'Int' => 'int', 'Bool' => 'boolean', + 'ReadingID' => 'string', 'RelationshipType' => 'string', 'RelationshipScope' => 'string', ); @@ -952,6 +1050,8 @@ sub as_graphml { next unless $save_types{$attr->type_constraint->name}; $graph_attributes{$attr->name} = $save_types{$attr->type_constraint->name}; } + # Extra custom key for the tradition stemma(ta) + $graph_attributes{'stemmata'} = 'string'; foreach my $datum ( sort keys %graph_attributes ) { $graph_data_keys{$datum} = 'dg'.$gdi++; @@ -971,6 +1071,9 @@ sub as_graphml { next unless $save_types{$attr->type_constraint->name}; $reading_attributes{$attr->name} = $save_types{$attr->type_constraint->name}; } + # Extra custom key for the reading morphology + $reading_attributes{'lexemes'} = 'string'; + my %node_data_keys; my $ndi = 0; foreach my $datum ( sort keys %reading_attributes ) { @@ -1022,11 +1125,16 @@ sub as_graphml { $sgraph->setAttribute( 'parse.nodes', 0 ); # fill in later $sgraph->setAttribute( 'parse.order', 'nodesfirst' ); - # Collation attribute data + # Tradition/collation attribute data foreach my $datum ( keys %graph_attributes ) { my $value; if( $datum eq 'version' ) { - $value = '3.1'; + $value = '3.2'; + } elsif( $datum eq 'stemmata' ) { + my @stemstrs; + map { push( @stemstrs, $_->editable( {linesep => ''} ) ) } + $self->tradition->stemmata; + $value = join( "\n", @stemstrs ); } elsif( $gattr_from{$datum} eq 'Tradition' ) { $value = $self->tradition->$datum; } else { @@ -1049,6 +1157,15 @@ sub as_graphml { $node_el->setAttribute( 'id', $node_xmlid ); foreach my $d ( keys %reading_attributes ) { my $nval = $n->$d; + # Custom serialization + if( $d eq 'lexemes' ) { + # If nval is a true value, we have lexemes so we need to + # serialize them. Otherwise set nval to undef so that the + # key is excluded from this reading. + $nval = $nval ? $n->_serialize_lexemes : undef; + } elsif( $d eq 'normal_form' && $n->normal_form eq $n->text ) { + $nval = undef; + } if( $rankoffset && $d eq 'rank' && $n ne $self->start ) { # Adjust the ranks within the subgraph. $nval = $n eq $self->end ? $end->rank - $rankoffset + 1 @@ -1173,7 +1290,7 @@ sub alignment_table { my $table = { 'alignment' => [], 'length' => $self->end->rank - 1 }; my @all_pos = ( 1 .. $self->end->rank - 1 ); foreach my $wit ( sort { $a->sigil cmp $b->sigil } $self->tradition->witnesses ) { - # print STDERR "Making witness row(s) for " . $wit->sigil . "\n"; + # say STDERR "Making witness row(s) for " . $wit->sigil; my @wit_path = $self->reading_sequence( $self->start, $self->end, $wit->sigil ); my @row = _make_witness_row( \@wit_path, \@all_pos ); push( @{$table->{'alignment'}}, @@ -1198,8 +1315,8 @@ sub _make_witness_row { foreach my $rdg ( @$path ) { my $rtext = $rdg->text; $rtext = '#LACUNA#' if $rdg->is_lacuna; - print STDERR "rank " . $rdg->rank . "\n" if $debug; - # print STDERR "No rank for " . $rdg->id . "\n" unless defined $rdg->rank; + say STDERR "rank " . $rdg->rank if $debug; + # say STDERR "No rank for " . $rdg->id unless defined $rdg->rank; $char_hash{$rdg->rank} = { 't' => $rdg }; } my @row = map { $char_hash{$_} } @$positions; @@ -1386,21 +1503,13 @@ sub path_text { $start = $self->start unless $start; $end = $self->end unless $end; my @path = grep { !$_->is_meta } $self->reading_sequence( $start, $end, $wit ); - return $self->_text_from_path( @path ); -} - -# Utility function so that we can cheat and use it when we need both the path -# and its text. -sub _text_from_path { - my( $self, @path ) = @_; my $pathtext = ''; my $last; foreach my $r ( @path ) { - if( $r->join_prior || !$last || $last->join_next ) { - $pathtext .= $r->text; - } else { - $pathtext .= ' ' . $r->text; - } + unless ( $r->join_prior || !$last || $last->join_next ) { + $pathtext .= ' '; + } + $pathtext .= $r->text; $last = $r; } return $pathtext; @@ -1431,7 +1540,7 @@ Call make_witness_path for all witnesses in the tradition. sub make_witness_paths { my( $self ) = @_; foreach my $wit ( $self->tradition->witnesses ) { - # print STDERR "Making path for " . $wit->sigil . "\n"; + # say STDERR "Making path for " . $wit->sigil; $self->make_witness_path( $wit ); } } @@ -1499,14 +1608,8 @@ sub calculate_ranks { # Do the rankings based on the relationship equivalence graph, starting # with the start node. - my $topo_start = $self->equivalence( $self->start->id ); - my $node_ranks = { $topo_start => 0 }; - my @curr_origin = ( $topo_start ); - # A little iterative function. - while( @curr_origin ) { - @curr_origin = _assign_rank( $self->equivalence_graph, - $node_ranks, @curr_origin ); - } + my ( $node_ranks, $rank_nodes ) = $self->relations->equivalence_ranks(); + # Transfer our rankings from the topological graph to the real one. foreach my $r ( $self->readings ) { if( defined $node_ranks->{$self->equivalence( $r->id )} ) { @@ -1536,41 +1639,6 @@ sub calculate_ranks { $self->_graphcalc_done(1); } -sub _assign_rank { - my( $graph, $node_ranks, @current_nodes ) = @_; - # Look at each of the children of @current_nodes. If all the child's - # parents have a rank, assign it the highest rank + 1 and add it to - # @next_nodes. Otherwise skip it; we will return when the highest-ranked - # parent gets a rank. - my @next_nodes; - foreach my $c ( @current_nodes ) { - warn "Current reading $c has no rank!" - unless exists $node_ranks->{$c}; - # print STDERR "Looking at child of node $c, rank " - # . $node_ranks->{$c} . "\n"; - foreach my $child ( $graph->successors( $c ) ) { - next if exists $node_ranks->{$child}; - my $highest_rank = -1; - my $skip = 0; - foreach my $parent ( $graph->predecessors( $child ) ) { - if( exists $node_ranks->{$parent} ) { - $highest_rank = $node_ranks->{$parent} - if $highest_rank <= $node_ranks->{$parent}; - } else { - $skip = 1; - last; - } - } - next if $skip; - my $c_rank = $highest_rank + 1; - # print STDERR "Assigning rank $c_rank to node $child \n"; - $node_ranks->{$child} = $c_rank; - push( @next_nodes, $child ); - } - } - return @next_nodes; -} - sub _clear_cache { my $self = shift; $self->wipe_svg if $self->has_cached_svg; @@ -1593,8 +1661,17 @@ sub flatten_ranks { next unless $rdg->has_rank; my $key = $rdg->rank . "||" . $rdg->text; if( exists $unique_rank_rdg{$key} ) { + # Make sure they don't have different grammatical forms + my $ur = $unique_rank_rdg{$key}; + if( $rdg->disambiguated && $ur->disambiguated ) { + my $rform = join( '//', map { $_->form->to_string } $rdg->lexemes ); + my $uform = join( '//', map { $_->form->to_string } $ur->lexemes ); + next unless $rform eq $uform; + } elsif( $rdg->disambiguated xor $ur->disambiguated ) { + next; + } # Combine! - # print STDERR "Combining readings at same rank: $key\n"; + #say STDERR "Combining readings at same rank: $key"; $changed = 1; $self->merge_readings( $unique_rank_rdg{$key}, $rdg ); # TODO see if this now makes a common point. @@ -1722,12 +1799,12 @@ my $c = $t->collation; is( $c->common_predecessor( 'n24', 'n23' )->id, 'n20', "Found correct common predecessor" ); is( $c->common_successor( 'n24', 'n23' )->id, - '#END#', "Found correct common successor" ); + '__END__', "Found correct common successor" ); is( $c->common_predecessor( 'n19', 'n17' )->id, 'n16', "Found correct common predecessor for readings on same path" ); is( $c->common_successor( 'n21', 'n10' )->id, - '#END#', "Found correct common successor for readings on same path" ); + '__END__', "Found correct common successor for readings on same path" ); =end testing @@ -1756,7 +1833,7 @@ sub _common_in_path { my @last_r2 = ( $r2 ); # my %all_seen = ( $r1 => 'r1', $r2 => 'r2' ); my %all_seen; - # print STDERR "Finding common $dir for $r1, $r2\n"; + # say STDERR "Finding common $dir for $r1, $r2"; while( !@candidates ) { last unless $iter--; # Avoid looping infinitely # Iterate separately down the graph from r1 and r2 @@ -1764,7 +1841,7 @@ sub _common_in_path { foreach my $lc ( @last_r1 ) { foreach my $p ( $lc->$dir ) { if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r1' ) { - # print STDERR "Path candidate $p from $lc\n"; + # say STDERR "Path candidate $p from $lc"; push( @candidates, $p ); } elsif( !$all_seen{$p->id} ) { $all_seen{$p->id} = 'r1'; @@ -1775,7 +1852,7 @@ sub _common_in_path { foreach my $lc ( @last_r2 ) { foreach my $p ( $lc->$dir ) { if( $all_seen{$p->id} && $all_seen{$p->id} ne 'r2' ) { - # print STDERR "Path candidate $p from $lc\n"; + # say STDERR "Path candidate $p from $lc"; push( @candidates, $p ); } elsif( !$all_seen{$p->id} ) { $all_seen{$p->id} = 'r2'; diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 11ec36e..f445420 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -1,8 +1,21 @@ package Text::Tradition::Collation::Reading; use Moose; +use Moose::Util::TypeConstraints; +use JSON qw/ from_json /; +use Module::Load; +use Text::Tradition::Error; +use XML::Easy::Syntax qw( $xml10_name_rx $xml10_namestartchar_rx ); +use YAML::XS; use overload '""' => \&_stringify, 'fallback' => 1; +subtype 'ReadingID', + as 'Str', + where { $_ =~ /\A$xml10_name_rx\z/ }, + message { 'Reading ID must be a valid XML attribute string' }; + +no Moose::Util::TypeConstraints; + =head1 NAME Text::Tradition::Collation::Reading - represents a reading (usually a word) @@ -75,7 +88,7 @@ has 'collation' => ( has 'id' => ( is => 'ro', - isa => 'Str', + isa => 'ReadingID', required => 1, ); @@ -89,7 +102,7 @@ has 'text' => ( has 'language' => ( is => 'ro', isa => 'Str', - default => 'Default', + predicate => 'has_language', ); has 'is_start' => ( @@ -131,18 +144,31 @@ has 'rank' => ( ## For morphological analysis +has 'grammar_invalid' => ( + is => 'rw', + isa => 'Bool', + default => undef, + ); + +has 'is_nonsense' => ( + is => 'rw', + isa => 'Bool', + default => undef, + ); + has 'normal_form' => ( is => 'rw', isa => 'Str', - predicate => 'has_normal_form', + predicate => '_has_normal_form', + clearer => '_clear_normal_form', ); -# Holds the word form. If is_disambiguated is true, the form at index zero -# is the correct one. +# Holds the lexemes for the reading. has 'reading_lexemes' => ( traits => ['Array'], isa => 'ArrayRef[Text::Tradition::Collation::Reading::Lexeme]', handles => { + lexeme => 'get', lexemes => 'elements', has_lexemes => 'count', clear_lexemes => 'clear', @@ -157,12 +183,14 @@ has 'join_prior' => ( is => 'ro', isa => 'Bool', default => undef, + writer => '_set_join_prior', ); has 'join_next' => ( is => 'ro', isa => 'Bool', default => undef, + writer => '_set_join_next', ); @@ -178,22 +206,54 @@ around BUILDARGS => sub { # If one of our special booleans is set, we change the text and the # ID to match. - if( exists $args->{'is_lacuna'} && !exists $args->{'text'} ) { + if( exists $args->{'is_lacuna'} && $args->{'is_lacuna'} && !exists $args->{'text'} ) { $args->{'text'} = '#LACUNA#'; - } elsif( exists $args->{'is_start'} ) { - $args->{'id'} = '#START#'; # Change the ID to ensure we have only one + } elsif( exists $args->{'is_start'} && $args->{'is_start'} ) { + $args->{'id'} = '__START__'; # Change the ID to ensure we have only one $args->{'text'} = '#START#'; $args->{'rank'} = 0; - } elsif( exists $args->{'is_end'} ) { - $args->{'id'} = '#END#'; # Change the ID to ensure we have only one + } elsif( exists $args->{'is_end'} && $args->{'is_end'} ) { + $args->{'id'} = '__END__'; # Change the ID to ensure we have only one $args->{'text'} = '#END#'; - } elsif( exists $args->{'is_ph'} ) { + } elsif( exists $args->{'is_ph'} && $args->{'is_ph'} ) { $args->{'text'} = $args->{'id'}; } + # Backwards compatibility for non-XMLname IDs + my $rid = $args->{'id'}; + $rid =~ s/\#/__/g; + $rid =~ s/[\/,]/./g; + if( $rid !~ /^$xml10_namestartchar_rx/ ) { + $rid = 'r'.$rid; + } + $args->{'id'} = $rid; + $class->$orig( $args ); }; +# Look for a lexeme-string argument in the build args. +sub BUILD { + my( $self, $args ) = @_; + if( exists $args->{'lexemes'} ) { + $self->_deserialize_lexemes( $args->{'lexemes'} ); + } +} + +# Make normal_form default to text, transparently. +around 'normal_form' => sub { + my $orig = shift; + my $self = shift; + my( $arg ) = @_; + if( $arg && $arg eq $self->text ) { + $self->_clear_normal_form; + return $arg; + } elsif( !$arg && !$self->_has_normal_form ) { + return $self->text; + } else { + $self->$orig( @_ ); + } +}; + =head2 is_meta A meta attribute (ha ha), which should be true if any of our 'special' @@ -275,143 +335,99 @@ sub _stringify { =head1 MORPHOLOGY -A few methods to try to tack on morphological information. +Methods for the morphological information (if any) attached to readings. +A reading may be made up of multiple lexemes; the concatenated lexeme +strings ought to match the reading's normalized form. + +See L for more information +on Lexeme objects and their attributes. -=head2 use_lexemes +=head2 has_lexemes -TBD +Returns a true value if the reading has any attached lexemes. -=cut +=head2 lexemes -# sub use_lexemes { -# my( $self, @lexemes ) = @_; -# # The lexemes need to be the same as $self->text. -# my $cmpstr = $self->has_normal_form ? lc( $self->normal_form ) : lc( $self->text ); -# $cmpstr =~ s/[\s-]+//g; -# my $lexstr = lc( join( '', @lexemes ) ); -# $lexstr =~ s/[\s-]+//g; -# unless( $lexstr eq $cmpstr ) { -# warn "Cannot split " . $self->text . " into " . join( '.', @lexemes ); -# return; -# } -# $self->_clear_morph; -# map { $self->_add_morph( { $_ => [] } ) } @lexemes; -# } -# -# sub add_morphological_tag { -# my( $self, $lexeme, $opts ) = @_; -# my $struct; -# unless( $opts ) { -# # No lexeme was passed; use reading text. -# $opts = $lexeme; -# $lexeme = $self->text; -# $self->use_lexemes( $lexeme ); -# } -# # Get the correct container -# ( $struct ) = grep { exists $_->{$lexeme} } $self->lexemes; -# unless( $struct ) { -# warn "No lexeme $lexeme exists in this reading"; -# return; -# } -# # Now make the morph object and add it to this lexeme. -# my $morph_obj = Text::Tradition::Collation::Reading::Morphology->new( $opts ); -# # TODO Check for existence -# push( @{$struct->{$lexeme}}, $morph_obj ); -# } +Returns the Lexeme objects (if any) attached to the reading. -## Utility methods +=head2 clear_lexemes -sub TO_JSON { - my $self = shift; - return $self->text; -} +Wipes any associated Lexeme objects out of the reading. -## TODO will need a throw() here +=head2 add_lexeme( $lexobj ) -no Moose; -__PACKAGE__->meta->make_immutable; +Adds the Lexeme in $lexobj to the list of lexemes. -################################################### -### Morphology objects, to be attached to readings -################################################### +=head2 lemmatize -package Text::Tradition::Collation::Reading::Morphology; +If the language of the reading is set, this method will use the appropriate +Language model to determine the lexemes that belong to this reading. See +L if you wish to lemmatize an entire tradition. -use Moose; +=cut -has 'lemma' => ( - is => 'ro', - isa => 'Str', - required => 1, - ); - -has 'code' => ( - is => 'ro', - isa => 'Str', - required => 1, - ); +sub lemmatize { + my $self = shift; + unless( $self->has_language ) { + warn "Please set a language to lemmatize a tradition"; + return; + } + my $mod = "Text::Tradition::Language::" . $self->language; + load( $mod ); + $mod->can( 'reading_lookup' )->( $self ); + +} + +# For graph serialization. Return a JSON representation of the associated +# reading lexemes. +sub _serialize_lexemes { + my $self = shift; + my $json = JSON->new->allow_blessed(1)->convert_blessed(1); + return $json->encode( [ $self->lexemes ] ); +} + +# Given a JSON representation of the lexemes, instantiate them and add +# them to the reading. +sub _deserialize_lexemes { + my( $self, $json ) = @_; + my $data = from_json( $json ); + return unless @$data; -has 'language' => ( - is => 'ro', - isa => 'Str', - required => 1, - ); + # Need to have the lexeme module in order to have lexemes. + eval { use Text::Tradition::Collation::Reading::Lexeme; }; + throw( $@ ) if $@; -## Transmute codes into comparison arrays for our various languages. - -around BUILDARGS => sub { - my $orig = shift; - my $class = shift; - my $args; - if( @_ == 1 && ref( $_[0] ) ) { - $args = shift; - } else { - $args = { @_ }; + # Good to go - add the lexemes. + my @lexemes; + foreach my $lexhash ( @$data ) { + push( @lexemes, Text::Tradition::Collation::Reading::Lexeme->new( + 'JSON' => $lexhash ) ); } - if( exists( $args->{'serial'} ) ) { - my( $lemma, $code ) = split( /!!/, delete $args->{'serial'} ); - $args->{'lemma'} = $lemma; - $args->{'code'} = $code; - } - $class->$orig( $args ); -}; + $self->clear_lexemes; + $self->add_lexeme( @lexemes ); +} -sub serialization { +sub disambiguated { my $self = shift; - return join( '!!', $self->lemma, $self->code ); -}; + return 0 unless $self->has_lexemes; + return !grep { !$_->is_disambiguated } $self->lexemes; +} + +## Utility methods -sub comparison_array { +sub TO_JSON { my $self = shift; - if( $self->language eq 'French' ) { - my @array; - my @bits = split( /\+/, $self->code ); - # First push the non k/v parts. - while( @bits && $bits[0] !~ /=/ ) { - push( @array, shift @bits ); - } - while( @array < 2 ) { - push( @array, undef ); - } - # Now push the k/v parts in a known order. - my @fields = qw/ Pers Nb Temps Genre Spec Fonc /; - my %props; - map { my( $k, $v ) = split( /=/, $_ ); $props{$k} = $v; } @bits; - foreach my $k ( @fields ) { - push( @array, $props{$k} ); - } - # Give the answer. - return @array; - } elsif( $self->language eq 'English' ) { - # Do something as yet undetermined - } else { - # Latin or Greek or Armenian, just split the chars - return split( '', $self->code ); - } -}; + return $self->text; +} + +sub throw { + Text::Tradition::Error->throw( + 'ident' => 'Reading error', + 'message' => $_[0], + ); +} no Moose; __PACKAGE__->meta->make_immutable; 1; - diff --git a/lib/Text/Tradition/Collation/Reading/Lexeme.pm b/lib/Text/Tradition/Collation/Reading/Lexeme.pm index ca4802c..fa564ee 100644 --- a/lib/Text/Tradition/Collation/Reading/Lexeme.pm +++ b/lib/Text/Tradition/Collation/Reading/Lexeme.pm @@ -1,7 +1,10 @@ package Text::Tradition::Collation::Reading::Lexeme; use Moose; +use JSON (); use Module::Load; +use Text::Tradition::Collation::Reading::WordForm; +use Text::Tradition::Error; =head1 NAME @@ -76,7 +79,9 @@ has 'wordform_matchlist' => ( 'matching_forms' => 'elements', 'matching_form' => 'get', 'add_matching_form' => 'push', + 'clear_matching_forms' => 'clear', }, + default => sub { [] }, ); has 'is_disambiguated' => ( @@ -91,6 +96,30 @@ has 'form' => ( writer => '_set_form', ); +around BUILDARGS => sub { + my $orig = shift; + my $class = shift; + my $args = @_ == 1 ? $_[0] : { @_ }; + if( exists $args->{JSON} ) { + my $data = $args->{JSON}; + if( exists $data->{'form'} && $data->{'form'} ) { + my $form = Text::Tradition::Collation::Reading::WordForm->new( + 'JSON' => $data->{'form'} ); + $data->{'form'} = $form; + } + if( exists $data->{'wordform_matchlist'} && $data->{'wordform_matchlist'} ) { + my @ml; + foreach my $wfjson ( @{$data->{'wordform_matchlist'}} ) { + push( @ml, Text::Tradition::Collation::Reading::WordForm->new( + 'JSON' => $wfjson ) ); + } + $data->{'wordform_matchlist'} = \@ml; + } + $args = $data; + } + $class->$orig( $args ); +}; + # Do auto-disambiguation if we were created with a single wordform sub BUILD { my $self = shift; @@ -100,6 +129,23 @@ sub BUILD { } } +around 'add_matching_form' => sub { + my $orig = shift; + my $self = shift; + my @realargs; + foreach my $a ( @_ ) { + if( ref( $a ) ) { + push( @realargs, $a ); + } else { + # Make the wordform from the string + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'JSON' => $a ); + push( @realargs, $wf ); + } + } + return $self->$orig( @realargs ); +}; + =head2 disambiguate( $index ) Selects the word form at $index in the list of matching forms, and asserts @@ -116,28 +162,40 @@ sub disambiguate { $self->is_disambiguated( 1 ); } -=head2 lookup +=head2 has_form( $rep ) -Uses the module for the declared language to look up the lexeme in the -language database (if any.) Sets the returned morphological matches in -matching_forms, and returns the list as an array of WordForm objects. +Returns the index of the matching form whose string representation is in $rep, +or else undef if none is found. =cut -sub lookup { - my $self = shift; - my $lang = $self->language; - my @answers; - try { - my $langmod = "Text::Tradition::Language::$lang"; - load( $langmod ); - @answers = $langmod->can( 'word_lookup' )->( $self->string ); - } catch { - throw( "No language module for $lang, or the module has no word_lookup functionality" ); +sub has_form { + my( $self, $rep ) = @_; + my $i = 0; + foreach my $mf ( $self->matching_forms ) { + my $struct = $mf->TO_JSON; + return $i if $struct eq $rep; + $i++; } - $self->clear_matching_forms; - $self->add_matching_form( @answers ); - return @answers; + return undef; +} + + +sub TO_JSON { + my $self = shift; + my $hash = {}; + # Do the scalar keys + map { $hash->{$_} = $self->$_ if defined $self->$_ } + qw/ language string is_disambiguated form /; + $hash->{'wordform_matchlist'} = [ $self->matching_forms ] if $self->matches; + return $hash; +} + +sub throw { + Text::Tradition::Error->throw( + 'ident' => 'Lexeme error', + 'message' => $_[0], + ); } no Moose; diff --git a/lib/Text/Tradition/Collation/Reading/WordForm.pm b/lib/Text/Tradition/Collation/Reading/WordForm.pm index 8f519f9..968f205 100644 --- a/lib/Text/Tradition/Collation/Reading/WordForm.pm +++ b/lib/Text/Tradition/Collation/Reading/WordForm.pm @@ -1,6 +1,10 @@ package Text::Tradition::Collation::Reading::WordForm; +use Lingua::Features::Structure; +use JSON (); use Moose; +use Text::Tradition::Error; +use TryCatch; =head1 NAME @@ -42,40 +46,79 @@ has 'language' => ( required => 1, ); -# TODO do we need this? -has 'form' => ( - is => 'ro', - isa => 'Str', - # required => 1, - ); - has 'lemma' => ( is => 'ro', isa => 'Str', required => 1, ); -has 'morphology' => ( +has 'morphstr' => ( is => 'ro', - isa => 'ArrayRef', + isa => 'Str', required => 1, ); around BUILDARGS => sub { my $orig = shift; my $class = shift; - my %args = @_ == 1 ? %{$_[0]} : @_; - unless( ref( $args{'morphology'} ) ) { - my @morph = split( '', $args{'morphology'} ); - $args{'morphology'} = \@morph; + my $args = @_ == 1 ? $_[0] : { @_ }; + if( exists $args->{'JSON'} ) { + my @data = split( / \/\/ /, $args->{'JSON'} ); + # print STDERR "Attempting to parse " . $data[2] . " into structure"; + $args = { 'language' => $data[0], 'lemma' => $data[1], + 'morphstr' => $data[2] }; + } elsif( exists $args->{'morphology'} ) { + # Backwards compat + my $mobj = delete $args->{'morphology'}; + $args->{'morphstr'} = $mobj->to_string() + if ref $mobj; } - $class->$orig( %args ); + $class->$orig( $args ); }; -sub _stringify { +=head2 morphology + +Returns a Lingua::Features::Structure object that corresponds to morphstr. + +=cut + +sub morphology { my $self = shift; - return sprintf( "%s//%s//%s", $self->language, $self->lemma, - join( '|', @{$self->morphology} ) ); + return unless $self->morphstr; + my $struct; + try { + $struct = Lingua::Features::Structure->from_string( $self->morphstr ); + } catch { + throw( "Morphology string " . $self->morphstr . " does not parse" ); + } + return $struct; +} + +=head2 to_string + +Returns a string combination of language/lemma/morphology that can be used +in equivalence testing. + +=cut + +sub to_string { + my $self = shift; + return JSON->new->convert_blessed(1)->encode( $self ); +} + +# Rather than spitting it out as a JSON hash, encode it as a string so that +# the XML serialization doesn't become insane. +sub TO_JSON { + my $self = shift; + return sprintf( "%s // %s // %s", $self->language, $self->lemma, + $self->morphstr ); +} + +sub throw { + Text::Tradition::Error->throw( + 'ident' => 'Wordform error', + 'message' => $_[0], + ); } no Moose; diff --git a/lib/Text/Tradition/Collation/Relationship.pm b/lib/Text/Tradition/Collation/Relationship.pm index 422c0c1..227bb7b 100644 --- a/lib/Text/Tradition/Collation/Relationship.pm +++ b/lib/Text/Tradition/Collation/Relationship.pm @@ -4,7 +4,7 @@ use Moose; use Moose::Util::TypeConstraints; enum 'RelationshipType' => qw( spelling orthographic grammatical lexical - collated repetition transposition ); + collated repetition transposition punctuation ); enum 'RelationshipScope' => qw( local document global ); @@ -48,6 +48,10 @@ or across all traditions. =item * annotation - (Optional) A freeform note to attach to the relationship. +=item * alters_meaning - Indicate whether, in context, the related words cause +the text to have different meanings. Possible values are 0 (no), 1 (slightly), +and >1 (yes). + =item * non_correctable - (Optional) True if the reading would not have been corrected independently. @@ -109,6 +113,12 @@ has 'annotation' => ( isa => 'Str', predicate => 'has_annotation', ); + +has 'alters_meaning' => ( + is => 'rw', + isa => 'Int', + default => 0, + ); has 'non_correctable' => ( is => 'ro', @@ -120,6 +130,21 @@ has 'non_independent' => ( isa => 'Bool', ); +around 'alters_meaning' => sub { + my $orig = shift; + my $self = shift; + if( @_ ) { + if( $_[0] eq 'no' ) { + return $self->$orig( 0 ); + } elsif( $_[0] eq 'slightly' ) { + return $self->$orig( 1 ); + } elsif( $_[0] eq 'yes' ) { + return $self->$orig( 2 ); + } + } + return $self->$orig( @_ ); +}; + # A read-only meta-Boolean attribute. =head2 colocated diff --git a/lib/Text/Tradition/Collation/RelationshipStore.pm b/lib/Text/Tradition/Collation/RelationshipStore.pm index c7728cd..a5a9529 100644 --- a/lib/Text/Tradition/Collation/RelationshipStore.pm +++ b/lib/Text/Tradition/Collation/RelationshipStore.pm @@ -31,10 +31,10 @@ use_ok( 'Text::Tradition::Collation::RelationshipStore' ); my $cxfile = 't/data/Collatex-16.xml'; my $t = Text::Tradition->new( - 'name' => 'inline', - 'input' => 'CollateX', - 'file' => $cxfile, - ); + 'name' => 'inline', + 'input' => 'CollateX', + 'file' => $cxfile, + ); my $c = $t->collation; my @v1 = $c->add_relationship( 'n21', 'n22', { 'type' => 'lexical' } ); @@ -97,6 +97,7 @@ has 'equivalence_graph' => ( is => 'ro', isa => 'Graph', default => sub { Graph->new() }, + writer => '_reset_equivalence', ); has '_node_equivalences' => ( @@ -106,6 +107,7 @@ has '_node_equivalences' => ( equivalence => 'get', set_equivalence => 'set', remove_equivalence => 'delete', + _clear_equivalence => 'clear', }, ); @@ -116,6 +118,7 @@ has '_equivalence_readings' => ( eqreadings => 'get', set_eqreadings => 'set', remove_eqreadings => 'delete', + _clear_eqreadings => 'clear', }, ); @@ -193,18 +196,9 @@ sub create { } } - # Check to see if a nonlocal relationship is defined for the two readings - $rel = $self->scoped_relationship( $options->{'reading_a'}, - $options->{'reading_b'} ); - if( $rel && $rel->type eq $options->{'type'} ) { - return $rel; - } elsif( $rel ) { - throw( sprintf( "Relationship of type %s with scope %s already defined for readings %s and %s", $rel->type, $rel->scope, $options->{'reading_a'}, $options->{'reading_b'} ) ); - } else { - $rel = Text::Tradition::Collation::Relationship->new( $options ); - $self->add_scoped_relationship( $rel ) if $rel->nonlocal; - return $rel; - } + $rel = Text::Tradition::Collation::Relationship->new( $options ); + $self->add_scoped_relationship( $rel ) if $rel->nonlocal; + return $rel; } =head2 add_scoped_relationship( $rel ) @@ -257,21 +251,27 @@ add_relationship. =begin testing +use Test::Warn; use Text::Tradition; use TryCatch; -my $t1 = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' ); +my $t1; +warning_is { + $t1 = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' ); +} 'DROPPING r14.2 -> r8.1: Cannot set relationship on a meta reading', + "Got expected relationship drop warning on parse"; + # Test 1.1: try to equate nodes that are prevented with an intermediate collation ok( $t1, "Parsed test fragment file" ); my $c1 = $t1->collation; -my $trel = $c1->get_relationship( '9,2', '9,3' ); +my $trel = $c1->get_relationship( 'r9.2', 'r9.3' ); is( ref( $trel ), 'Text::Tradition::Collation::Relationship', "Troublesome relationship exists" ); is( $trel->type, 'collated', "Troublesome relationship is a collation" ); # Try to make the link we want try { - $c1->add_relationship( '8,6', '10,3', { 'type' => 'orthographic' } ); + $c1->add_relationship( 'r8.6', 'r10.3', { 'type' => 'orthographic' } ); ok( 1, "Added cross-collation relationship as expected" ); } catch( Text::Tradition::Error $e ) { ok( 0, "Existing collation blocked equivalence relationship: " . $e->message ); @@ -286,7 +286,7 @@ try { # Test 1.2: attempt merge of an identical reading try { - $c1->merge_readings( '9,3', '11,5' ); + $c1->merge_readings( 'r9.3', 'r11.5' ); ok( 1, "Successfully merged reading 'pontifex'" ); } catch ( Text::Tradition::Error $e ) { ok( 0, "Merge of mergeable readings failed: $e->message" ); @@ -295,25 +295,39 @@ try { # Test 1.3: attempt relationship with a meta reading (should fail) try { - $c1->add_relationship( '8,1', '9,2', { 'type' => 'collated' } ); + $c1->add_relationship( 'r8.1', 'r9.2', { 'type' => 'collated' } ); ok( 0, "Allowed a meta-reading to be used in a relationship" ); } catch ( Text::Tradition::Error $e ) { is( $e->message, 'Cannot set relationship on a meta reading', "Relationship link prevented for a meta reading" ); } +# Test 1.4: try to break a relationship near a meta reading +$c1->add_relationship( 'r7.6', 'r7.3', { type => 'orthographic' } ); +try { + $c1->del_relationship( 'r7.6', 'r7.7' ); + $c1->del_relationship( 'r7.6', 'r7.3' ); + ok( 1, "Relationship broken with a meta reading as neighbor" ); +} catch { + ok( 0, "Relationship deletion failed with a meta reading as neighbor" ); +} + # Test 2.1: try to equate nodes that are prevented with a real intermediate # equivalence -my $t2 = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' ); +my $t2; +warning_is { + $t2 = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/legendfrag.xml' ); +} 'DROPPING r14.2 -> r8.1: Cannot set relationship on a meta reading', + "Got expected relationship drop warning on parse"; my $c2 = $t2->collation; -$c2->add_relationship( '9,2', '9,3', { 'type' => 'lexical' } ); -my $trel2 = $c2->get_relationship( '9,2', '9,3' ); +$c2->add_relationship( 'r9.2', 'r9.3', { 'type' => 'lexical' } ); +my $trel2 = $c2->get_relationship( 'r9.2', 'r9.3' ); is( ref( $trel2 ), 'Text::Tradition::Collation::Relationship', "Created blocking relationship" ); is( $trel2->type, 'lexical', "Blocking relationship is not a collation" ); # This time the link ought to fail try { - $c2->add_relationship( '8,6', '10,3', { 'type' => 'orthographic' } ); + $c2->add_relationship( 'r8.6', 'r10.3', { 'type' => 'orthographic' } ); ok( 0, "Added cross-equivalent bad relationship" ); } catch ( Text::Tradition::Error $e ) { like( $e->message, qr/witness loop/, @@ -332,13 +346,13 @@ my $t3 = Text::Tradition->new( 'input' => 'Self', 'file' => 't/data/lf2.xml' ); # Test 1: try to equate nodes that are prevented with an intermediate collation my $c3 = $t3->collation; try { - $c3->add_relationship( '36,4', '38,3', { 'type' => 'transposition' } ); + $c3->add_relationship( 'r36.4', 'r38.3', { 'type' => 'transposition' } ); ok( 1, "Added straightforward transposition" ); } catch ( Text::Tradition::Error $e ) { ok( 0, "Failed to add normal transposition: " . $e->message ); } try { - $c3->add_relationship( '36,3', '38,2', { 'type' => 'transposition' } ); + $c3->add_relationship( 'r36.3', 'r38.2', { 'type' => 'transposition' } ); ok( 1, "Added straightforward transposition complement" ); } catch ( Text::Tradition::Error $e ) { ok( 0, "Failed to add normal transposition complement: " . $e->message ); @@ -346,7 +360,7 @@ try { # Test 3.2: try to make a transposition that could be a parallel. try { - $c3->add_relationship( '28,2', '29,2', { 'type' => 'transposition' } ); + $c3->add_relationship( 'r28.2', 'r29.2', { 'type' => 'transposition' } ); ok( 0, "Added bad colocated transposition" ); } catch ( Text::Tradition::Error $e ) { like( $e->message, qr/Readings appear to be colocated/, @@ -355,13 +369,13 @@ try { # Test 3.3: make the parallel, and then make the transposition again. try { - $c3->add_relationship( '28,3', '29,3', { 'type' => 'orthographic' } ); + $c3->add_relationship( 'r28.3', 'r29.3', { 'type' => 'orthographic' } ); ok( 1, "Equated identical readings for transposition" ); } catch ( Text::Tradition::Error $e ) { ok( 0, "Failed to equate identical readings: " . $e->message ); } try { - $c3->add_relationship( '28,2', '29,2', { 'type' => 'transposition' } ); + $c3->add_relationship( 'r28.2', 'r29.2', { 'type' => 'transposition' } ); ok( 1, "Added straightforward transposition complement" ); } catch ( Text::Tradition::Error $e ) { ok( 0, "Failed to add normal transposition complement: " . $e->message ); @@ -412,8 +426,10 @@ sub add_relationship { my $otherrel = $self->scoped_relationship( $rdga, $rdgb ); if( $otherrel && $otherrel->type eq $options->{type} && $otherrel->scope eq $options->{scope} ) { - warn "Applying existing scoped relationship"; + warn "Applying existing scoped relationship for $rdga / $rdgb"; $relationship = $otherrel; + } elsif( $otherrel ) { + throw( "Conflicting scoped relationship for $rdga / $rdgb at $source / $target" ); } } $relationship = $self->create( $options ) unless $relationship; # Will throw on error @@ -561,7 +577,9 @@ sub relationship_valid { my( $self, $source, $target, $rel, $mustdrop ) = @_; $mustdrop = [] unless $mustdrop; # in case we were passed nothing my $c = $self->collation; - if ( $rel eq 'transposition' || $rel eq 'repetition' ) { + ## Assume validity is okay if we are initializing from scratch. + return ( 1, "initializing" ) unless $c->tradition->_initialized; + if ( $rel eq 'transposition' || $rel eq 'repetition' ) { # Check that the two readings do (for a repetition) or do not (for # a transposition) appear in the same witness. # TODO this might be called before witness paths are set... @@ -692,6 +710,9 @@ sub related_readings { # Backwards compat if( $filter eq 'colocated' ) { $filter = sub { $_[0]->colocated }; + } elsif( !ref( $filter ) ) { + my $type = $filter; + $filter = sub { $_[0]->type eq $type }; } my %found = ( $reading => 1 ); my $check = [ $reading ]; @@ -757,14 +778,14 @@ sub _remove_equivalence_node { my $group = $self->equivalence( $node ); my $nodelist = $self->eqreadings( $group ); if( @$nodelist == 1 && $nodelist->[0] eq $node ) { - print STDERR "Removing equivalence $group for $node\n" if $node eq '451,2'; + $self->equivalence_graph->delete_vertex( $group ); $self->remove_eqreadings( $group ); + $self->remove_equivalence( $group ); } elsif( @$nodelist == 1 ) { - warn "DATA INCONSISTENCY in equivalence graph: " . $nodelist->[0] . - " in group that should have only $node"; + throw( "DATA INCONSISTENCY in equivalence graph: " . $nodelist->[0] . + " in group that should have only $node" ); } else { - print STDERR "Removing $node from equivalence $group\n" if $node eq '451,2'; - my @newlist = grep { $_ ne $node } @$nodelist; + my @newlist = grep { $_ ne $node } @$nodelist; $self->set_eqreadings( $group, \@newlist ); $self->remove_equivalence( $node ); } @@ -781,8 +802,6 @@ sub add_equivalence_edge { my( $self, $source, $target ) = @_; my $seq = $self->equivalence( $source ); my $teq = $self->equivalence( $target ); - print STDERR "Adding equivalence edge $seq -> $teq for $source -> $target\n" - if grep { $_ eq '451,2' } @_; $self->equivalence_graph->add_edge( $seq, $teq ); } @@ -797,8 +816,6 @@ sub delete_equivalence_edge { my( $self, $source, $target ) = @_; my $seq = $self->equivalence( $source ); my $teq = $self->equivalence( $target ); - print STDERR "Deleting equivalence edge $seq -> $teq for $source -> $target\n" - if grep { $_ eq '451,2' } @_; $self->equivalence_graph->delete_edge( $seq, $teq ); } @@ -816,12 +833,8 @@ sub _make_equivalence { my $teq = $self->equivalence( $target ); # Nothing to do if they are already equivalent... return if $seq eq $teq; - print STDERR "Making equivalence for $source -> $target\n" - if grep { $_ eq '451,2' } @_; my $sourcepool = $self->eqreadings( $seq ); # and add them to the target readings. - print STDERR "Moving readings '@$sourcepool' from group $seq to $teq\n" - if grep { $_ eq '451,2' } @_; push( @{$self->eqreadings( $teq )}, @$sourcepool ); map { $self->set_equivalence( $_, $teq ) } @$sourcepool; # Then merge the nodes in the equivalence graph. @@ -833,8 +846,8 @@ sub _make_equivalence { } $self->equivalence_graph->delete_vertex( $seq ); # TODO enable this after collation parsing is done -# throw( "Graph got disconnected making $source / $target equivalence" ) -# if $self->_is_disconnected; + throw( "Graph got disconnected making $source / $target equivalence" ) + if $self->_is_disconnected && $self->collation->tradition->_initialized; } =head2 test_equivalence @@ -909,14 +922,8 @@ sub _break_equivalence { map { $tng{$_} = 1 } $self->_find_equiv_without( $target, $source ); # If these groups intersect, they are still connected; do nothing. foreach my $el ( keys %tng ) { - if( exists $sng{$el} ) { - print STDERR "Equivalence break $source / $target is a noop\n" - if grep { $_ eq '451,2' } @_; - return; - } + return if( exists $sng{$el} ); } - print STDERR "Breaking equivalence $source / $target\n" - if grep { $_ eq '451,2' } @_; # If they don't intersect, then we split the nodes in the graph and in # the hashes. First figure out which group has which name my $oldgroup = $self->equivalence( $source ); # same as $target @@ -949,9 +956,11 @@ sub _break_equivalence { my $c = $self->collation; foreach my $rdg ( @$newmembers ) { foreach my $rp ( $c->sequence->predecessors( $rdg ) ) { + next unless $self->equivalence( $rp ); $self->equivalence_graph->add_edge( $self->equivalence( $rp ), $newgroup ); } foreach my $rs ( $c->sequence->successors( $rdg ) ) { + next unless $self->equivalence( $rs ); $self->equivalence_graph->add_edge( $newgroup, $self->equivalence( $rs ) ); } } @@ -960,9 +969,11 @@ sub _break_equivalence { my( %old_pred, %old_succ ); foreach my $rdg ( @$oldmembers ) { foreach my $rp ( $c->sequence->predecessors( $rdg ) ) { + next unless $self->equivalence( $rp ); $old_pred{$self->equivalence( $rp )} = 1; } foreach my $rs ( $c->sequence->successors( $rdg ) ) { + next unless $self->equivalence( $rs ); $old_succ{$self->equivalence( $rs )} = 1; } } @@ -977,8 +988,8 @@ sub _break_equivalence { } } # TODO enable this after collation parsing is done -# throw( "Graph got disconnected breaking $source / $target equivalence" ) -# if $self->_is_disconnected; + throw( "Graph got disconnected breaking $source / $target equivalence" ) + if $self->_is_disconnected && $self->collation->tradition->_initialized; } sub _find_equiv_without { @@ -1002,6 +1013,101 @@ sub _find_equiv_without { return keys %found; } +=head2 rebuild_equivalence + +(Re)build the equivalence graph from scratch. Dumps the graph, makes a new one, +adds all readings and edges, then makes an equivalence for all relationships. + +=cut + +sub rebuild_equivalence { + my $self = shift; + my $newgraph = Graph->new(); + # Set this as the new equivalence graph + $self->_reset_equivalence( $newgraph ); + # Clear out the data hashes + $self->_clear_equivalence; + $self->_clear_eqreadings; + + $self->collation->tradition->_init_done(0); + # Add the readings + foreach my $r ( $self->collation->readings ) { + my $rid = $r->id; + $newgraph->add_vertex( $rid ); + $self->set_equivalence( $rid, $rid ); + $self->set_eqreadings( $rid, [ $rid ] ); + } + + # Now add the edges + foreach my $e ( $self->collation->paths ) { + $self->add_equivalence_edge( @$e ); + } + + # Now equate the colocated readings. This does no testing; + # it assumes that all preexisting relationships are valid. + foreach my $rel ( $self->relationships ) { + my $relobj = $self->get_relationship( $rel ); + next unless $relobj && $relobj->colocated; + $self->_make_equivalence( @$rel ); + } + $self->collation->tradition->_init_done(1); +} + +=head2 equivalence_ranks + +Rank all vertices in the equivalence graph, and return a hash reference with +vertex => rank mapping. + +=cut + +sub equivalence_ranks { + my $self = shift; + my $eqstart = $self->equivalence( $self->collation->start ); + my $eqranks = { $eqstart => 0 }; + my $rankeqs = { 0 => [ $eqstart ] }; + my @curr_origin = ( $eqstart ); + # A little iterative function. + while( @curr_origin ) { + @curr_origin = $self->_assign_rank( $eqranks, $rankeqs, @curr_origin ); + } + return( $eqranks, $rankeqs ); +} + +sub _assign_rank { + my( $self, $node_ranks, $rank_nodes, @current_nodes ) = @_; + my $graph = $self->equivalence_graph; + # Look at each of the children of @current_nodes. If all the child's + # parents have a rank, assign it the highest rank + 1 and add it to + # @next_nodes. Otherwise skip it; we will return when the highest-ranked + # parent gets a rank. + my @next_nodes; + foreach my $c ( @current_nodes ) { + warn "Current reading $c has no rank!" + unless exists $node_ranks->{$c}; + foreach my $child ( $graph->successors( $c ) ) { + next if exists $node_ranks->{$child}; + my $highest_rank = -1; + my $skip = 0; + foreach my $parent ( $graph->predecessors( $child ) ) { + if( exists $node_ranks->{$parent} ) { + $highest_rank = $node_ranks->{$parent} + if $highest_rank <= $node_ranks->{$parent}; + } else { + $skip = 1; + last; + } + } + next if $skip; + my $c_rank = $highest_rank + 1; + # print STDERR "Assigning rank $c_rank to node $child \n"; + $node_ranks->{$child} = $c_rank if $node_ranks; + push( @{$rank_nodes->{$c_rank}}, $child ) if $rank_nodes; + push( @next_nodes, $child ); + } + } + return @next_nodes; +} + ### Output logic sub _as_graphml { diff --git a/lib/Text/Tradition/Directory.pm b/lib/Text/Tradition/Directory.pm index e073503..dfbbeee 100644 --- a/lib/Text/Tradition/Directory.pm +++ b/lib/Text/Tradition/Directory.pm @@ -181,18 +181,21 @@ is( ref( $nt ), 'Text::Tradition', "Made new tradition" ); =end testing =cut +use Text::Tradition::TypeMap::Entry; has +typemap => ( - is => 'rw', - isa => 'KiokuDB::TypeMap', - default => sub { - KiokuDB::TypeMap->new( - isa_entries => { - "Graph" => KiokuDB::TypeMap::Entry::Naive->new, - "Graph::AdjacencyMap" => KiokuDB::TypeMap::Entry::Naive->new, - } - ); - }, + is => 'rw', + isa => 'KiokuDB::TypeMap', + default => sub { + KiokuDB::TypeMap->new( + isa_entries => { + "Text::Tradition" => + KiokuDB::TypeMap::Entry::Naive->new(), + "Graph" => Text::Tradition::TypeMap::Entry->new(), + "Graph::AdjacencyMap" => Text::Tradition::TypeMap::Entry->new(), + } + ); + }, ); # Push some columns into the extra_args @@ -222,7 +225,8 @@ around BUILDARGS => sub { }; ## These checks don't cover store($id, $obj) -before [ qw/ store update insert delete / ] => sub { +# before [ qw/ store update insert delete / ] => sub { +before [ qw/ delete / ] => sub { my $self = shift; my @nontrad; foreach my $obj ( @_ ) { @@ -244,11 +248,11 @@ before [ qw/ store update insert delete / ] => sub { # TODO Garbage collection doesn't work. Suck it up and live with the # inflated DB. -# after delete => sub { -# my $self = shift; -# my $gc = KiokuDB::GC::Naive->new( backend => $self->directory->backend ); -# $self->directory->backend->delete( $gc->garbage->members ); -# }; +after delete => sub { + my $self = shift; + my $gc = KiokuDB::GC::Naive->new( backend => $self->directory->backend ); + $self->directory->backend->delete( $gc->garbage->members ); +}; sub save { my $self = shift; diff --git a/lib/Text/Tradition/Language/Armenian.pm b/lib/Text/Tradition/Language/Armenian.pm new file mode 100644 index 0000000..8d09353 --- /dev/null +++ b/lib/Text/Tradition/Language/Armenian.pm @@ -0,0 +1,73 @@ +package Text::Tradition::Language::Armenian; + +use strict; +use warnings; +use Module::Load; +use parent qw/ Text::Tradition::Language::Perseus /; + +=head1 NAME + +Text::Tradition::Language::Armenian - language-specific module for Armenian + +=head1 DESCRIPTION + +Implements morphology lookup for Armenian (Grabar) words in context. This module +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=begin testing + +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Armenian' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "No Armenian test data yet"; + + my $trad = Text::Tradition->new( + 'language' => 'Armenian', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} + +=end testing + +=cut + +our $dbhandle; + +sub lemmatize { + return __PACKAGE__->perseus_lemmatize( @_ ); +} + +sub reading_lookup { + return __PACKAGE__->perseus_reading_lookup( @_ ); +} + +1; diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm new file mode 100644 index 0000000..f0fe304 --- /dev/null +++ b/lib/Text/Tradition/Language/Base.pm @@ -0,0 +1,304 @@ +package Text::Tradition::Language::Base; + +use strict; +use warnings; +use Encode qw/ encode_utf8 decode_utf8 /; +use Exporter 'import'; +use vars qw/ @EXPORT_OK /; +use IPC::Run qw/ run /; +use Module::Load; +use Text::Tradition::Collation::Reading::Lexeme; +use Text::Tradition::Collation::Reading::WordForm; +use TryCatch; + +@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; + +=head1 NAME + +Text::Tradition::Language::Base - Base subroutines for lemmatization of words + +=head1 DESCRIPTION + +Common routines for applying morphological tagging to a Text::Tradition. Used +with callbacks from the named language packages. + +=head1 SUBROUTINES + +=head2 lemmatize_treetagger( $tradition ) + +Evaluates the tradition with the given options, and returns the results. + +=cut + +sub lemmatize_treetagger { + my( $tradition, %opts ) = @_; + + # Given a tradition, lemmatize it witness by witness and see what we get. + my $c = $tradition->collation; + # First, clear out all existing lexemes from the readings. + my %witness_paths = _clear_reading_lexemes( $tradition ); + + foreach my $sig ( keys %witness_paths ) { + # Get the text as a sequence of readings and as a string + my %witopts = ( + 'path' => $witness_paths{$sig}, + %opts + ); + _lemmatize_treetagger_sequence( %witopts ); + } +} + +sub _clear_reading_lexemes { + my $tradition = shift; + my $c = $tradition->collation; + # Clear out all existing lexemes from the readings. Save the path as long + # as we went to the trouble of generating it. + my %witness_paths; + foreach my $wit ( $tradition->witnesses ) { + my @sigla = ( $wit->sigil ); + push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; + foreach my $sig ( @sigla ) { + my @path = grep { !$_->is_meta } + $c->reading_sequence( $c->start, $c->end, $sig ); + map { $_->clear_lexemes } @path; + $witness_paths{$sig} = \@path; + } + } + return %witness_paths; +} + +=head2 reading_lookup( $rdg[, $rdg, ...] ) + +Looks up one or more readings using the Flemm package, and returns the +possible results. This uses the same logic as L above for the +entire tradition, but can also be used to (re-)analyze individual readings. + +=cut + +sub reading_lookup_treetagger { + my %opts = @_; + $opts{'replace'} = 1; + return _lemmatize_treetagger_sequence( %opts ); +} + +sub _lemmatize_treetagger_sequence { + my %opts = @_; + my @path = @{$opts{'path'}}; + my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} ); + if( $tagresult ) { + # Map the tagged words onto the original readings, splitting + # them up into lexemes where necessary. + # NOTE we can have multiple lexemes in a reading, but not + # multiple readings to a lexeme. + my @tags = split( /\n/, $tagresult ); + my @lexemes; + my $curr_rdg = shift @path; + my @curr_lexemes; + my $unused_rdg_part; + foreach my $tag ( @tags ) { + # Get the original word + my( $lexeme, @rest ) = split( /\t/, $tag ); + # Lemmatize the whole + # TODO error trap this + my @forms = $opts{'callback'}( $tag ); + + my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( + 'string' => $lexeme, 'language' => $opts{'language'}, + 'wordform_matchlist' => \@forms ); + # Find the next non-meta reading + while( $curr_rdg && $curr_rdg->is_meta ) { + $curr_rdg = shift @path; + } + unless( $curr_rdg ) { + warn "Ran out of readings in sequence at $lexeme"; + last; + } + my $curr_rdg_text = $curr_rdg->normal_form; + if( $unused_rdg_part && + $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Nth part of curr_rdg + $unused_rdg_part = $2; + push( @curr_lexemes, $lexobj ); + } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { + # Flag an error if there is already an unused reading part. + warn "Skipped over unused text $unused_rdg_part at $curr_rdg" + if $unused_rdg_part; + $unused_rdg_part = $2; # will be empty if the whole reading matched + push( @curr_lexemes, $lexobj ); + } else { + # We do not cope with the idea of a lexeme being + # spread across multiple readings. + warn "Word sequence changed unexpectedly in text"; + # See if we can find a matching reading + my @lookahead; + my $matched; + while( my $nr = shift @path ) { + my $nrtext = $nr->normal_form; + if( $nrtext =~ /^\Q$lexeme\E/ ) { + $curr_rdg = $lookahead[-1] if @lookahead; + $matched = 1; + last; + } else { + push( @lookahead, $nr ); + } + } + # No match? Restore the state we had + unless( $matched ) { + unshift( @path, @lookahead ); + } + # Trigger a move + $unused_rdg_part = ''; + } + + unless( $unused_rdg_part ) { + # Record the lexemes for the given reading. + #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", + # join( ' ', map { $_->string } @curr_lexemes ), + # $curr_rdg->id, $curr_rdg->text ); + _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes ); + $curr_rdg = shift @path; + @curr_lexemes = (); + } + } + } +} + +sub _update_reading_lexemes { + my( $replace, $reading, @lexemes ) = @_; + if( $reading->has_lexemes && !$replace ) { + # We need to merge what is in @lexemes with what we have already. + my @oldlex = $reading->lexemes; + my $cmp1 = join( '||', map { $_->string } @oldlex ); + my $cmp2 = join( '||', map { $_->string } @lexemes ); + if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) { + # The lexeme strings are the same, so merge the possible + # word forms from new to old. + foreach my $i ( 0 .. $#lexemes ) { + my $ol = $oldlex[$i]; + my $nl = $lexemes[$i]; + my %ofw; + map { $ofw{$_->to_string} = 1 } $ol->matching_forms; + foreach my $form ( $nl->matching_forms ) { + unless( $ofw{$form->to_string} ) { + # print STDERR "Adding form " . $form->to_string . + # " to lexeme " . $nl->string . " at $reading\n"; + $ol->add_matching_form( $form ); + $ol->is_disambiguated(0); + } + } + } + } else { + warn "Lexeme layout for $reading changed; replacing the lot"; + $reading->clear_lexemes; + $reading->add_lexeme( @lexemes ); + } + } else { + $reading->clear_lexemes if $replace; + $reading->add_lexeme( @lexemes ); + } +} + +# Utility function so that we can cheat and use it when we need both the path +# and its text. +sub _text_from_path { + my( $normalize, @path ) = @_; + my $pathtext = ''; + my $last; + foreach my $r ( @path ) { + unless ( $r->join_prior || !$last || $last->join_next ) { + $pathtext .= ' '; + } + $pathtext .= $normalize ? $r->normal_form : $r->text; + $last = $r; + } + return $pathtext; +} + +# Utility function that actually calls the tree tagger. +sub _treetag_string { + my( $text, $lang ) = @_; + my $wittext = encode_utf8( $text ); + # Then see if we have TreeTagger + try { + load( 'Lingua::TreeTagger' ); + } catch { + warn "Cannot run TreeTagger without Lingua::TreeTagger module"; + return ''; + } + # OK, we can run it then. + # First upgrade to UTF8 for necessary languages. + my @utf8_supported = qw/ French Latin Greek /; + my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] ); + if( grep { $_ eq $lang } @utf8_supported ) { + $ttopts{'use_utf8'} = 1; + } + # Now instantiate and run the tagger. + my $tagger = Lingua::TreeTagger->new( %ttopts ); + my $tagresult = $tagger->tag_text( \$text ); + + # TODO maybe send the tokens back rather than the interpreted string... + return $tagresult->as_text(); +} + +=head2 lfs_morph_tags + +Return a data structure describing the available parts of speech and their attributes +from the Lingua::Features::Structure class currently defined. + +=cut + +sub lfs_morph_tags { + load('Lingua::Features::StructureType'); + my $tagset = { 'structures' => [], 'features' => {} }; + foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) { + my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] }; + foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) { + my $ftype = $lfs->feature_type( $ftid ); + if( !$ftype && $lfs->base ) { + $ftype = $lfs->base->feature_type( $ftid ); + } + if( $ftype ) { + push( @{$tsstruct->{'use_features'}}, $ftid ); + if( $ftid eq 'type' ) { + # Type values change according to category + $ftid .= " (" . $lfs->id . ")"; + } + my $tfstruct = { 'id' => $ftid, 'values' => [] }; + foreach my $fval( $ftype->values ) { + push( @{$tfstruct->{'values'}}, + { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } ); + } + $tagset->{'features'}->{$ftid} = $tfstruct; + } + } + push( @{$tagset->{'structures'}}, $tsstruct ); + } + return $tagset; +} + +sub _by_structid { + my( $a, $b ) = @_; + return -1 if $a eq 'cat'; + return 1 if $b eq 'cat'; + return $a cmp $b; +} + +1; + +=head2 TODO + +=over + +=item * Handle package dependencies more gracefully + +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Language/English.pm b/lib/Text/Tradition/Language/English.pm new file mode 100644 index 0000000..e1fee8d --- /dev/null +++ b/lib/Text/Tradition/Language/English.pm @@ -0,0 +1,110 @@ +package Text::Tradition::Language::English; + +use strict; +use warnings; +use Lingua::TagSet::TreeTagger::English; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger + lfs_morph_tags /; +use TryCatch; + +=head1 NAME + +Text::Tradition::Language::English - language-specific module for English + +=head1 DESCRIPTION + +Implements morphology lookup for English words in context. This module +depends on the TreeTagger software +(L), which is +(for now) expected to be installed in $MORPHDIR/TreeTagger. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using the TreeTagger, and returns the results. + +=begin testing + +binmode STDOUT, ':utf8'; +use Text::Tradition; +use_ok( 'Text::Tradition::Language::English' ); + +=end testing + +=cut + +sub lemmatize { + my $tradition = shift; + my %opts = ( + 'language' => 'English', + 'callback' => sub { _parse_wordform( @_ ) } + ); + return lemmatize_treetagger( $tradition, %opts ); +} + +=head2 reading_lookup( $rdg[, $rdg, ...] ) + +Looks up one or more readings using the Flemm package, and returns the +possible results. This uses the same logic as L above for the +entire tradition, but can also be used to (re-)analyze individual readings. + +=cut + +sub reading_lookup { + my( @path ) = @_; + my %opts = ( + 'language' => 'English', + 'callback' => sub { _parse_wordform( @_ ) }, + 'path' => \@path, + ); + return reading_lookup_treetagger( %opts ); +} + +=head2 morphology_tags + +Return a data structure describing the available parts of speech and their attributes. + +=cut + +sub morphology_tags { + return lfs_morph_tags(); +} + +# Utility function to turn a TreeTagger result into a WordForm +sub _parse_wordform { + my $tagresult = shift; + my( $orig, $tag, $lemma ) = split( /\t/, $tagresult ); + return () unless $tag =~ /\w/; # skip punct-only "tags" + my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag ); + if( $morphobj ) { + return ( Text::Tradition::Collation::Reading::WordForm->new( + 'language' => 'English', + 'lemma' => $lemma, + 'morphology' => $morphobj, + ) ); + } else { + warn "No morphology found for word: $tagresult"; + return (); + } +} + +1; + +=head2 TODO + +=over + +=item * Tests! + +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Language/French.pm b/lib/Text/Tradition/Language/French.pm index 8028c98..9dbfd01 100644 --- a/lib/Text/Tradition/Language/French.pm +++ b/lib/Text/Tradition/Language/French.pm @@ -1,21 +1,25 @@ package Text::Tradition::Language::French; -use Encode qw/ encode_utf8 decode_utf8 /; -use IPC::Run qw/ run binary /; -use Module::Load; -use Text::Tradition::Collation::Reading::Lexeme; -use Text::Tradition::Collation::Reading::WordForm; +use strict; +use warnings; +use Lingua::TagSet::Multext; +use Lingua::TagSet::TreeTagger::French; +use Module::Load qw/ load /; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; use TryCatch; -my $MORPHDIR = '/Users/tla/Projects/morphology'; - =head1 NAME -Text::Tradition::Language::French - language-specific modules for French +Text::Tradition::Language::French - language-specific module for French =head1 DESCRIPTION -Implements morphology lookup for French words in context. +Implements morphology lookup for French words in context. This module +depends on the Flemm module for French lemmatization +(L in conjunction with +the TreeTagger software +(L), which is +(for now) expected to be installed in $MORPHDIR/TreeTagger. =head1 SUBROUTINES @@ -23,181 +27,100 @@ Implements morphology lookup for French words in context. Evaluates the string using the Flemm package, and returns the results. -=cut - -sub lemmatize { - my $tradition = shift; - - # Given a tradition, lemmatize it witness by witness and see what we get. - my $workdir = File::Temp->newdir(); - my $c = $tradition->collation; - # First, clear out all existing lexemes from the readings. Save the - # path as long as we went to the trouble of generating it. - my %witness_paths; - foreach my $wit ( $tradition->witnesses ) { - my @sigla = ( $wit->sigil ); - push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered; - foreach my $sig ( @sigla ) { - my @path = grep { !$_->is_meta } - $c->reading_sequence( $c->start, $c->end, $sig ); - map { $_->clear_lexemes } @path; - $witness_paths{$sig} = \@path; - } - } - - foreach my $sig ( keys %witness_paths ) { - # Get the text as a sequence of readings and as a string - print STDERR "Morphologizing witness $sig\n"; - my @path = @{$witness_paths{$sig}}; - my $tagresult = _treetag_string( $c->_text_from_path( @path ) ); - if( $tagresult ) { - # Map the tagged words onto the original readings, splitting - # them up into lexemes where necessary. - # NOTE we can have multiple lexemes in a reading, but not - # multiple readings to a lexeme. - my @tags = split( /\n/, $tagresult ); - my @lexemes; - my $curr_rdg = shift @path; - my @curr_lexemes; - my $unused_rdg_part; - foreach my $tag ( @tags ) { - # Get the original word - my( $lexeme, @rest ) = split( /\t/, $tag ); - # Lemmatize the whole - my @forms = _parse_wordform( _flemm_lookup( $tag ) ); - my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new( - 'string' => $lexeme, 'language' => 'French', - 'wordform_matchlist' => \@forms ); - # Find the next non-meta reading - while( $curr_rdg->is_meta ) { - $curr_rdg = shift @path; - } - unless( $curr_rdg ) { - warn "Ran out of readings in sequence for " . $wit->sigil - . " at $lexeme"; - last; - } - if( $unused_rdg_part && - $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Nth part of curr_rdg - $unused_rdg_part = $2; - push( @curr_lexemes, $lexobj ); - } elsif( $curr_rdg->text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { - # Flag an error if there is already an unused reading part. - warn "Skipped over unused text $unused_rdg_part at $curr_rdg" - if $unused_rdg_part; - $unused_rdg_part = $2; # will be empty if the whole reading matched - push( @curr_lexemes, $lexobj ); - } else { - # We do not cope with the idea of a lexeme being - # spread across multiple readings. - warn "Word sequence changed unexpectedly in text"; - # See if we can find a matching reading - my @lookahead; - my $matched; - while( my $nr = shift @path ) { - if( $nr->text =~ /^\Q$lexeme\E/ ) { - $curr_rdg = $lookahead[-1] if @lookahead; - $matched = 1; - last; - } else { - push( @lookahead, $nr ); - } - } - # No match? Restore the state we had - unless( $matched ) { - unshift( @path, @lookahead ); - } - # Trigger a move - $unused_rdg_part = ''; - } - - unless( $unused_rdg_part ) { - # Record the lexemes for the given reading. - #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n", - # join( ' ', map { $_->string } @curr_lexemes ), - # $curr_rdg->id, $curr_rdg->text ); - _update_reading_lexemes( $curr_rdg, @curr_lexemes ); - $curr_rdg = shift @path; - @curr_lexemes = (); - } +=begin testing + +binmode STDOUT, ':utf8'; +use Text::Tradition; +use_ok( 'Text::Tradition::Language::French' ); + +eval "use Flemm"; +my $err = $@; + +SKIP: { + skip "Package Flemm not found" if $err; + my $tf = Text::Tradition->new( + 'input' => 'Self', + 'file' => 't/data/besoin.xml', + 'language' => 'French' ); + + is( $tf->language, 'French', "Set language okay" ); + $tf->lemmatize(); + # Test the lemmatization. How many readings now have morphological info? + # Do the lexemes match the reading? + my $ambig = 0; + my $flemmed = 0; + foreach my $r ( $tf->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + # Check to see if Flemm actually ran + foreach my $wf ( $l->matching_forms ) { + $flemmed++ if $wf->morphology->get_feature('num'); } + next if $l->is_disambiguated; + $ambig++; } } + is( $ambig, 102, "Found 102 ambiguous forms as expected" ); + ok( $flemmed > 500, "Found enough Flemm info in wordforms" ); + + # Try setting the normal form of a reading and re-analyzing + my $mr = $tf->collation->reading('r99.2'); + is( $mr->text, 'minspire', "Picked correct test reading" ); + is( $mr->language, 'French', "Reading has correct language setting" ); + $mr->normal_form( "m'inspire" ); + $mr->lemmatize; + my @l = $mr->lexemes; + is( @l, 2, "Got two lexemes for new m'inspire reading" ); + is( $l[0]->form->to_string, + '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"', + "New reading has correct first lexeme" ); } -sub _update_reading_lexemes { - my( $reading, @lexemes ) = @_; - if( $reading->has_lexemes ) { - # We need to merge what is in @lexemes with what we have already. - my @oldlex = $reading->lexemes; - my $cmp1 = join( '||', map { $_->string } @oldlex ); - my $cmp2 = join( '||', map { $_->string } @lexemes ); - if ( @oldlex == @lexemes && $cmp1 == $cmp2 ) { - # The lexeme strings are the same, so merge the possible - # word forms from new to old. - foreach my $i ( 0 .. $#lexemes ) { - my $ol = $oldlex[$i]; - my $nl = $lexemes[$i]; - my %ofw; - map { $ofw{$_->_stringify} = 1 } $ol->matching_forms; - foreach my $form ( $nl->matching_forms ) { - unless( $ofw{$form->_stringify} ) { - print STDERR "Adding form " . $form->_stringify . - " to lexeme " . $nl->string . " at $reading\n"; - $ol->add_matching_form( $form ); - $ol->is_disambiguated(0); - } - } - } - } else { - $DB::single = 1; - warn "Lexeme layout for $reading changed; replacing the lot"; - $reading->clear_lexemes; - $reading->add_lexeme( @lexemes ); - } - } else { - $reading->add_lexeme( @lexemes ); - } +=end testing + +=cut + +sub lemmatize { + my $tradition = shift; + my %opts = ( + 'language' => 'French', + 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) } + ); + return lemmatize_treetagger( $tradition, %opts ); } -=head2 word_lookup( $word ) +=head2 reading_lookup( $rdg[, $rdg, ...] ) -Looks up a word using the Flemm package, and returns the possible results. -It is better to use L for context sensitivity. +Looks up one or more readings using the Flemm package, and returns the +possible results. This uses the same logic as L above for the +entire tradition, but can also be used to (re-)analyze individual readings. =cut -sub word_lookup { - my $word = shift; - my $tagresult = _treetag_string( $word ); - my $lemmatizer; - try { - load 'Flemm'; - $lemmatizer = Flemm->new( 'Encoding' => 'utf8', 'Tagger' => 'treetagger' ); - } catch { - warn "Cannot do French word lemmatization without Flemm: @_"; - return; - } - return _parse_wordform( _flemm_lookup( $tagresult ) ); +sub reading_lookup { + my( @path ) = @_; + my %opts = ( + 'language' => 'French', + 'callback' => sub { _parse_wordform( _flemm_lookup( @_ ) ) }, + 'path' => \@path, + ); + return reading_lookup_treetagger( %opts ); } -# Utility function that actually calls the tree tagger. -sub _treetag_string { - my( $text ) = @_; - my $wittext = encode_utf8( $text ); - # Then see if we have TreeTagger - my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-french-utf8"; - unless( -f $taggercmd ) { - warn "Cannot do French word lemmatization without TreeTagger"; - return; - } - # OK, we can run it then. - my @cmd = ( $taggercmd ); - my( $tagresult, $err ); # Capture the output and error - run( \@cmd, \$wittext, \$tagresult, \$err ); - # TODO check for error - return decode_utf8( $tagresult ); +=head2 morphology_tags + +Return a data structure describing the available parts of speech and their attributes. + +=cut + +sub morphology_tags { + return lfs_morph_tags(); } # Closure and utility function for the package lemmatizer @@ -229,18 +152,37 @@ sub _parse_wordform { foreach ( @results ) { my( $orig, $tag, $lemma ) = split( /\t/, $_ ); my( $pos, $morph ) = split( /:/, $tag ); - my $wf = Text::Tradition::Collation::Reading::WordForm->new( - 'language' => 'French', - 'lemma' => $lemma, - 'morphology' => [ split( //, $morph ) ], - ); - push( @forms, $wf ); + my $morphobj; + if( $morph ) { + $morphobj = Lingua::TagSet::Multext->tag2structure( $morph ); + } else { + # Use the TreeTagger info if there is no Flemm morphology. + $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos ); + } + if( $morphobj ) { + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'language' => 'French', + 'lemma' => $lemma, + 'morphology' => $morphobj, + ); + push( @forms, $wf ); + } else { + warn "No morphology found for word: $_"; + } } return @forms; } 1; +=head2 TODO + +=over + +=item * Try to do more things with Perl objects in Flemm and TT + +=back + =head1 LICENSE This package is free software and is provided "as is" without express diff --git a/lib/Text/Tradition/Language/Greek.pm b/lib/Text/Tradition/Language/Greek.pm new file mode 100644 index 0000000..28d69cf --- /dev/null +++ b/lib/Text/Tradition/Language/Greek.pm @@ -0,0 +1,73 @@ +package Text::Tradition::Language::Greek; + +use strict; +use warnings; +use Module::Load; +use parent qw/ Text::Tradition::Language::Perseus /; + +=head1 NAME + +Text::Tradition::Language::Greek - language-specific module for Greek + +=head1 DESCRIPTION + +Implements morphology lookup for Greek words in context. This module +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=begin testing + +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Greek' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "Greek linguistic data not read yet"; + + my $trad = Text::Tradition->new( + 'language' => 'Greek', + 'file' => 't/data/florilegium_graphml.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} + +=end testing + +=cut + +our $dbhandle; + +sub lemmatize { + return __PACKAGE__->perseus_lemmatize( @_ ); +} + +sub reading_lookup { + return __PACKAGE__->perseus_reading_lookup( @_ ); +} + +1; diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm new file mode 100644 index 0000000..2f4a42a --- /dev/null +++ b/lib/Text/Tradition/Language/Latin.pm @@ -0,0 +1,73 @@ +package Text::Tradition::Language::Latin; + +use strict; +use warnings; +use Module::Load; +use parent qw/ Text::Tradition::Language::Perseus /; + +=head1 NAME + +Text::Tradition::Language::Latin - language-specific module for Latin + +=head1 DESCRIPTION + +Implements morphology lookup for Latin words in context. This module +depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=begin testing + +use Text::Tradition; +use_ok( 'Text::Tradition::Language::Latin' ); + +eval "use Lingua::Morph::Perseus"; +my $err = $@; + +SKIP: { + skip "Package Lingua::Morph::Perseus not found" if $err; + + my $trad = Text::Tradition->new( + 'language' => 'Latin', + 'file' => 't/data/legendfrag.xml', + 'input' => 'Self' ); + $trad->lemmatize(); + my $ambig = 0; + foreach my $r ( $trad->collation->readings ) { + next if $r->is_meta; + ok( $r->has_lexemes, "Reading $r has one or more lexemes" ); + my @lex = $r->lexemes; + my $lexstr = join( '', map { $_->string } @lex ); + my $textstr = $r->text; + $textstr =~ s/\s+//g; + is( $textstr, $lexstr, "Lexemes for reading $r match the reading" ); + foreach my $l ( @lex ) { + next unless $l->matches; + next if $l->is_disambiguated; + printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id, + join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) ); + $ambig++; + } + } + is( $ambig, 4, "Found 4 ambiguous forms as expected" ); +} + +=end testing + +=cut + +our $dbhandle; + +sub lemmatize { + return __PACKAGE__->perseus_lemmatize( @_ ); +} + +sub reading_lookup { + return __PACKAGE__->perseus_reading_lookup( @_ ); +} + +1; diff --git a/lib/Text/Tradition/Language/Perseus.pm b/lib/Text/Tradition/Language/Perseus.pm new file mode 100644 index 0000000..ebbcd54 --- /dev/null +++ b/lib/Text/Tradition/Language/Perseus.pm @@ -0,0 +1,136 @@ +package Text::Tradition::Language::Perseus; + +use strict; +use warnings; +use Module::Load; +use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger + lfs_morph_tags /; +use TryCatch; + +=head1 NAME + +Text::Tradition::Language::Perseus - base module for those languages that rely +on a Lingua::Morph::Perseus database. + +=head1 DESCRIPTION + +Implements morphology lookup for words in context. This module depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data. + +=head1 SUBROUTINES + +=head2 lemmatize( $text ) + +Evaluates the string using Treetagger and Perseus, and returns the results. + +=cut + +# tested in child language modules + +sub perseus_lemmatize { + my $self = shift; + my $tradition = shift; + my %opts = ( + 'language' => $tradition->language, + 'callback' => sub { _perseus_lookup_tt( $self, @_ ) } + ); + return lemmatize_treetagger( $tradition, %opts ); +} + +=head2 reading_lookup( $rdg[, $rdg, ...] ) + +Looks up one or more readings using the Perseus package, and returns the +possible results. This skips the tree tagger / tokenizer, returning any +match for the word string(s) in the morphology DB. + +=cut + +sub perseus_reading_lookup { + my( $self, @words ) = @_; + my %opts = ( + 'language' => $self->_get_lang(), + 'callback' => sub { _perseus_lookup_str( $self, @_ ) }, + 'path' => \@words, + ); + return reading_lookup_treetagger( %opts ); +} + +=head2 morphology_tags + +Return a data structure describing the available parts of speech and their attributes. + +=cut + +sub morphology_tags { + return lfs_morph_tags(); +} + +sub _get_lang { + my $self = shift; + my @parts = split( /::/, $self ); + return $parts[-1]; +} + +sub _morph_connect { + my $self = shift; + unless( $self::dbhandle ) { + my $lang = $self->_get_lang(); + try { + load 'Lingua::Morph::Perseus'; + $self::dbhandle = Lingua::Morph::Perseus->connect( $lang ); + } catch { + warn "Cannot do $lang word lemmatization without Lingua::Morph::Perseus: @_"; + return; + } + } +} + +sub _perseus_lookup_tt { + my $self = shift; + my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); + $self->_morph_connect(); + return unless $self::dbhandle; + # Discard results that don't match the lemma, unless lemma is unknown + my $lookupopts = {}; + unless( $lemma eq '' || $lemma =~ /^\W+$/ ) { + my %lems; + map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); + $lookupopts->{'lemma'} = [ keys %lems ]; + } + $lookupopts->{'ttpos'} = $pos if $pos; + + my $result = $self::dbhandle->lexicon_lookup( $orig, $lookupopts ); + # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) { + # warn "Filter on $pos / $lemma returned no results; using all results"; + # } + my @ret = @{$result->{'objects'}}; + my %unique_wordforms; + foreach my $obj ( @ret ) { + my $wf = $self->_wordform_from_row( $obj ); + $unique_wordforms{$wf->to_string} = $wf; + } + return values( %unique_wordforms ); +} + +sub _perseus_lookup_str { + my $self = shift; + my ( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); + $self->_morph_connect(); + return unless $self::dbhandle; + # Simple morph DB lookup, and return the results. Disregard the treetagger. + my $result = $self::dbhandle->lexicon_lookup( $orig ); + return map { $self->_wordform_from_row( $_ ) } @{$result->{'objects'}}; +} + +sub _wordform_from_row { + my( $self, $rowobj ) = @_; + my $lemma = $rowobj->lemma; + $lemma =~ s/^(\D+)\d*$/$1/; + my $wf = Text::Tradition::Collation::Reading::WordForm->new( + 'language' => $self->_get_lang(), + 'lemma' => $lemma, + 'morphology' => $rowobj->morphology, + ); + return $wf; +} + +1; diff --git a/lib/Text/Tradition/Parser/CTE.pm b/lib/Text/Tradition/Parser/CTE.pm index 49bcfe2..be6adfc 100644 --- a/lib/Text/Tradition/Parser/CTE.pm +++ b/lib/Text/Tradition/Parser/CTE.pm @@ -78,10 +78,10 @@ sub parse { $r = $c->add_reading( { id => 'n'.$counter++, text => $item->{'content'} } ); } elsif ( $item->{'type'} eq 'anchor' ) { - $r = $c->add_reading( { id => '#ANCHOR_' . $item->{'content'} . '#', + $r = $c->add_reading( { id => '__ANCHOR_' . $item->{'content'} . '__', is_ph => 1 } ); } elsif ( $item->{'type'} eq 'app' ) { - my $tag = '#APP_' . $counter++ . '#'; + my $tag = '__APP_' . $counter++ . '__'; $r = $c->add_reading( { id => $tag, is_ph => 1 } ); $apps{$tag} = $item->{'content'}; } @@ -111,7 +111,9 @@ sub parse { sub _stringify_sigil { my( @nodes ) = @_; my @parts = grep { /\w/ } map { $_->data } @nodes; - return join( '', @parts ); + my $whole = join( '', @parts ); + $whole =~ s/\W//g; + return $whole; } # Get rid of all the formatting elements that get in the way of tokenization. @@ -207,14 +209,14 @@ sub _add_readings { # Get the lemma, which is all the readings between app and anchor, # excluding other apps or anchors. my @lemma = _return_lemma( $c, $app_id, $anchor ); - my $lemma_str = join( ' ', grep { $_ !~ /^\#/ } map { $_->text } @lemma ); + my $lemma_str = join( ' ', grep { $_ !~ /^__/ } map { $_->text } @lemma ); # For each reading, send its text to 'interpret' along with the lemma, # and then save the list of witnesses that these tokens belong to. my %wit_rdgs; # Maps from witnesses to the variant text my $ctr = 0; my $tag = $app_id; - $tag =~ s/^\#APP_(.*)\#$/$1/; + $tag =~ s/^\__APP_(.*)\__$/$1/; foreach my $rdg ( $xn->getChildrenByTagName( 'rdg' ) ) { my @text; @@ -230,11 +232,11 @@ sub _add_readings { my @rdg_nodes; if( $interpreted eq '#LACUNA#' ) { - push( @rdg_nodes, $c->add_reading( { id => $tag . "/" . $ctr++, + push( @rdg_nodes, $c->add_reading( { id => 'r'.$tag.".".$ctr++, is_lacuna => 1 } ) ); } else { foreach my $w ( split( /\s+/, $interpreted ) ) { - my $r = $c->add_reading( { id => $tag . "/" . $ctr++, + my $r = $c->add_reading( { id => 'r'.$tag.".".$ctr++, text => $w } ); push( @rdg_nodes, $r ); } @@ -272,12 +274,12 @@ sub _add_readings { sub _anchor_name { my $xmlid = shift; $xmlid =~ s/^\#//; - return sprintf( "#ANCHOR_%s#", $xmlid ); + return sprintf( "__ANCHOR_%s__", $xmlid ); } sub _return_lemma { my( $c, $app, $anchor ) = @_; - my @nodes = grep { $_->id !~ /^\#A(PP|NCHOR)/ } + my @nodes = grep { $_->id !~ /^__A(PP|NCHOR)/ } $c->reading_sequence( $c->reading( $app ), $c->reading( $anchor ), $c->baselabel ); return @nodes; diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 3521d9f..628e672 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -132,13 +132,18 @@ sub parse { $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); } } else { # type 'relationship' - $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, - { 'type' => 'transposition' } ); + if( $collation->linear ) { + $collation->add_relationship( $from->{$IDKEY}, $to->{$IDKEY}, + { 'type' => 'transposition' } ); + } else { + $collation->merge_readings( $from->{$IDKEY}, $to->{$IDKEY} ); + } } } # Rank the readings. - $collation->calculate_common_readings(); # will implicitly rank + $collation->calculate_common_readings() + if $collation->linear; # will implicitly rank # Save the text for each witness so that we can ensure consistency # later on diff --git a/lib/Text/Tradition/Parser/JSON.pm b/lib/Text/Tradition/Parser/JSON.pm index 1d618dc..0734856 100644 --- a/lib/Text/Tradition/Parser/JSON.pm +++ b/lib/Text/Tradition/Parser/JSON.pm @@ -207,7 +207,7 @@ sub make_nodes { if( exists( $unique{$word} ) ) { $rdg = $unique{$word}; } else { - my %args = ( 'id' => join( ',', $idx, $j+1 ), + my %args = ( 'id' => 'r' . join( '.', $idx, $j+1 ), 'rank' => $idx, 'text' => $word, 'collation' => $c ); diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 432d8a3..8483891 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -114,9 +114,10 @@ if( $t ) { is( scalar $t->witnesses, 13, "Collation has all witnesses" ); } -# TODO add a relationship, write graphml, reparse it, check that the rel -# is still there +# TODO add a relationship, add a stemma, write graphml, reparse it, check that +# the new data is there $t->language('Greek'); +$t->add_stemma( 'dotfile' => 't/data/florilegium.dot' ); $t->collation->add_relationship( 'w12', 'w13', { 'type' => 'grammatical', 'scope' => 'global', 'annotation' => 'This is some note' } ); @@ -134,6 +135,8 @@ if( $newt ) { my $rel = $newt->collation->get_relationship( 'w12', 'w13' ); ok( $rel, "Found set relationship" ); is( $rel->annotation, 'This is some note', "Relationship has its properties" ); + is( scalar $newt->stemmata, 1, "Tradition has its stemma" ); + is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" ); } @@ -160,6 +163,10 @@ sub parse { my $val = $graph_data->{'global'}->{$gkey}; if( $gkey eq 'version' ) { $use_version = $val; + } elsif( $gkey eq 'stemmata' ) { # Special case, yuck + foreach my $dotstr ( split( /\n/, $val ) ) { + $tradition->add_stemma( 'dot' => $dotstr ); + } } elsif( $tmeta->has_attribute( $gkey ) ) { $tradition->$gkey( $val ); } else { @@ -167,7 +174,10 @@ sub parse { } } - # Add the nodes to the graph. + # Add the nodes to the graph. + # Note any reading IDs that were changed in order to comply with XML + # name restrictions; we have to hardcode start & end. + my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' ); # print STDERR "Adding collation readings\n"; foreach my $n ( @{$graph_data->{'nodes'}} ) { @@ -179,13 +189,20 @@ sub parse { next; } my $gnode = $collation->add_reading( $n ); + if( $gnode->id ne $n->{'id'} ) { + $namechange{$n->{'id'}} = $gnode->id; + } } # Now add the edges. # print STDERR "Adding collation path edges\n"; foreach my $e ( @{$graph_data->{'edges'}} ) { - my $from = $collation->reading( $e->{'source'}->{'id'} ); - my $to = $collation->reading( $e->{'target'}->{'id'} ); + my $sourceid = exists $namechange{$e->{'source'}->{'id'}} + ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; + my $targetid = exists $namechange{$e->{'target'}->{'id'}} + ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; + my $from = $collation->reading( $sourceid ); + my $to = $collation->reading( $targetid ); warn "No witness label on path edge!" unless $e->{'witness'}; my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' ); @@ -206,8 +223,12 @@ sub parse { # TODO check that scoping does trt $rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) { - my $from = $collation->reading( $e->{'source'}->{'id'} ); - my $to = $collation->reading( $e->{'target'}->{'id'} ); + my $sourceid = exists $namechange{$e->{'source'}->{'id'}} + ? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'}; + my $targetid = exists $namechange{$e->{'target'}->{'id'}} + ? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'}; + my $from = $collation->reading( $sourceid ); + my $to = $collation->reading( $targetid ); delete $e->{'source'}; delete $e->{'target'}; # The remaining keys are relationship attributes. diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index b8e3439..a9ce519 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -180,11 +180,14 @@ sub parse { $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); } + my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000; + print STDERR "Tradition too big for row collation\n" if $nocollate; + # Now for the next rows, make nodes as necessary, assign their ranks, and # add them to the witness paths. foreach my $idx ( 1 .. $#{$alignment_table} ) { my $row = $alignment_table->[$idx]; - my $nodes = _make_nodes( $c, $row, $idx ); + my $nodes = _make_nodes( $c, $row, $idx, $nocollate ); foreach my $w ( 0 .. $#{$row} ) { # push the appropriate node onto the appropriate witness path my $word = $row->[$w]; @@ -252,11 +255,11 @@ sub parse { # Note that our ranks and common readings are set. $c->_graphcalc_done(1); # Remove redundant collation relationships. - $c->relations->filter_collations(); + $c->relations->filter_collations() unless $nocollate; } sub _make_nodes { - my( $collation, $row, $index ) = @_; + my( $collation, $row, $index, $nocollate ) = @_; my %unique; my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae. foreach my $w ( @$row ) { @@ -266,7 +269,7 @@ sub _make_nodes { my $ctr = 1; foreach my $w ( keys %unique ) { my $rargs = { - 'id' => "$index,$ctr", + 'id' => "r$index.$ctr", 'rank' => $index, 'text' => $w, }; @@ -280,24 +283,25 @@ sub _make_nodes { $ctr++; } # Collate this sequence of readings via a single 'collation' relationship. - my @rankrdgs = values %unique; - my $collation_rel; - while( @rankrdgs ) { - my $r = shift @rankrdgs; - next if $r->is_meta; - foreach my $nr ( @rankrdgs ) { - next if $nr->is_meta; - if( $collation_rel ) { - $collation->add_relationship( $r, $nr, $collation_rel ); - } else { - $collation->add_relationship( $r, $nr, - { 'type' => 'collated', - 'annotation' => "Parsed together for rank $index" } ); - $collation_rel = $collation->get_relationship( $r, $nr ); - } - } - } - + unless( $nocollate ) { + my @rankrdgs = values %unique; + my $collation_rel; + while( @rankrdgs ) { + my $r = shift @rankrdgs; + next if $r->is_meta; + foreach my $nr ( @rankrdgs ) { + next if $nr->is_meta; + if( $collation_rel ) { + $collation->add_relationship( $r, $nr, $collation_rel ); + } else { + $collation->add_relationship( $r, $nr, + { 'type' => 'collated', + 'annotation' => "Parsed together for rank $index" } ); + $collation_rel = $collation->get_relationship( $r, $nr ); + } + } + } + } return \%unique; } diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index c711430..d6f9646 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -107,7 +107,7 @@ has graph => ( isa => 'Graph', predicate => 'has_graph', ); - + sub BUILD { my( $self, $args ) = @_; # If we have been handed a dotfile, initialize it into a graph. @@ -296,7 +296,7 @@ sub extend_graph { # Iterate through, adding a.c. witnesses my $actag = $self->collation->ac_label; - my $graph = $self->graph->copy; + my $graph = $self->graph->deep_copy; foreach my $lw ( @$layerwits ) { # Add the layered witness and set it with the same attributes as # its 'main' analogue @@ -349,8 +349,27 @@ sub as_svg { # Get rid of width and height attributes to allow scaling. my $parser = XML::LibXML->new(); my $svgdoc = $parser->parse_string( decode_utf8( $svg ) ); - $svgdoc->documentElement->removeAttribute('width'); - $svgdoc->documentElement->removeAttribute('height'); + if( $opts->{'size'} ) { + my( $ew, $eh ) = @{$opts->{'size'}}; + # If the graph is wider than it is tall, set width to ew and remove height. + # Otherwise set height to eh and remove width. + my $width = $svgdoc->documentElement->getAttribute('width'); + my $height = $svgdoc->documentElement->getAttribute('height'); + $width =~ s/\D+//g; + $height =~ s/\D+//g; + my( $remove, $keep, $val ); + if( $width > $height ) { + $remove = 'height'; + $keep = 'width'; + $val = $ew . 'px'; + } else { + $remove = 'width'; + $keep = 'height'; + $val = $eh . 'px'; + } + $svgdoc->documentElement->removeAttribute( $remove ); + $svgdoc->documentElement->setAttribute( $keep, $val ); + } # Return the result return decode_utf8( $svgdoc->toString ); } diff --git a/lib/Text/Tradition/TypeMap/Entry.pm b/lib/Text/Tradition/TypeMap/Entry.pm new file mode 100644 index 0000000..00cdfd1 --- /dev/null +++ b/lib/Text/Tradition/TypeMap/Entry.pm @@ -0,0 +1,42 @@ +package Text::Tradition::TypeMap::Entry; +use Moose; + +no warnings 'recursion'; + +use namespace::clean -except => 'meta'; + +with qw(KiokuDB::TypeMap::Entry::Std); + +use YAML::XS (); + +sub compile_collapse_body { + my ( $self, $class ) = @_; + + return sub { + my ( $self, %args ) = @_; + + my $object = $args{object}; + + return $self->make_entry( + %args, + data => YAML::XS::Dump($object) + ); + }; +} + +sub compile_expand { + my ( $self, $class ) = @_; + + return sub { + my ( $self, $entry ) = @_; + $self->inflate_data( YAML::XS::Load($entry->data), \( my $obj ), $entry ); + + bless $obj, $class; + }; +} + +sub compile_refresh { return sub { die "TODO" } } + +__PACKAGE__->meta->make_immutable; + +1; diff --git a/script/dblookup.pl b/script/dblookup.pl new file mode 100755 index 0000000..2346fde --- /dev/null +++ b/script/dblookup.pl @@ -0,0 +1,81 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use strict; +use warnings; +use File::Basename; +use Getopt::Long; +use Text::Tradition; +use Text::Tradition::Directory; + +binmode( STDOUT, ':utf8' ); +binmode( STDERR, ':utf8' ); + +my( $name, $delete, $dbuser, $dbpass ); +my( $list, $dsn ) = ( 1, 'dbi:SQLite:dbname=stemmaweb/db/traditions.db' ); + +GetOptions( + 'r|rename=s' => \$name, + 'd|delete' => \$delete, + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + ); + +my @uuids = @ARGV; # UUID is whatever is left over +my %dbargs = ( 'dsn' => $dsn ); +$dbargs{'extra_args'} = { 'user' => $dbuser } if $dbuser; +$dbargs{'extra_args'}->{'password'} = $dbpass if $dbpass; +my $kdb = Text::Tradition::Directory->new( %dbargs ); +$list = !$delete; + +if( $delete ) { + print STDERR "Must specify the UUID of a tradition to delete\n" unless @uuids; + my $scope = $kdb->new_scope(); + foreach my $uuid ( @uuids ) { + if( $kdb->exists( $uuid ) ) { + $kdb->delete( $uuid ); + } else { + print STDERR "No object found with ID $uuid\n"; + } + } +} + +if( $name ) { + print STDERR "Must specify the UUID of a tradition to rename\n" unless @uuids; + if( @uuids > 1 ) { + print STDERR "Multiple traditions given for rename - do you really want to do that?\n"; + } else { + my $scope = $kdb->new_scope(); + my $tradition = $kdb->lookup( $uuids[0] ); + if( $tradition ) { + $tradition->name( $name ); + $kdb->save( $tradition ); + } else { + print STDERR "Unable to find tradition @uuids to rename\n"; + } + } +} + +# Now list the DB contents if appropriate. +if( $list ) { + my $scope = $kdb->new_scope(); + foreach my $tref ( $kdb->traditionlist ) { + my $tid = $tref->{'id'}; + # If no IDs were given on the command line, list all traditions. + if( @uuids ) { + next unless grep { $_ eq $tid } @uuids; + } + my $t = $kdb->lookup( $tid ); + print STDERR "$tid: Tradition '" . $t->name . "'\n"; + my @wits = map { $_->sigil } $t->witnesses; + print STDERR "...with witnesses @wits\n"; + my $c = $t->collation; + print STDERR "...collation has " . scalar( $c->readings ) . " readings\n"; + print STDERR "...collation has " . scalar( $c->paths ) . " paths\n"; + print STDERR "...collation has " . scalar( $c->relationships ) . " relationship links\n"; + foreach my $s ( $t->stemmata ) { + print STDERR "...associated stemma has graph " . $s->graph . "\n"; + } + } +} diff --git a/script/join_readings.pl b/script/join_readings.pl new file mode 100755 index 0000000..a5ef593 --- /dev/null +++ b/script/join_readings.pl @@ -0,0 +1,57 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use feature 'say'; +use strict; +use warnings; +use Getopt::Long; +use Text::Tradition::Directory; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my( $dbuser, $dbpass ); +my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; +my $testrun; + +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + 'n|test' => \$testrun, + ); + +my $dbopts = { dsn => $dsn }; +$dbopts->{extra_args}->{user} = $dbuser if $dbuser; +$dbopts->{extra_args}->{password} = $dbpass if $dbpass; + +my $dir = Text::Tradition::Directory->new( $dbopts ); + +my $scope = $dir->new_scope(); +my $lookfor = $ARGV[0] || ''; +foreach my $tinfo ( $dir->traditionlist() ) { + next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; + my $tradition = $dir->lookup( $tinfo->{'id'} ); + my $c = $tradition->collation; + + # Anywhere in the graph that there is a reading that joins only to a single + # successor, and neither of these have any relationships, just join the two + # readings. + + # Save/update the current path texts + foreach my $wit ( $tradition->witnesses ) { + my @pathtext = split( /\s+/, $c->path_text( $wit->sigil ) ); + $wit->text( \@pathtext ); + if( $wit->is_layered ) { + my @layertext = split( /\s+/, $c->path_text( $wit->sigil.$c->ac_label ) ); + $wit->layertext( \@layertext ); + } + } + + # Do the deed + $c->compress_readings(); + # ...and save it. + $dir->save( $tradition ); +} \ No newline at end of file diff --git a/script/make_svg.pl b/script/make_svg.pl deleted file mode 100644 index 334c075..0000000 --- a/script/make_svg.pl +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/perl - -use lib 'lib'; -use strict; -use warnings; -use Text::Tradition::Graph; - -# First: read the base. Make a graph, but also note which -# nodes represent line beginnings. - -open( GRAPH, $ARGV[0] ) or die "Could not read file $ARGV[0]"; -my @lines = ; -close GRAPH; -my $graphml_str = join( '', @lines ); - -my $collation_graph = Text::Tradition::Graph->new( - 'GraphML' => $graphml_str, - ); - -print $collation_graph->as_svg(); -print STDERR "DONE\n"; diff --git a/script/make_tradition.pl b/script/make_tradition.pl index 97e8145..16d8519 100755 --- a/script/make_tradition.pl +++ b/script/make_tradition.pl @@ -13,10 +13,11 @@ binmode STDERR, ":utf8"; binmode STDOUT, ":utf8"; eval { no warnings; binmode $DB::OUT, ":utf8"; }; -my( $informat, $inbase, $outformat, $help, $language, $name, $sep, $stemmafile, - $dsn, $dbuser, $dbpass, $from, $to ) - = ( '', '', '', '', 'Default', 'Tradition', "\t", '', - "dbi:SQLite:dbname=stemmaweb/db/traditions.db", undef, undef, undef, undef ); +# Variables with defaults +my( $informat, $outformat, $language, $name, $sep, $dsn ) = ( '', '', 'Default', + 'Tradition', "\t", "dbi:SQLite:dbname=stemmaweb/db/traditions.db" ); +# Variables with no default +my( $inbase, $help, $stemmafile, $dbuser, $dbpass, $from, $to, $dbid, $debug, $nonlinear ); GetOptions( 'i|in=s' => \$informat, 'b|base=s' => \$inbase, @@ -29,15 +30,18 @@ GetOptions( 'i|in=s' => \$informat, 'p|pass=s' => \$dbpass, 'f|from=s' => \$from, 't|to=s' => \$to, + 'nl|nonlinear' => \$nonlinear, 'sep=s' => \$sep, 'dsn=s' => \$dsn, + 'dbid=s' => \$dbid, + 'debug' => \$debug ); if( $help ) { help(); } -unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX|tab(ular)?)|stone$/i ) { +unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX|tab(ular)?)|stone|db$/i ) { help( "Input format must be one of CollateX, CSV, CTE, Self, TEI" ); } $informat = 'CollateX' if $informat =~ /^c(ollate)?x$/i; @@ -53,8 +57,8 @@ unless( $outformat =~ /^(graphml|svg|dot|stemma|csv|db)$/ ) { } if( $from || $to ) { - help( "Subgraphs only supported in GraphML format" ) - unless $outformat eq 'graphml'; + help( "Subgraphs only supported in GraphML, dot, or SVG format" ) + unless $outformat =~ /^(graphml|dot|svg)$/; } # Do we have a base if we need it? @@ -64,20 +68,31 @@ if( $informat =~ /^(KUL|CollateText)$/ && !$inbase ) { $sep = "\t" if $sep eq 'tab'; my $input = $ARGV[0]; - -# First: read the base. Make a graph, but also note which -# nodes represent line beginnings. -my %args = ( 'input' => $informat, - 'file' => $input ); -$args{'base'} = $inbase if $inbase; -$args{'language'} = $language if $language; -$args{'name'} = $name if $name; -$args{'sep_char'} = $sep if $informat eq 'Tabular'; -### Custom hacking for Stone -if( $informat eq 'CollateText' ) { - $args{'sigla'} = [ qw/ S M X V Z Bb B K W L / ]; +my $tradition; +my $dir; +if( $informat eq 'db' ) { + my $dbargs = { dsn => $dsn }; + $dbargs->{'extra_args'}->{'user'} = $dbuser if $dbuser; + $dbargs->{'extra_args'}->{'password'} = $dbpass if $dbpass; + $dir = Text::Tradition::Directory->new( $dbargs ); + my $scope = $dir->new_scope(); + $tradition = $dir->lookup( $input ); +} else { + # First: read the base. Make a graph, but also note which + # nodes represent line beginnings. + my %args = ( 'input' => $informat, + 'file' => $input ); + $args{'linear'} = 0 if $nonlinear; + $args{'base'} = $inbase if $inbase; + $args{'language'} = $language if $language; + $args{'name'} = $name if $name; + $args{'sep_char'} = $sep if $informat eq 'Tabular'; + ### Custom hacking for Stone + if( $informat eq 'CollateText' ) { + $args{'sigla'} = [ qw/ S M X V Z Bb B K W L / ]; + } + $tradition = Text::Tradition->new( %args ); } -my $tradition = Text::Tradition->new( %args ); if( $stemmafile ) { my $stemma = $tradition->add_stemma( dotfile => $stemmafile ); print STDERR "Saved stemma at $stemmafile\n" if $stemma; @@ -92,19 +107,27 @@ if( $outformat eq 'stemma' ) { print STDERR "Bad result: " . $e->message; } } elsif( $outformat eq 'db' ) { - my $extra_args = { 'create' => 1 }; - $extra_args->{'user'} = $dbuser if $dbuser; - $extra_args->{'password'} = $dbpass if $dbpass; - my $dir = Text::Tradition::Directory->new( 'dsn' => $dsn, - 'extra_args' => $extra_args ); + unless( $dir ) { + my $extra_args = { 'create' => 1 }; + $extra_args->{'user'} = $dbuser if $dbuser; + $extra_args->{'password'} = $dbpass if $dbpass; + $dir = Text::Tradition::Directory->new( 'dsn' => $dsn, + 'extra_args' => $extra_args ); + } my $scope = $dir->new_scope; - my $uuid = $dir->store( $tradition ); + my $uuid; + if( $dbid ) { + $uuid = $dir->store( $dbid => $tradition ); + } else { + $uuid = $dir->store( $tradition ); + } print STDERR "Saved tradition to database with ID $uuid\n"; } else { my $output = "as_$outformat"; my $opts = {}; $opts->{'from'} = $from if $from; $opts->{'to'} = $to if $to; + $opts->{'nocalc'} = 1 if $debug; print $tradition->collation->$output( $opts ); } diff --git a/script/orth_case_links.pl b/script/orth_case_links.pl index a2ddbf1..9487984 100755 --- a/script/orth_case_links.pl +++ b/script/orth_case_links.pl @@ -1,27 +1,40 @@ #!/usr/bin/env perl use lib 'lib'; +use feature 'say'; use strict; use warnings; +use Getopt::Long; use Text::Tradition::Directory; +use TryCatch; -binmode STDERR, ':utf8'; binmode STDOUT, ':utf8'; -eval { no warnings; binmode $DB::OUT, ':utf8' }; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my( $dbuser, $dbpass ); +my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; +my $testrun; + +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + 'n|test' => \$testrun, + ); -my( $dsn, $user, $pass ) = @ARGV; +my $dbopts = { dsn => $dsn }; +$dbopts->{extra_args}->{user} = $dbuser if $dbuser; +$dbopts->{extra_args}->{password} = $dbpass if $dbpass; -my $connect_args = { dsn => $dsn }; -$connect_args->{'extra_args'} = { user => $user, password => $pass } - if $user && $pass; -my $dir = Text::Tradition::Directory->new( $connect_args ); +my $dir = Text::Tradition::Directory->new( $dbopts ); -foreach my $text ( $dir->traditionlist ) { - my $id = $text->{'id'}; - next unless $text->{'name'} =~ /Virtutes/; - my $scope = $dir->new_scope; - my $tradition = $dir->lookup( $id ); - print STDERR "Processing tradition " . $tradition->name . "\n"; +my $scope = $dir->new_scope(); +my $lookfor = $ARGV[0] || ''; +foreach my $tinfo ( $dir->traditionlist() ) { + next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; + my $tradition = $dir->lookup( $tinfo->{id} ); + say STDERR "Processing tradition " . $tradition->name; my $c = $tradition->collation; $c->flatten_ranks(); # just in case foreach my $rank ( 1 .. $c->end->rank - 1 ) { @@ -34,15 +47,19 @@ foreach my $text ( $dir->traditionlist ) { my @orthmatch = grep { lc( $r->text ) eq lc( $_->text ) } @readings; foreach my $om ( @orthmatch ) { if( $r->text eq $om->text ) { - print STDERR "Merging identical readings $r and $om (" - . $r->text . ")\n"; + say STDERR "Merging identical readings $r and $om (" + . $r->text . ")"; $merged{$om->id} = 1; $c->merge_readings( $r, $om ); - } elsif ( $c->get_relationship( $r, $om ) ) { - print STDERR sprintf( "Adding orthographic link for %s and %s (%s / %s)\n", + } else { + say STDERR sprintf( "Adding orthographic link for %s and %s (%s / %s)", $r->id, $om->id, $r->text, $om->text ); - $c->add_relationship( $r, $om, - { 'type' => 'orthographic', 'scope' => 'global' } ); + try { + $c->add_relationship( $r, $om, + { 'type' => 'orthographic', 'scope' => 'global' } ); }; + } catch ( Text::Tradition::Error $e ) { + say STDERR "Relationship skipped: " . $e->message; + } } } } diff --git a/script/poslink.pl b/script/poslink.pl new file mode 100755 index 0000000..4b99b9f --- /dev/null +++ b/script/poslink.pl @@ -0,0 +1,182 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use feature 'say'; +use strict; +use warnings; +use Getopt::Long; +use Lingua::Features::Structure; +use Text::Tradition::Directory; +use XML::Easy::Syntax qw/ $xml10_name_rx $xml10_namestartchar_rx /; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my( $dbuser, $dbpass ); +my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; + +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + ); + +my $dbopts = { dsn => $dsn }; +$dbopts->{extra_args}->{user} = $dbuser if $dbuser; +$dbopts->{extra_args}->{password} = $dbpass if $dbpass; + +my $dir = Text::Tradition::Directory->new( $dbopts ); + +my $scope = $dir->new_scope(); +my $lookfor = $ARGV[0] || ''; +foreach my $tinfo ( $dir->traditionlist() ) { + next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; + my $tradition = $dir->lookup( $tinfo->{'id'} ); + say STDERR "Found " . $tradition->name; + my $c = $tradition->collation; + $c->_set_tradition( $tradition ); + + # Propagate lexeme forms across transposition links + foreach my $rel ( $c->relationships ) { + next unless $c->get_relationship( $rel )->type eq 'transposition'; + my $rdg_a = $c->reading( $rel->[0] ); + my $rdg_b = $c->reading( $rel->[1] ); + if( $rdg_a->disambiguated && !$rdg_b->disambiguated ) { + propagate_lexemes( $rdg_a, $rdg_b ); + } elsif( $rdg_b->disambiguated && !$rdg_a->disambiguated ) { + propagate_lexemes( $rdg_b, $rdg_a ); + } elsif( !$rdg_a->disambiguated && !$rdg_b->disambiguated ) { + say STDERR "Transposition link with nothing disambiguated: @$rel"; + } + } + + + # Make the changes + foreach my $rank ( 1 .. $c->end->rank - 1 ) { + my @rankrdgs = $c->readings_at_rank( $rank ); + # Propagate lexemes and normal forms across spelling / orthographic links + my %propagated; + foreach my $r ( @rankrdgs ) { + next if $propagated{$r->id}; + my @samewords = $c->related_readings( $r, + sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } ); + push( @samewords, $r ); + map { $propagated{$_->id} = 1 } @samewords; + next if @samewords == 1; + + my( @haslex, @needslex ); + foreach my $w ( @samewords ) { + if( $w->disambiguated ) { + push( @haslex, $w ); + } else { + push( @needslex, $w ); + } + } + # Check that the lexeme forms match for the readings in @haslex + unless( @haslex ) { + say STDERR "Multiple same word readings with no disambiguation at rank $rank"; + next; + } + my $form; + my $consistent = 1; + foreach my $w ( @haslex ) { + my $wf = join( '//', map { $_->form->to_string } $w->lexemes ); + $form = $wf unless $form; + unless( $wf eq $form ) { + warn "Conflicting lexeme on $w at rank $rank"; + $consistent = 0; + } + } + if( $consistent && @haslex ) { + my $ref = shift @haslex; + foreach my $w ( @needslex ) { + propagate_lexemes( $ref, $w ); + } + } + } + + while( @rankrdgs ) { + my $r = shift @rankrdgs; + next if $r->is_meta; + next if $r->is_nonsense; + next unless $r->has_lexemes; + next if grep { !$_->is_disambiguated } $r->lexemes; + my $rlem = join( ' ', map { $_->form->lemma } $r->lexemes ); + my @rpos = map { $_->form->morphstr } $r->lexemes; + foreach my $rdg ( @rankrdgs ) { + next if $r eq $rdg; + next if $rdg->is_nonsense; + next unless $rdg->has_lexemes; + next if grep { !$_->is_disambiguated } $rdg->lexemes; + next if is_sameword( $c, $r, $rdg ); + # Do the grammatical link if applicable + my $gram; + if( join( ' ', map { $_->form->lemma } $rdg->lexemes ) eq $rlem + && $rlem !~ /\/ ) { + say sprintf( "Linking %s (%s) and %s (%s) with grammatical rel", + $r, $r->text, $rdg, $rdg->text ); + $c->add_relationship( $r, $rdg, { 'type' => 'grammatical' } ); + $gram = 1; + } + + # Do a punctuation link (instead of a lexical link) if applicable + my $punct; + if( $rdg->text =~ /^[[:punct:]]$/ && $r->text =~ /^[[:punct:]]$/ ) { + say sprintf( "Linking %s (%s) and %s (%s) with punctuation rel", + $r, $r->text, $rdg, $rdg->text ); + $c->add_relationship( $r, $rdg, { 'type' => 'punctuation' } ); + $punct = 1; + } + + # Do the lexical link if applicable + my @rdgpos = map { $_->form->morphstr } $rdg->lexemes; + next unless @rpos == @rdgpos; + my $lex = 1; + foreach my $i ( 0 .. $#rpos ) { + my $rst = Lingua::Features::Structure->from_string( $rpos[$i] ); + my $rdgst = Lingua::Features::Structure->from_string( $rdgpos[$i] ); + unless( $rst && $rdgst ) { + warn "Did not get morph structure from " . + $rst ? $rdgpos[$i] : $rpos[$i]; + next; + } + unless( $rst->is_compatible( $rdgst ) ) { + $lex = 0; + } + } + if( $lex && !$punct ) { + if( $gram ) { + warn sprintf( "Grammatical link already made for %s (%s) / %s (%s)", + $r, $r->text, $rdg, $rdg->text ); + } else { + say sprintf( "Linking %s (%s) and %s (%s) with lexical rel", + $r, $r->text, $rdg, $rdg->text ); + $c->add_relationship( $r, $rdg, { 'type' => 'lexical' } ); + } + } + } + } + } + + # Save the lot + # print $c->as_svg( { nocalc => 1 } ); + $dir->save( $tradition ); +} + +sub is_sameword { + my( $c, $rdg1, $rdg2 ) = @_; + my @samewords = $c->related_readings( $rdg1, + sub { $_[0]->type eq 'spelling' || $_[0]->type eq 'orthographic' } ); + my @in_set = grep { $_ eq $rdg2 } @samewords; + return scalar @in_set; +} + +sub propagate_lexemes { + my( $from, $to ) = @_; + say sprintf( "Copying lexical form from %s (%s) to %s (%s)", + $from, $from->text, $to, $to->text ); + $to->normal_form( $from->normal_form ); + $to->_deserialize_lexemes( $from->_serialize_lexemes ); +} \ No newline at end of file diff --git a/script/propagate_transitive.pl b/script/propagate_transitive.pl new file mode 100755 index 0000000..2d49467 --- /dev/null +++ b/script/propagate_transitive.pl @@ -0,0 +1,123 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use feature 'say'; +use strict; +use warnings; +use Getopt::Long; +use Text::Tradition::Directory; +use TryCatch; + +binmode STDOUT, ':utf8'; +binmode STDERR, ':utf8'; +eval { no warnings; binmode $DB::OUT, ':utf8'; $DB::deep = 1000 }; + +my %TYPEVALUES = ( + orthographic => 1, + spelling => 2, + grammatical => 3, + lexical => 3, + collated => 50, + ); + +my( $dbuser, $dbpass ); +my $dsn = 'dbi:SQLite:dbname=stemmaweb/db/traditions.db'; +my $testrun; + +GetOptions( + 'dsn=s' => \$dsn, + 'u|user=s' => \$dbuser, + 'p|pass=s' => \$dbpass, + 'n|test' => \$testrun, + ); + +my $dbopts = { dsn => $dsn }; +$dbopts->{extra_args}->{user} = $dbuser if $dbuser; +$dbopts->{extra_args}->{password} = $dbpass if $dbpass; + +my $dir = Text::Tradition::Directory->new( $dbopts ); + +my $scope = $dir->new_scope(); +my $lookfor = $ARGV[0] || ''; +foreach my $tinfo ( $dir->traditionlist() ) { + next unless $tinfo->{'name'} =~ /$lookfor/ || $tinfo->{'id'} eq $lookfor; + my $tradition = $dir->lookup( $tinfo->{'id'} ); + my $c = $tradition->collation; + + my $represented_by = {}; + my $representative = {}; + # For each set of ranked relationships, make all the implied links + # explicit. Start with orthographic readings + push_rel_type( $c, 'orthographic', $representative, $represented_by ); + # then move on to spelling readings + push_rel_type( $c, 'spelling', $representative, $represented_by ); + + # Now all orth/spelling linked words are the same word for the purposes of + # other colocated links, and in our representation hashes. + # Go through the other relationships and propagate them to all words that are + # the same word. + foreach my $rel ( $c->relationships ) { + my $relobj = $c->get_relationship( $rel ); + next unless $relobj->type =~ /^(grammatical|lexical)$/; + my $r1pool = $represented_by->{$representative->{$rel->[0]}}; + my $r2pool = $represented_by->{$representative->{$rel->[1]}}; + # Error check + if( check_distinct( $r1pool, $r2pool ) ) { + map { propagate_rel( $c, $relobj->type, $_, @$r2pool ) } @$r1pool; + } else { + warn "Pools not distinct for " . join( ' and ', @$rel ); + } + } + $dir->save( $tradition ) unless $testrun; +} + +sub propagate_rel { + my( $c, $type, @list ) = @_; + my $curr = shift @list; + while( @list ) { + foreach my $r ( @list ) { + next if $curr eq $r; + my $hasrel = $c->get_relationship( $curr, $r ); + if( !$hasrel || $hasrel->type eq 'collated' ) { + say STDERR "Propagating $type relationship $curr -> $r"; + $c->add_relationship( $curr, $r, { type => $type } ); + } elsif( $hasrel->type ne $type ) { + warn "Found relationship conflict at $curr / $r: " + . $hasrel->type . " instead of $type" + unless( $TYPEVALUES{$hasrel->type} < $TYPEVALUES{$type} ); + } + } + $curr = shift @list; + } +} + +sub push_rel_type { + my( $c, $type, $r2rep, $rep2r ) = @_; + my %handled; + foreach my $rdg ( $c->readings ) { + next if $rdg->is_meta; + next if $handled{"$rdg"}; + if( exists $r2rep->{"$rdg"} ) { + $rdg = $r2rep->{"$rdg"}; + } + # Get the specified relationships + my @set = $rdg->related_readings( sub { + $_[0]->colocated && ( $_[0]->type eq $type || + $TYPEVALUES{$_[0]->type} < $TYPEVALUES{$type} ) } ); + push( @set, $rdg ); + propagate_rel( $c, $type, @set ) if @set > 2; + # Set up the representatives + map { $r2rep->{"$_"} = $rdg } @set; + $rep2r->{"$rdg"} = \@set; + map { $handled{"$_"} = 1 } @set; + } +} + +sub check_distinct { + my( $l1, $l2 ) = @_; + my %seen; + map { $seen{"$_"} = 1 } @$l1; + map { return 0 if $seen{"$_"} } @$l2; + return 1; +} + diff --git a/script/save_to_db.pl b/script/save_to_db.pl deleted file mode 100755 index 2d734b3..0000000 --- a/script/save_to_db.pl +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env perl - -use lib 'lib'; -use strict; -use warnings; -use File::Basename; -use Getopt::Long; -use Text::Tradition; -use Text::Tradition::Directory; - -binmode( STDOUT, ':utf8' ); -binmode( STDERR, ':utf8' ); - -my( $tfile, $format, $sfile, $delete, $list, $dsn ) = - ( undef, 'Self', undef, undef, 0, 'dbi:SQLite:dbname=db/traditions.db' ); - -GetOptions( - 't|tradition=s' => \$tfile, - 'f|format=s' => \$format, - 's|stemma=s' => \$sfile, - 'l|list' => \$list, - 'd|delete=s' => \$delete, - 'dsn=s' => \$dsn, - ); - -# Make a KiokuDB store from the traditions data we have. - -my $kdb = Text::Tradition::Directory->new( - 'dsn' => $dsn, - 'extra_args' => { 'create' => 1 }, - ); - -unless( $tfile || $delete || $list ) { - print STDERR "Please specify a tradition file, an ID to delete, or the --list option\n"; - exit; -} - -if( $tfile && $delete ) { - print STDERR "Specify deletion by UUID, not by tradition file\n"; - exit; -} - -my( $tradition, $stemma ); -if( $tfile ) { - print STDERR "Reading tradition from $tfile\n"; - $tradition = Text::Tradition->new( - 'input' => $format, - 'file' => $tfile, - 'linear' => 1, - ); - if( $tradition && $sfile ) { - $stemma = $tradition->add_stemma( dotfile => $sfile ); - warn "Did not get stemma from $sfile\n" unless $stemma; - } - - my $scope = $kdb->new_scope(); - my $tid = $kdb->save( $tradition ); - print STDERR "Stored tradition for " . $tradition->name . " at $tid\n"; - print STDERR "...and associated stemma from $sfile\n" if $stemma; -} - -if( $delete ) { - my $scope = $kdb->new_scope(); - if( $kdb->exists( $delete ) ) { - $kdb->delete( $delete ); - } else { - print STDERR "Object $delete does not appear to be a Text::Tradition in the DB\n"; - } -} - -# Now try reading the objects from the DB. -if( $list ) { - foreach my $tref ( $kdb->traditionlist ) { - my $tid = $tref->{'id'}; - my $scope = $kdb->new_scope(); - my $t = $kdb->tradition( $tid ); - print STDERR "$tid: Tradition '" . $t->name . "'\n"; - my @wits = map { $_->sigil } $t->witnesses; - print STDERR "...with witnesses @wits\n"; - my $c = $t->collation; - print STDERR "...collation has " . scalar( $c->readings ) . " readings\n"; - print STDERR "...collation has " . scalar( $c->paths ) . " paths\n"; - print STDERR "...collation has " . scalar( $c->relationships ) . " relationship links\n"; - foreach my $s ( $t->stemmata ) { - print STDERR "...associated stemma has graph " . $s->graph . "\n"; - } - } -} \ No newline at end of file diff --git a/script/svg_from_csv.pl b/script/svg_from_csv.pl deleted file mode 100644 index 9b72e7a..0000000 --- a/script/svg_from_csv.pl +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/perl - -use lib 'lib'; -use strict; -use warnings; -use Text::Tradition; - -# First: read the base. Make a graph, but also note which -# nodes represent line beginnings. - -my $tradition = Text::Tradition->new( - 'CSV' => $ARGV[0], - 'base' => $ARGV[1], - 'linear' => 0, - ); - - -print $tradition->collation->as_svg(); -print STDERR "DONE\n"; -__END__ -my $rows = 0; -my $matrix = []; -foreach my $pos ( $collation_graph->{'positions'}->all ) { - my @p_nodes = $collation_graph->{'positions'}->nodes_at_position( $pos ); - $rows = scalar @p_nodes - if $rows < scalar @p_nodes; - push( @$matrix, \@p_nodes ); -} -print "A table\n"; -foreach my $i ( 0 .. $rows-1 ) { - print "\t\n"; - foreach my $col( @$matrix ) { - my $str = ''; - if( $col->[$i] ) { - $str = $collation_graph->node( $col->[$i] )->label; - } - printf( "\t\t\n", $str ); - } - print "\t\n"; -} -print "
%s
\n"; diff --git a/script/svg_from_graphml.pl b/script/svg_from_graphml.pl deleted file mode 100644 index 2555fa7..0000000 --- a/script/svg_from_graphml.pl +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/perl - -use lib 'lib'; -use strict; -use warnings; -use Text::Tradition; - -# First: read the base. Make a graph, but also note which -# nodes represent line beginnings. -my $type = 'CollateX'; # either Self or CollateX - -open( GRAPH, $ARGV[0] ) or die "Could not read file $ARGV[0]"; -my @lines = ; -close GRAPH; -my $graphml_str = join( '', @lines ); - -my $tradition = Text::Tradition->new( - $type => $graphml_str, - 'linear' => 1, - ); - -print $tradition->collation->as_svg(); -print STDERR "DONE\n"; diff --git a/stemmaweb/Makefile.PL b/stemmaweb/Makefile.PL index 7c833af..40572d0 100644 --- a/stemmaweb/Makefile.PL +++ b/stemmaweb/Makefile.PL @@ -24,7 +24,7 @@ requires 'Catalyst::Plugin::Authentication'; requires 'Catalyst::Plugin::Session'; requires 'Catalyst::Plugin::Session::Store::File'; requires 'Catalyst::Plugin::Session::State::Cookie'; -requires 'CatalystX::Controller::Auth'; +requires 'CatalystX::Controller::Auth' => '0.22'; requires 'Catalyst::TraitFor::Controller::reCAPTCHA'; requires 'LWP::Protocol::https'; ## diff --git a/stemmaweb/lib/stemmaweb.pm b/stemmaweb/lib/stemmaweb.pm index 4563222..ba6ea4a 100644 --- a/stemmaweb/lib/stemmaweb.pm +++ b/stemmaweb/lib/stemmaweb.pm @@ -99,9 +99,9 @@ __PACKAGE__->config( model => 'User', login_id_field => 'username', login_db_field => 'username', - action_after_login => '/index', - action_after_register => '/index', - register_email_from => '"MyApp" ', + action_after_login => '/users/success', + action_after_register => '/users/success', + register_email_from => '"Stemmaweb" ', register_email_subject => 'Registration to stemmaweb', register_email_template_plain => 'register-plain.tt', realm => 'default', @@ -114,8 +114,8 @@ __PACKAGE__->config( }, recaptcha => { - pub_key => '', - priv_key => '', + pub_key => '6LfR19MSAAAAACy2meHvLfZGRn3PM2rRYIAfh665', + priv_key => '6LfR19MSAAAAAMlQb8BdyecWNRE1bAL2YSgz2sah', }, ); diff --git a/stemmaweb/lib/stemmaweb/Controller/Relation.pm b/stemmaweb/lib/stemmaweb/Controller/Relation.pm index 0082624..0842124 100644 --- a/stemmaweb/lib/stemmaweb/Controller/Relation.pm +++ b/stemmaweb/lib/stemmaweb/Controller/Relation.pm @@ -1,5 +1,6 @@ package stemmaweb::Controller::Relation; use Moose; +use Module::Load; use namespace::autoclean; use TryCatch; @@ -29,19 +30,6 @@ sub index :Path :Args(0) { $c->stash->{'template'} = 'relate.tt'; } -=head2 help - - GET relation/help - -Returns the help window HTML. - -=cut - -sub help :Local :Args(0) { - my( $self, $c ) = @_; - $c->stash->{'template'} = 'relatehelp.tt'; -} - =head2 definitions GET relation/definitions @@ -70,6 +58,12 @@ sub text :Chained('/') :PathPart('relation') :CaptureArgs(1) { my( $self, $c, $textid ) = @_; # If the tradition has more than 500 ranks or so, split it up. my $tradition = $c->model('Directory')->tradition( $textid ); + # Account for a bad interaction between FastCGI and KiokuDB + unless( $tradition->collation->tradition ) { + $c->log->warn( "Fixing broken tradition link" ); + $tradition->collation->_set_tradition( $tradition ); + $c->model('Directory')->save( $tradition ); + } # See how big the tradition is. Edges are more important than nodes # when it comes to rendering difficulty. my $numnodes = scalar $tradition->collation->readings; @@ -100,7 +94,6 @@ sub text :Chained('/') :PathPart('relation') :CaptureArgs(1) { push( @{$c->stash->{'textsegments'}}, $seg ); } } - $DB::single = 1; $c->stash->{'textid'} = $textid; $c->stash->{'tradition'} = $tradition; } @@ -129,9 +122,37 @@ sub main :Chained('text') :PathPart('') :Args(0) { $c->stash->{'startseg'} = $startseg if defined $startseg; $c->stash->{'svg_string'} = $svg_str; $c->stash->{'text_title'} = $tradition->name; + $c->stash->{'text_lang'} = $tradition->language; $c->stash->{'template'} = 'relate.tt'; } +=head2 help + + GET relation/help/$language + +Returns the help window HTML. + +=cut + +sub help :Local :Args(1) { + my( $self, $c, $lang ) = @_; + # Display the morphological help for the language if it is defined. + if( $lang && $lang ne 'Default' ) { + my $mod = 'Text::Tradition::Language::' . $lang; + try { + load( $mod ); + } catch { + $c->log->debug("Warning: could not load $mod"); + } + my $has_mod = $mod->can('morphology_tags'); + if( $has_mod ) { + my $tagset = &$has_mod; + $c->stash->{'tagset'} = $tagset; + } + } + $c->stash->{'template'} = 'relatehelp.tt'; +} + =head2 relationships GET relation/$textid/relationships @@ -159,6 +180,7 @@ sub relationships :Chained('text') :PathPart :Args(0) { foreach my $p ( @pairs ) { my $relobj = $collation->relations->get_relationship( @$p ); next if $relobj->type eq 'collated'; # Don't show these + next if $p->[0] eq $p->[1]; # HACK until bugfix my $relhash = { source => $p->[0], target => $p->[1], type => $relobj->type, scope => $relobj->scope }; $relhash->{'note'} = $relobj->annotation if $relobj->has_annotation; @@ -198,8 +220,142 @@ sub relationships :Chained('text') :PathPart :Args(0) { } } $c->forward('View::JSON'); -} - +} + +=head2 readings + + GET relation/$textid/readings + +Returns the list of readings defined for this text along with their metadata. + +=cut + +my %read_write_keys = ( + 'id' => 0, + 'text' => 0, + 'is_meta' => 0, + 'grammar_invalid' => 1, + 'is_nonsense' => 1, + 'normal_form' => 1, +); + +sub _reading_struct { + my( $reading ) = @_; + # Return a JSONable struct of the useful keys. Keys meant to be writable + # have a true value; read-only keys have a false value. + my $struct = {}; + map { $struct->{$_} = $reading->$_ } keys( %read_write_keys ); + # Special case + $struct->{'lexemes'} = [ $reading->lexemes ]; + # Look up any words related via spelling or orthography + my $sameword = sub { + my $t = $_[0]->type; + return $t eq 'spelling' || $t eq 'orthographic'; + }; + my @variants; + foreach my $sr ( $reading->related_readings( $sameword ) ) { + push( @variants, $sr->text ); + } + $struct->{'variants'} = \@variants; + return $struct; +} + +sub readings :Chained('text') :PathPart :Args(0) { + my( $self, $c ) = @_; + my $tradition = delete $c->stash->{'tradition'}; + my $collation = $tradition->collation; + my $m = $c->model('Directory'); + if( $c->request->method eq 'GET' ) { + my $rdginfo = {}; + foreach my $rdg ( $collation->readings ) { + $rdginfo->{$rdg->id} = _reading_struct( $rdg ); + } + $c->stash->{'result'} = $rdginfo; + } + $c->forward('View::JSON'); +} + +=head2 reading + + GET relation/$textid/reading/$id + +Returns the list of readings defined for this text along with their metadata. + + POST relation/$textid/reading/$id { request } + +Alters the reading according to the values in request. Returns 403 Forbidden if +the alteration isn't allowed. + +=cut + +sub reading :Chained('text') :PathPart :Args(1) { + my( $self, $c, $reading_id ) = @_; + my $tradition = delete $c->stash->{'tradition'}; + my $collation = $tradition->collation; + my $rdg = $collation->reading( $reading_id ); + my $m = $c->model('Directory'); + if( $c->request->method eq 'GET' ) { + $c->stash->{'result'} = $rdg ? _reading_struct( $rdg ) + : { 'error' => "No reading with ID $reading_id" }; + } elsif ( $c->request->method eq 'POST' ) { + my $errmsg; + # Are we re-lemmatizing? + if( $c->request->param('relemmatize') ) { + my $nf = $c->request->param('normal_form'); + # TODO throw error unless $nf + $rdg->normal_form( $nf ); + # TODO throw error if lemmatization fails + # TODO skip this if normal form hasn't changed + $rdg->lemmatize(); + } else { + # Set all the values that we have for the reading. + # TODO error handling + foreach my $p ( keys %{$c->request->params} ) { + if( $p =~ /^morphology_(\d+)$/ ) { + # Set the form on the correct lexeme + my $morphval = $c->request->param( $p ); + next unless $morphval; + my $midx = $1; + my $lx = $rdg->lexeme( $midx ); + my $strrep = $rdg->language . ' // ' . $morphval; + my $idx = $lx->has_form( $strrep ); + unless( defined $idx ) { + # Make the word form and add it to the lexeme. + try { + $idx = $lx->add_matching_form( $strrep ) - 1; + } catch( Text::Tradition::Error $e ) { + $c->response->status( '403' ); + $errmsg = $e->message; + } catch { + # Something else went wrong, probably a Moose error + $c->response->status( '403' ); + $errmsg = 'Something went wrong with the request'; + } + } + $lx->disambiguate( $idx ) if defined $idx; + } elsif( $read_write_keys{$p} ) { + my $val = _clean_booleans( $rdg, $p, $c->request->param( $p ) ); + $rdg->$p( $val ); + } + } + } + $m->save( $rdg ); + $c->stash->{'result'} = $errmsg ? { 'error' => $errmsg } + : _reading_struct( $rdg ); + + } + $c->forward('View::JSON'); + +} + +sub _clean_booleans { + my( $rdg, $param, $val ) = @_; + if( $rdg->meta->get_attribute( $param )->type_constraint->name eq 'Bool' ) { + $val = 1 if $val eq 'true'; + $val = undef if $val eq 'false'; + } + return $val; +} =head2 end diff --git a/stemmaweb/lib/stemmaweb/Controller/Root.pm b/stemmaweb/lib/stemmaweb/Controller/Root.pm index 6e26299..90f9e8a 100644 --- a/stemmaweb/lib/stemmaweb/Controller/Root.pm +++ b/stemmaweb/lib/stemmaweb/Controller/Root.pm @@ -125,7 +125,7 @@ sub stemma :Local :Args(1) { } $c->stash->{'result'} = $tradition->stemma_count - ? $tradition->stemma(0)->as_svg + ? $tradition->stemma(0)->as_svg( { size => [ 500, 375 ] } ) : ''; $c->forward('View::SVG'); } diff --git a/stemmaweb/lib/stemmaweb/Controller/Stexaminer.pm b/stemmaweb/lib/stemmaweb/Controller/Stexaminer.pm index a0f4fcb..4b8bff6 100644 --- a/stemmaweb/lib/stemmaweb/Controller/Stexaminer.pm +++ b/stemmaweb/lib/stemmaweb/Controller/Stexaminer.pm @@ -1,9 +1,12 @@ package stemmaweb::Controller::Stexaminer; use Moose; use namespace::autoclean; +use Encode qw/ decode_utf8 /; use File::Temp; use JSON; use Text::Tradition::Analysis qw/ run_analysis wit_stringify /; +use Text::Tradition::Collation; +use Text::Tradition::Stemma; BEGIN { extends 'Catalyst::Controller' } @@ -18,12 +21,12 @@ The stemma analysis tool with the pretty colored table. =head1 METHODS +=head2 index + GET stexaminer/$textid Renders the application for the text identified by $textid. -=head2 index - =cut sub index :Path :Args(1) { @@ -32,12 +35,29 @@ sub index :Path :Args(1) { my $tradition = $m->tradition( $textid ); if( $tradition->stemma_count ) { my $stemma = $tradition->stemma(0); - # TODO Think about caching the stemma in a session - $c->stash->{svg} = $stemma->as_svg; + $c->stash->{svg} = $stemma->as_svg( { size => [ 600, 350 ] } ); + $c->stash->{graphdot} = $stemma->editable({ linesep => ' ' }); $c->stash->{text_title} = $tradition->name; $c->stash->{template} = 'stexaminer.tt'; + + # Get the analysis options + my( $use_type1, $ignore_sort ) = ( 0, 'none' ); + if( $c->req->method eq 'POST' ) { + $use_type1 = $c->req->param( 'show_type1' ) ? 1 : 0; + $ignore_sort = $c->req->param( 'ignore_variant' ); + } + $c->stash->{'show_type1'} = $use_type1; + $c->stash->{'ignore_variant'} = $ignore_sort; # TODO Run the analysis as AJAX from the loaded page. - my $t = run_analysis( $tradition, 'exclude_type1' => 1 ); + my %analysis_options; + $analysis_options{'exclude_type1'} = !$use_type1; + if( $ignore_sort eq 'spelling' ) { + $analysis_options{'merge_types'} = [ qw/ spelling orthographic / ]; + } elsif( $ignore_sort eq 'orthographic' ) { + $analysis_options{'merge_types'} = 'orthographic'; + } + + my $t = run_analysis( $tradition, %analysis_options ); # Stringify the reading groups foreach my $loc ( @{$t->{'variants'}} ) { my $mst = wit_stringify( $loc->{'missing'} ); @@ -45,6 +65,11 @@ sub index :Path :Args(1) { foreach my $rhash ( @{$loc->{'readings'}} ) { my $gst = wit_stringify( $rhash->{'group'} ); $rhash->{'group'} = $gst; + my $roots = join( ', ', @{$rhash->{'independent_occurrence'}} ); + $rhash->{'independent_occurrence'} = $roots; + unless( $rhash->{'text'} ) { + $rhash->{'text'} = $rhash->{'readingid'}; + } } } # Values for TT rendering @@ -60,6 +85,34 @@ sub index :Path :Args(1) { } } +=head2 graphsvg + + POST stexaminer/graphsvg + dot: + layerwits: [ request->param('dot'); + my @layerwits = $c->request->param('layerwits[]'); + open my $stemma_fh, '<', \$dot; + binmode( $stemma_fh, ':encoding(UTF-8)' ); + my $emptycoll = Text::Tradition::Collation->new(); + my $tempstemma = Text::Tradition::Stemma->new( + collation => $emptycoll, 'dot' => $stemma_fh ); + my $svgopts = { size => [ 600, 350 ] }; + if( @layerwits ) { + $svgopts->{'layerwits'} = \@layerwits; + } + $c->stash->{'result'} = $tempstemma->as_svg( $svgopts ); + $c->forward('View::SVG'); +} + =head2 end Attempt to render a view, if needed. diff --git a/stemmaweb/lib/stemmaweb/Controller/Users.pm b/stemmaweb/lib/stemmaweb/Controller/Users.pm index 8a5c6ac..32f629f 100644 --- a/stemmaweb/lib/stemmaweb/Controller/Users.pm +++ b/stemmaweb/lib/stemmaweb/Controller/Users.pm @@ -47,7 +47,7 @@ sub index :Path :Args(0) { Logging in with openid/google requires two passes through the login action, on the 2nd pass the C value is passed in when the openid providing webserver links the user back to the stemmaweb -site. This adaption to the C action sets the realm we are +site. This adaptation to the C action sets the realm we are authenticating against to be C in this case. =cut @@ -86,6 +86,31 @@ before register => sub { } }; +=head2 success + +A stub page returned on login / registration success. + +=cut + +sub success :Local :Args(0) { + my ( $self, $c ) = @_; + + $c->load_status_msgs; + $c->stash->{template} = 'auth/success.tt'; +} + +=head2 post_logout + +Return to the index page, not to the login page. + +=cut + +sub post_logout { + my( $self, $c ) = @_; + $c->response->redirect( $c->uri_for_action( '/index' ) ); + $c->detach; +} + =head1 AUTHOR A clever guy diff --git a/stemmaweb/lib/stemmaweb/View/JSON.pm b/stemmaweb/lib/stemmaweb/View/JSON.pm index e8a9284..347c034 100644 --- a/stemmaweb/lib/stemmaweb/View/JSON.pm +++ b/stemmaweb/lib/stemmaweb/View/JSON.pm @@ -3,6 +3,16 @@ package stemmaweb::View::JSON; use strict; use base 'Catalyst::View::JSON'; +use JSON::XS (); + +sub encode_json { + my( $self, $c, $data ) = @_; + my $json = JSON::XS->new->utf8->convert_blessed(1); + $json->encode( $data ); +} + +1; + =head1 NAME stemmaweb::View::JSON - Catalyst JSON View @@ -23,7 +33,3 @@ Tara Andrews This library is free software, you can redistribute it and/or modify it under the same terms as Perl itself. - -=cut - -1; diff --git a/stemmaweb/root/css/auth.css b/stemmaweb/root/css/auth.css new file mode 100644 index 0000000..0717b5f --- /dev/null +++ b/stemmaweb/root/css/auth.css @@ -0,0 +1,8 @@ +#topbanner { + width: 100%; + height: 50px; + margin-top: 20px; +} +.error { + color: #d24848; +} diff --git a/stemmaweb/root/css/relationship.css b/stemmaweb/root/css/relationship.css index 9b778b6..d64acab 100644 --- a/stemmaweb/root/css/relationship.css +++ b/stemmaweb/root/css/relationship.css @@ -200,3 +200,34 @@ span.apimore { list-style: none; margin-bottom: 3px; } + +.draggable { + cursor:pointer; +} + +.noselect { +-webkit-touch-callout: none; +-webkit-user-select: none; +-khtml-user-select: none; +-moz-user-select: none; +-ms-user-select: none; +user-select: none; +} + +#normalization { + float: left; + padding: 10px; +} +#relemmatize_pending { + float: left; + padding: 10px; + display: none; +} +#morph_outer { + clear: both; + float: left; +} +#morphology { + text-align: right; + margin: 10px; +} diff --git a/stemmaweb/root/css/stexaminer.css b/stemmaweb/root/css/stexaminer.css index 59c523d..703dfc1 100644 --- a/stemmaweb/root/css/stexaminer.css +++ b/stemmaweb/root/css/stexaminer.css @@ -1,16 +1,26 @@ +#topbanner { + width: 100%; + height: 100px; + margin-top: 20px; +} +#bannerinfo { + float: right; + margin-right: 12%; + margin-top: 15px; +} #variants_table { - float: left; + clear: both; width: 90%; - height: 90px; + height: 190px; border: 1px #c6dcf1 solid; margin-bottom: 20px; overflow: auto; } #stemma_graph { - height: 450px; + height: 350px; clear: both; float: left; - width: 700px; + width: 600px; text-align: center; border: 1px #c6dcf1 solid; } @@ -24,7 +34,24 @@ position: relative; top: -15px; } -#stats_template { +.reading_statistics { + margin: 7pt; + border-bottom: 1px solid #488dd2; +} +.readinglabel { + font-weight: bold; +} +.readingroots { + font-weight: bold; + color: #488dd2; +} +.reading_copied { + color: #33dd33; +} +.reading_changed { + color: #dd3333; +} +.template { display: none; } .genealogical { @@ -67,3 +94,25 @@ .cellb7 { border-right: 20px solid #ffd5e5; } + +/* Clearfix hack to make div container height work */ +.clearfix:after { + content: "."; + display: block; + clear: both; + visibility: hidden; + line-height: 0; + height: 0; +} + +.clearfix { + display: inline-block; +} + +html[xmlns] .clearfix { + display: block; +} + +* html .clearfix { + height: 1%; +} \ No newline at end of file diff --git a/stemmaweb/root/css/style.css b/stemmaweb/root/css/style.css index 264e6de..4aaa2b9 100644 --- a/stemmaweb/root/css/style.css +++ b/stemmaweb/root/css/style.css @@ -58,6 +58,22 @@ div.button:hover span { background-position: bottom left; } +#topbanner { + width: 100%; + height: 100px; + margin-top: 20px; +} +#bannerinfo { + float: right; + margin-right: 12%; + margin-top: 15px; +} +.navlink { + color: #488dd2; + text-decoration: underline; +} + + /* Index page components */ @@ -121,10 +137,9 @@ div.button:hover span { } #stexaminer_button { bottom: 0; - margin-top: 5px; + margin-top: 13px; } #relater_button { float: left; - margin-left: 83px; - margin-top: 7px; + margin-left: 100px; } diff --git a/stemmaweb/root/js/componentload.js b/stemmaweb/root/js/componentload.js index e881f84..87cc0ef 100644 --- a/stemmaweb/root/js/componentload.js +++ b/stemmaweb/root/js/componentload.js @@ -6,21 +6,11 @@ function loadTradition( textid, textname ) { }; var imghtml = 'Loading SVG...' $('#stemma_graph').empty(); - $('#variant_graph').empty(); $('#stemma_graph').append( imghtml ); - $('#variant_graph').append( imghtml ); // Then get and load the actual content. // TODO: scale #stemma_grpah both horizontally and vertically // TODO: load svgs from SVG.Jquery (to make scaling react in Safari) $('#stemma_graph').load( basepath + "/stemma/" + textid ); - $('#variant_graph').load( basepath + "/variantgraph/" + textid , function() { - var variant_svg_element = $('#variant_graph svg').svg().svg('get').root(); - var svg_height = variant_svg_element.height.baseVal.value; - var svg_width = variant_svg_element.width.baseVal.value; - var container_height = $('#variant_graph').height(); - variant_svg_element.height.baseVal.value = container_height; - variant_svg_element.width.baseVal.value = (svg_width/svg_height * container_height); - }); // Then populate the various elements with the right text name/ID. // Stemma and variant graph titles diff --git a/stemmaweb/root/js/relationship.js b/stemmaweb/root/js/relationship.js index ed1b150..cd3d867 100644 --- a/stemmaweb/root/js/relationship.js +++ b/stemmaweb/root/js/relationship.js @@ -1,5 +1,12 @@ +var MARGIN=30; +var svg_root = null; +var svg_root_element = null; +var start_element_height = 0; +var reltypes = {}; +var readingdata = {}; + function getTextPath() { - var currpath = window.location.pathname + var currpath = window.location.pathname; // Get rid of trailing slash if( currpath.lastIndexOf('/') == currpath.length - 1 ) { currpath = currpath.slice( 0, currpath.length - 1) @@ -20,11 +27,147 @@ function getRelativePath() { return path_parts[0]; } -function getRelationshipURL() { +function getTextURL( which ) { + var path_parts = getTextPath(); + return path_parts[0] + '/' + path_parts[1] + '/' + which; +} + +function getReadingURL( reading_id ) { var path_parts = getTextPath(); - return path_parts[0] + '/' + path_parts[1] + '/relationships'; + return path_parts[0] + '/' + path_parts[1] + '/reading/' + reading_id; +} + +// Make an XML ID into a valid selector +function jq(myid) { + return '#' + myid.replace(/(:|\.)/g,'\\$1'); +} + +// Actions for opening the reading panel +function node_dblclick_listener( evt ) { + // Open the reading dialogue for the given node. + // First get the reading info + var reading_id = $(this).attr('id'); + var reading_info = readingdata[reading_id]; + // and then populate the dialog box with it. + // Set the easy properties first + $('#reading-form').dialog( 'option', 'title', 'Reading information for "' + reading_info['text'] + '"' ); + $('#reading_id').val( reading_id ); + toggle_checkbox( $('#reading_is_nonsense'), reading_info['is_nonsense'] ); + toggle_checkbox( $('#reading_grammar_invalid'), reading_info['grammar_invalid'] ); + // Use .text as a backup for .normal_form + var normal_form = reading_info['normal_form']; + if( !normal_form ) { + normal_form = reading_info['text']; + } + var nfboxsize = 10; + if( normal_form.length > 9 ) { + nfboxsize = normal_form.length + 1; + } + $('#reading_normal_form').attr( 'size', nfboxsize ) + $('#reading_normal_form').val( normal_form ); + // Now do the morphological properties. + morphology_form( reading_info['lexemes'] ); + // and then open the dialog. + $('#reading-form').dialog("open"); +} + +function toggle_checkbox( box, value ) { + if( value == null ) { + value = false; + } + box.attr('checked', value ); +} + +function morphology_form ( lexlist ) { + $('#morphology').empty(); + $.each( lexlist, function( idx, lex ) { + var morphoptions = []; + if( 'wordform_matchlist' in lex ) { + $.each( lex['wordform_matchlist'], function( tdx, tag ) { + var tagstr = stringify_wordform( tag ); + morphoptions.push( tagstr ); + }); + } + var formtag = 'morphology_' + idx; + var formstr = ''; + if( 'form' in lex ) { + formstr = stringify_wordform( lex['form'] ); + } + var form_morph_elements = morph_elements( + formtag, lex['string'], formstr, morphoptions ); + $.each( form_morph_elements, function( idx, el ) { + $('#morphology').append( el ); + }); + }); +} + +function stringify_wordform ( tag ) { + if( tag ) { + var elements = tag.split(' // '); + return elements[1] + ' // ' + elements[2]; + } + return '' +} + +function morph_elements ( formtag, formtxt, currform, morphoptions ) { + var clicktag = '(Click to select)'; + if ( !currform ) { + currform = clicktag; + } + var formlabel = $('