From: tla Date: Tue, 17 May 2011 14:12:16 +0000 (+0200) Subject: some more rehoming of functionality X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=4a8828f0ac801bd0a198cdaf74a7b6769db0ec6b;p=scpubgit%2Fstemmatology.git some more rehoming of functionality --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 695f4e2..92673be 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -1,5 +1,6 @@ package Text::Tradition; +use Module::Load; use Moose; use Text::Tradition::Collation; use Text::Tradition::Witness; @@ -15,12 +16,10 @@ has 'witnesses' => ( is => 'rw', isa => 'ArrayRef[Text::Tradition::Witness]', handles => { - all_options => 'elements', - add_option => 'push', - map_options => 'map', - option_count => 'count', - sorted_options => 'sort', + all => 'elements', + add => 'push', }, + default => sub { [] }, ); sub BUILD { @@ -50,13 +49,51 @@ sub BUILD { # TODO Now how to collate these? } } else { - # Else we got passed args intended for the collator. - $init_args->{'tradition'} = $self; - $self->_save_collation( Text::Tradition::Collation->new( %$init_args ) ); - $self->witnesses( $self->collation->create_witnesses() ); + # Else we need to parse some collation data. Make a Collation object + my $collation = Text::Tradition::Collation->new( %$init_args, + 'tradition' => $self ); + $self->_save_collation( $collation ); + + # Call the appropriate parser on the given data + my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %$init_args ); + my $format = shift( @formats ); + unless( $format ) { + warn "No data given to create a collation; will initialize an empty one"; + } + if( $format && $format =~ /^(CSV|CTE)$/ && + !exists $init_args->{'base'} ) { + warn "Cannot make a collation from $format without a base text"; + return; + } + + # Starting point for all texts + my $last_node = $collation->add_reading( '#START#' ); + + # Now do the parsing. + my @sigla; + if( $format ) { + my @parseargs; + if( $format =~ /^(CSV|CTE)$/ ) { + @parseargs = ( 'base' => $init_args->{'base'}, + 'data' => $init_args->{$format}, + 'format' => $format ); + $format = 'BaseText'; + } else { + @parseargs = ( $init_args->{ $format } ); + } + my $mod = "Text::Tradition::Parser::$format"; + load( $mod ); + $mod->can('parse')->( $self, @parseargs ); + } } } +sub add_witness { + my $self = shift; + my $new_wit = Text::Tradition::Witness->new( @_ ); + push( @{$self->witnesses}, $new_wit ); +} + # The user will usually be instantiating a Tradition object, and # examining its collation. The information about the tradition can # come via several routes: diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 106ab55..fd5bda5 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -2,7 +2,6 @@ package Text::Tradition::Collation; use Graph::Easy; use IPC::Run qw( run binary ); -use Module::Load; use Text::Tradition::Collation::Reading; use Moose; @@ -49,6 +48,12 @@ has 'graphml' => ( predicate => 'has_graphml', ); +has 'wit_list_separator' => ( + is => 'rw', + isa => 'Str', + default => ', ', + ); + # The collation can be created two ways: # 1. Collate a set of witnesses (with CollateX I guess) and process # the results as in 2. @@ -66,51 +71,11 @@ has 'graphml' => ( sub BUILD { my( $self, $args ) = @_; - - # Call the appropriate parser on the given data - my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %$args ); - my $format = shift( @formats ); - unless( $format ) { - warn "No data given to create a graph; will initialize an empty one"; - } - if( $format && $format =~ /^(CSV|CTE)$/ && !exists $args->{'base'} ) { - warn "Cannot make a graph from $format without a base text"; - return; - } - - # Initialize our graph object. $self->graph->use_class('node', 'Text::Tradition::Collation::Reading'); - $self->graph->set_attribute( 'node', 'shape', 'ellipse' ); - # Starting point for all texts - my $last_node = $self->add_reading( '#START#' ); - - # Now do the parsing. - my @sigla; - if( $format ) { - my @parseargs; - if( $format =~ /^(CSV|CTE)$/ ) { - @parseargs = ( 'base' => $args->{'base'}, - 'data' => $args->{$format}, - 'format' => $format ); - $format = 'BaseText'; - } else { - @parseargs = ( $args->{ $format } ); - } - my $mod = "Text::Tradition::Parser::$format"; - load( $mod ); - # TODO parse needs to return witness IDs - @sigla = $mod->can('parse')->( $self, @parseargs ); - } - # Do we need to initialize the witnesses? - unless( $args->{'have_witnesses'} ) { - # initialize Witness objects for all our witnesses - my @witnesses; - foreach my $sigil ( @sigla ) { - push( @witnesses, Text::Tradition::Witness->new( 'sigil' => $sigil ) ); - } - $self->tradition->witnesses( \@witnesses ); - } + # Pass through any graph-specific options. + my $shape = exists( $args->{'shape'} ) ? $args->{'shape'} : 'ellipse'; + $self->graph->set_attribute( 'node', 'shape', $shape ); } # Wrappers around some methods @@ -133,7 +98,7 @@ sub merge_readings { print $graph->as_svg( $recalculate ); Returns an SVG string that represents the graph. Uses GraphViz to do -this, because Graph::Easy doesn't cope well with long graphs. Unless +this, because Graph::Easy doesn\'t cope well with long graphs. Unless $recalculate is passed (and is a true value), the method will return a cached copy of the SVG after the first call to the method. @@ -268,7 +233,7 @@ Returns the beginning of the collation, a meta-reading with label '#START#'. =cut sub start { - # Return the beginning node of the graph. + # Return the beginning reading of the graph. my $self = shift; my( $new_start ) = @_; if( $new_start ) { @@ -278,37 +243,37 @@ sub start { return $self->reading('#START#'); } -=item B +=item B -my $next_node = $graph->next_word( $node, $path ); +my $next_reading = $graph->next_reading( $reading, $witpath ); -Returns the node that follows the given node along the given witness +Returns the reading that follows the given reading along the given witness path. TODO These are badly named. =cut -sub next_word { +sub next_reading { # Return the successor via the corresponding edge. my $self = shift; - return $self->_find_linked_word( 'next', @_ ); + return $self->_find_linked_reading( 'next', @_ ); } -=item B +=item B -my $prior_node = $graph->prior_word( $node, $path ); +my $prior_reading = $graph->prior_reading( $reading, $witpath ); -Returns the node that precedes the given node along the given witness +Returns the reading that precedes the given reading along the given witness path. TODO These are badly named. =cut -sub prior_word { +sub prior_reading { # Return the predecessor via the corresponding edge. my $self = shift; - return $self->_find_linked_word( 'prior', @_ ); + return $self->_find_linked_reading( 'prior', @_ ); } -sub _find_linked_word { +sub _find_linked_reading { my( $self, $direction, $node, $edge ) = @_; $edge = 'base text' unless $edge; my @linked_edges = $direction eq 'next' @@ -317,9 +282,9 @@ sub _find_linked_word { # We have to find the linked edge that contains all of the # witnesses supplied in $edge. - my @edge_wits = split( /, /, $edge ); + my @edge_wits = $self->witnesses_of_label( $edge ); foreach my $le ( @linked_edges ) { - my @le_wits = split( /, /, $le->name() ); + my @le_wits = $self->witnesses_of_label( $le->name ); if( _is_within( \@edge_wits, \@le_wits ) ) { # This is the right edge. return $direction eq 'next' ? $le->to() : $le->from(); @@ -330,11 +295,152 @@ sub _find_linked_word { return undef; } -sub create_witnesses { - # TODO Given a new collation, make a bunch of Witness objects. +# Some set logic. +sub _is_within { + my( $set1, $set2 ) = @_; + my $ret = 1; + foreach my $el ( @$set1 ) { + $ret = 0 unless grep { /^\Q$el\E$/ } @$set2; + } + return $ret; +} + +# Walk the paths for each witness in the graph, and return the nodes +# that the graph has in common. + +sub walk_witness_paths { + my( $self, $end ) = @_; + # For each witness, walk the path through the graph. + # Then we need to find the common nodes. + # TODO This method is going to fall down if we have a very gappy + # text in the collation. + my $paths = {}; + my @common_nodes; + foreach my $wit ( @{$self->tradition->witnesses} ) { + my $curr_reading = $self->start; + my @wit_path = ( $curr_reading ); + my %seen_readings; + # TODO Detect loops at some point + while( $curr_reading->name ne $end->name ) { + if( $seen_readings{$curr_reading->name} ) { + warn "Detected loop walking path for witness " . $wit->sigil + . " at reading " . $curr_reading->name; + last; + } + my $next_reading = $self->next_reading( $curr_reading, + $wit->sigil ); + push( @wit_path, $next_reading ); + $seen_readings{$curr_reading->name} = 1; + $curr_reading = $next_reading; + } + $wit->path( \@wit_path ); + if( @common_nodes ) { + my @cn; + foreach my $n ( @wit_path ) { + push( @cn, $n ) if grep { $_ eq $n } @common_nodes; + } + @common_nodes = (); + push( @common_nodes, @cn ); + } else { + push( @common_nodes, @wit_path ); + } + } + + # Mark all the nodes as either common or not. + foreach my $cn ( @common_nodes ) { + print STDERR "Setting " . $cn->name . " as common node\n"; + $cn->make_common; + } + foreach my $n ( $self->readings() ) { + $n->make_variant unless $n->is_common; + } +} + +sub common_readings { + my $self = shift; + my @common = grep { $_->is_common } $self->readings(); + return @common; +} + +# Calculate the relative positions of nodes in the graph, if they +# were not given to us. +sub calculate_positions { + my $self = shift; + + # We have to calculate the position identifiers for each word, + # keyed on the common nodes. This will be 'fun'. The end result + # is a hash per witness, whose key is the word node and whose + # value is its position in the text. Common nodes are always N,1 + # so have identical positions in each text. + my @common = $self->common_readings(); + + my $node_pos = {}; + foreach my $wit ( @{$self->tradition->witnesses} ) { + # First we walk each path, making a matrix for each witness that + # corresponds to its eventual position identifier. Common nodes + # always start a new row, and are thus always in the first column. + + my $wit_matrix = []; + my $cn = 0; # We should hit the common readings in order. + my $row = []; + foreach my $wn ( @{$wit->path} ) { + if( $wn eq $common[$cn] ) { + # Set up to look for the next common node, and + # start a new row of words. + $cn++; + push( @$wit_matrix, $row ) if scalar( @$row ); + $row = []; + } + push( @$row, $wn ); + } + push( @$wit_matrix, $row ); # Push the last row onto the matrix + + # Now we have a matrix per witness, so that each row in the + # matrix begins with a common node, and continues with all the + # variant words that appear in the witness. We turn this into + # real positions in row,cell format. But we need some + # trickery in order to make sure that each node gets assigned + # to only one position. + + foreach my $li ( 1..scalar(@$wit_matrix) ) { + foreach my $di ( 1..scalar(@{$wit_matrix->[$li-1]}) ) { + my $reading = $wit_matrix->[$li-1]->[$di-1]; + my $position = "$li,$di"; + # If we have seen this node before, we need to compare + # its position with what went before. + unless( $reading->has_position && + _cmp_position( $position, $reading->position ) < 1 ) { + # The new position ID replaces the old one. + $reading->position( $position ); + } # otherwise, the old position needs to stay. + } + } + } +} + +sub _cmp_position { + my( $a, $b ) = @_; + my @pos_a = split(/,/, $a ); + my @pos_b = split(/,/, $b ); - return []; + my $big_cmp = $pos_a[0] <=> $pos_b[0]; + return $big_cmp if $big_cmp; + # else + return $pos_a[1] <=> $pos_b[1]; } + +# Return the string that joins together a list of witnesses for +# display on a single path. +sub path_label { + my $self = shift; + return join( $self->wit_list_separator, @_ ); +} + +sub witnesses_of_label { + my $self = shift; + my $regex = $self->wit_list_separator; + return split( /^\Q$regex\E$/, @_ ); +} no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 146eadf..e3d6d6d 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -14,6 +14,7 @@ subtype 'Position' has 'position' => ( is => 'rw', isa => 'Position', + predicate => 'has_position', ); # This contains an array of reading objects; the array is a pool, @@ -132,6 +133,21 @@ sub set_relationship { $self->relationships->{ $rel } = $value; } +sub is_common { + my( $self ) = shift; + return $self->get_attribute( 'class' ) eq 'common'; +} + +sub make_common { + my( $self ) = shift; + $self->set_attribute( 'class', 'common' ); +} + +sub make_variant { + my( $self ) = shift; + $self->set_attribute( 'class', 'variant' ); +} + no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index d731f1d..63e8deb 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -29,29 +29,44 @@ graph. =cut +use vars qw/ $xpc %nodedata /; + sub parse { - my( $collation, $graphml_str ) = @_; + my( $tradition, $graphml_str ) = @_; + my $collation = $tradition->collation; my $parser = XML::LibXML->new(); my $doc = $parser->parse_string( $graphml_str ); my $graphml = $doc->documentElement(); - my $xpc = XML::LibXML::XPathContext->new( $graphml ); + $xpc = XML::LibXML::XPathContext->new( $graphml ); $xpc->registerNs( 'g', 'http://graphml.graphdrawing.org/xmlns' ); # First get the ID keys, for witnesses and for collation data - my %nodedata; my %witnesses; foreach my $k ( $xpc->findnodes( '//g:key' ) ) { # Each key has a 'for' attribute; the edge keys are witnesses, and # the node keys contain an ID and string for each node. if( $k->getAttribute( 'for' ) eq 'node' ) { + # The node data keys we expect are: + # 'number' -> unique node identifier + # 'token' -> reading for the node + # 'identical' -> the node of which this node is + # a transposed version + # 'position' -> a calculated position for the node $nodedata{ $k->getAttribute( 'attr.name' ) } = $k->getAttribute( 'id' ); } else { $witnesses{ $k->getAttribute( 'id' ) } = $k->getAttribute( 'attr.name' ); } } + my $has_explicit_positions = defined $nodedata{'position'}; + + # Add the witnesses that we have found + foreach my $wit ( values %witnesses ) { + $tradition->add_witness( 'sigil' => $wit ); + } + my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0]; # Add the nodes to the graph. First delete the start node, because @@ -63,19 +78,17 @@ sub parse { my $extra_data = {}; my @nodes = $xpc->findnodes( '//g:node' ); foreach my $n ( @nodes ) { - my $lookup_xpath = './g:data[@key="%s"]/child::text()'; - my $id = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'number'} ), $n ); - my $label = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{'token'} ), $n ); + my $id = _lookup_node_data( $n, 'number' ); + my $label = _lookup_node_data( $n, 'token' ); my $gnode = $collation->add_reading( $id ); $node_name{ $n->getAttribute('id') } = $id; $gnode->set_attribute( 'label', $label ); - # Now get the rest of the data + # Now get the rest of the data, i.e. not the ID or label my $extra = {}; - my @keys = grep { $_ !~ /^(number|token)$/ } keys( %nodedata ); - foreach my $k ( @keys ) { - my $data = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{ $k } ), $n ); - $extra->{ $k } = $data; + foreach my $k ( keys %nodedata ) { + next if $k =~ /^(number|token)$/; + $extra->{ $k } = _lookup_node_data( $n, $k ); } $extra_data->{ $id } = $extra; } @@ -88,8 +101,7 @@ sub parse { # Label according to the witnesses present. my @wit_ids = $xpc->findnodes( './g:data/attribute::key', $e ); my @wit_names = map { $witnesses{ $_->getValue() } } @wit_ids; - my $label = join( ', ', @wit_names ); - + my $label = $collation->path_label( @wit_names ); $collation->add_path( $from, $to, $label ); } @@ -107,85 +119,51 @@ sub parse { $node_name{ $xpc->findvalue( $id_xpath, $tn ) } ) ); } - # Find the beginning and end nodes of the graph. The beginning node # has no incoming edges; the end node has no outgoing edges. my( $begin_node, $end_node ); foreach my $gnode ( $collation->readings() ) { - print STDERR "Checking node " . $gnode->name . "\n"; + # print STDERR "Checking node " . $gnode->name . "\n"; my @outgoing = $gnode->outgoing(); my @incoming = $gnode->incoming(); unless( scalar @incoming ) { warn "Already have a beginning node" if $begin_node; - my $node_xml_id = $node_id{ $gnode->name() }; - my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' ); - warn "XPath did not find a node for id $node_xml_id" - unless scalar @bn; - $begin_node = $bn[0]; + $begin_node = $gnode; $collation->start( $gnode ); - $node_name{ $begin_node->getAttribute( 'id' ) } = '#START#'; - $node_id{'#START#'} = $begin_node->getAttribute( 'id' ); } unless( scalar @outgoing ) { warn "Already have an ending node" if $end_node; - my $node_xml_id = $node_id{ $gnode->name() }; - my @bn = $xpc->findnodes( '//g:node[@id="' . $node_xml_id . '"]' ); - warn "XPath did not find a node for id $node_xml_id" - unless scalar @bn; - $end_node = $bn[0]; + $end_node = $gnode; } } - # Now for each witness, walk the path through the graph. - # Then we need to find the common nodes. - # TODO This method is going to fall down if we have a very gappy - # text in the collation. - # TODO think about whether it makes more sense to do this in the - # XML or in the graph. Right now it's the XML. - my $paths = {}; - my @common_nodes; - foreach my $wit ( keys %witnesses ) { - my $node_id = $begin_node->getAttribute('id'); - my @wit_path = ( $node_name{ $node_id } ); - # TODO Detect loops at some point - while( $node_id ne $end_node->getAttribute('id') ) { - # Find the node which is the target of the edge whose - # source is $node_id and applies to this witness. - my $xpath_expr = '//g:edge[child::g:data[@key="' - . $wit . '"] and attribute::source="' - . $node_id . '"]'; - my $next_edge = $xpc->find( $xpath_expr, $graph_el )->[0]; - print STDERR " - at $wit / $node_id\n"; - $node_id = $next_edge->getAttribute('target'); - push( @wit_path, $node_name{ $node_id } ); - } - $paths->{ $witnesses{ $wit }} = \@wit_path; - if( @common_nodes ) { - my @cn; - foreach my $n ( @wit_path) { - push( @cn, $n ) if grep { $_ eq $n } @common_nodes; - } - @common_nodes = (); - push( @common_nodes, @cn ); - } else { - push( @common_nodes, @wit_path ); + $collation->walk_witness_paths( $end_node ); + # Now we have added the witnesses and their paths, so have also + # implicitly marked the common nodes. Now we can calculate their + # explicit permissions. This is separate because it won't always + # be necessary with the GraphML parsing. + $collation->calculate_positions() unless $has_explicit_positions; + if( $has_explicit_positions ) { + # Record the positions that came with each graph node. + # TODO we really need to translate these into our own style of + # position identifier. That's why we defer this until now. + foreach my $node_id ( keys %$extra_data ) { + my $pos = $extra_data->{$node_id}->{'position'}; + $collation->reading( $node_name{$node_id} )->position( $pos ); } + } else { + # Calculate a position for each graph node. + $collation->calculate_positions(); } +} - # Mark all the nodes as either common or not. - foreach my $cn ( @common_nodes ) { - print STDERR "Setting $cn as common node\n"; - $collation->reading( $cn )->set_attribute( 'class', 'common' ); - } - foreach my $n ( $collation->readings() ) { - $n->set_attribute( 'class', 'variant' ) - unless $n->get_attribute( 'class' ) eq 'common'; - } - - # Now calculate graph positions. - # $collation->make_positions( \@common_nodes, $paths ); - +sub _lookup_node_data { + my( $xmlnode, $key ) = @_; + my $lookup_xpath = './g:data[@key="%s"]/child::text()'; + my $data = $xpc->findvalue( sprintf( $lookup_xpath, $nodedata{$key} ), + $xmlnode ); + return $data; } =back diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index 1eccbb0..8480e71 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -24,6 +24,11 @@ has 'source' => ( predicate => 'has_source', ); +has 'path' => ( + is => 'rw', + isa => 'ArrayRef[Text::Tradition::Collation::Reading]', + ); + sub BUILD { my $self = shift; if( $self->has_source ) {