From: Tara L Andrews Date: Wed, 31 Aug 2011 22:38:07 +0000 (+0200) Subject: various things; headline change is reworking of node positions X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=910a0a6d9f858731358772a45e52817b039cf019;p=scpubgit%2Fstemmatology.git various things; headline change is reworking of node positions --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 9f7a334..601a5d6 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -13,11 +13,10 @@ has 'collation' => ( has 'witnesses' => ( traits => ['Array'], - is => 'rw', isa => 'ArrayRef[Text::Tradition::Witness]', handles => { - all => 'elements', - add => 'push', + witnesses => 'elements', + add_witness => 'push', }, default => sub { [] }, ); @@ -27,89 +26,93 @@ has 'name' => ( isa => 'Str', default => 'Tradition', ); + +around 'add_witness' => sub { + my $orig = shift; + my $self = shift; + my $new_wit = Text::Tradition::Witness->new( @_ ); + $self->$orig( $new_wit ); + return $new_wit; +}; + sub BUILD { my( $self, $init_args ) = @_; if( exists $init_args->{'witnesses'} ) { - # We got passed an uncollated list of witnesses. Make a - # witness object for each witness, and then send them to the - # collator. - my $autosigil = 0; - foreach my $wit ( %{$init_args->{'witnesses'}} ) { - # Each item in the list is either a string or an arrayref. - # If it's a string, it is a filename; if it's an arrayref, - # it is a tuple of 'sigil, file'. Handle either case. - my $args; - if( ref( $wit ) eq 'ARRAY' ) { - $args = { 'sigil' => $wit->[0], - 'file' => $wit->[1] }; - } else { - $args = { 'sigil' => chr( $autosigil+65 ), - 'file' => $wit }; - $autosigil++; - } - $self->witnesses->push( Text::Tradition::Witness->new( $args ) ); - # TODO Now how to collate these? - } + # We got passed an uncollated list of witnesses. Make a + # witness object for each witness, and then send them to the + # collator. + my $autosigil = 0; + foreach my $wit ( %{$init_args->{'witnesses'}} ) { + # Each item in the list is either a string or an arrayref. + # If it's a string, it is a filename; if it's an arrayref, + # it is a tuple of 'sigil, file'. Handle either case. + my $args; + if( ref( $wit ) eq 'ARRAY' ) { + $args = { 'sigil' => $wit->[0], + 'file' => $wit->[1] }; + } else { + $args = { 'sigil' => chr( $autosigil+65 ), + 'file' => $wit }; + $autosigil++; + } + $self->witnesses->add_witness( $args ); + # TODO Now how to collate these? + } } else { - # Else we need to parse some collation data. Make a Collation object - my $collation = Text::Tradition::Collation->new( %$init_args, - 'tradition' => $self ); - $self->_save_collation( $collation ); + # Else we need to parse some collation data. Make a Collation object + my $collation = Text::Tradition::Collation->new( %$init_args, + 'tradition' => $self ); + $self->_save_collation( $collation ); - # Call the appropriate parser on the given data - my @formats = grep { /^(Self|CollateX|CSV|CTE|TEI)$/ } keys( %$init_args ); - my $format = shift( @formats ); - unless( $format ) { - warn "No data given to create a collation; will initialize an empty one"; - } - if( $format && $format =~ /^(CSV|CTE)$/ && - !exists $init_args->{'base'} ) { - warn "Cannot make a collation from $format without a base text"; - return; - } + # Call the appropriate parser on the given data + my @formats = grep { /^(Self|CollateX|CSV|CTE|KUL|TEI)$/ } keys( %$init_args ); + my $format = shift( @formats ); + unless( $format ) { + warn "No data given to create a collation; will initialize an empty one"; + } + if( $format && $format =~ /^(KUL|CTE)$/ && + !exists $init_args->{'base'} ) { + warn "Cannot make a collation from $format without a base text"; + return; + } - # Starting point for all texts - my $last_node = $collation->add_reading( '#START#' ); + # Start and end points for all texts + $collation->add_reading( '#START#' ); + $collation->add_reading( '#END#' ); + - # Now do the parsing. - my @sigla; - if( $format ) { - my @parseargs; - if( $format =~ /^(CSV|CTE)$/ ) { - $init_args->{'data'} = $init_args->{$format}; - $init_args->{'format'} = $format; - $format = 'BaseText'; - @parseargs = %$init_args; - } else { - @parseargs = ( $init_args->{ $format } ); - } - my $mod = "Text::Tradition::Parser::$format"; - load( $mod ); - $mod->can('parse')->( $self, @parseargs ); - } + # Now do the parsing. + my @sigla; + if( $format ) { + my @parseargs; + if( $format =~ /^(KUL|CTE)$/ ) { + $init_args->{'data'} = $init_args->{$format}; + $init_args->{'format'} = $format; + $format = 'BaseText'; + @parseargs = %$init_args; + } else { + @parseargs = ( $init_args->{ $format } ); + } + my $mod = "Text::Tradition::Parser::$format"; + load( $mod ); + $mod->can('parse')->( $self, @parseargs ); + } } } sub witness { my( $self, $sigil ) = @_; my $requested_wit; - foreach my $wit ( @{$self->witnesses} ) { - $requested_wit = $wit if $wit->sigil eq $sigil; + foreach my $wit ( $self->witnesses ) { + $requested_wit = $wit if $wit->sigil eq $sigil; } # We depend on an undef return value for no such witness. # warn "No such witness $sigil" unless $requested_wit; return $requested_wit; } - - -sub add_witness { - my $self = shift; - my $new_wit = Text::Tradition::Witness->new( @_ ); - push( @{$self->witnesses}, $new_wit ); - return $new_wit; -} + # The user will usually be instantiating a Tradition object, and # examining its collation. The information about the tradition can diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 40cc565..8e42931 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -1,7 +1,10 @@ package Text::Tradition::Collation; +use Encode qw( decode_utf8 ); +use File::Temp; use Graph::Easy; use IPC::Run qw( run binary ); +use Text::CSV_XS; use Text::Tradition::Collation::Path; use Text::Tradition::Collation::Position; use Text::Tradition::Collation::Reading; @@ -14,20 +17,21 @@ has 'graph' => ( is => 'ro', isa => 'Graph::Easy', handles => { - add_reading => 'add_node', - del_reading => 'del_node', - add_path => 'add_edge', - del_path => 'del_edge', - reading => 'node', - path => 'edge', - readings => 'nodes', - segments => 'nodes', - paths => 'edges', - relationships => 'edges', + add_reading => 'add_node', + del_reading => 'del_node', + del_segment => 'del_node', + add_path => 'add_edge', + del_path => 'del_edge', + reading => 'node', + path => 'edge', + readings => 'nodes', + segments => 'nodes', + paths => 'edges', + relationships => 'edges', }, default => sub { Graph::Easy->new( undirected => 0 ) }, ); - + has 'tradition' => ( # TODO should this not be ro? is => 'rw', @@ -48,6 +52,13 @@ has 'graphml' => ( predicate => 'has_graphml', ); +has 'csv' => ( + is => 'ro', + isa => 'Str', + writer => '_save_csv', + predicate => 'has_csv', + ); + # Keeps track of the lemmas within the collation. At most one lemma # per position in the graph. has 'lemmata' => ( @@ -119,20 +130,20 @@ around add_path => sub { # Make sure there are three arguments unless( @_ == 3 ) { - warn "Call add_path with args source, target, witness"; - return; + warn "Call add_path with args source, target, witness"; + return; } # Make sure the proposed path does not yet exist # NOTE 'reading' will currently return readings and segments my( $source, $target, $wit ) = @_; $source = $self->reading( $source ) - unless ref( $source ) eq 'Text::Tradition::Collation::Reading'; + unless ref( $source ) eq 'Text::Tradition::Collation::Reading'; $target = $self->reading( $target ) - unless ref( $target ) eq 'Text::Tradition::Collation::Reading'; + unless ref( $target ) eq 'Text::Tradition::Collation::Reading'; foreach my $path ( $source->edges_to( $target ) ) { - if( $path->label eq $wit && $path->class eq 'edge.path' ) { - return; - } + if( $path->label eq $wit && $path->class eq 'edge.path' ) { + return; + } } # Do the deed $self->$orig( @_ ); @@ -205,23 +216,21 @@ sub add_relationship { # Make sure there is not another relationship between these two # readings or segments already $source = $self->reading( $source ) - unless ref( $source ) && $source->isa( 'Graph::Easy::Node' ); + unless ref( $source ) && $source->isa( 'Graph::Easy::Node' ); $target = $self->reading( $target ) - unless ref( $target ) && $target->isa( 'Graph::Easy::Node' ); + unless ref( $target ) && $target->isa( 'Graph::Easy::Node' ); foreach my $rel ( $source->edges_to( $target ), $target->edges_to( $source ) ) { - if( $rel->class eq 'edge.relationship' ) { - return ( undef, "Relationship already exists between these readings" ); - } else { - return ( undef, "There is a witness path between these readings" ); - } + if( $rel->class eq 'edge.relationship' ) { + return ( undef, "Relationship already exists between these readings" ); + } } - - if( $source->has_position && $target->has_position ) { - unless( grep { $_ eq $target } $self->same_position_as( $source ) ) { - return( undef, "Cannot set relationship at different positions" ); - } + if( $options->{'equal_rank'} && !relationship_valid( $source, $target ) ) { + return ( undef, 'Relationship creates witness loop' ); } + # TODO Think about positional hilarity if relationships are added after positions + # are assigned. + my @joined = ( [ $source->name, $target->name ] ); # Keep track of the nodes we join. $options->{'this_relation'} = [ $source, $target ]; @@ -231,29 +240,38 @@ sub add_relationship { return ( undef, $@ ); } $self->graph->add_edge( $source, $target, $rel ); - if( $options->{'global'} ) { - # Look for all readings with the source label, and if there are - # colocated readings with the target label, join them too. - foreach my $r ( grep { $_->label eq $source->label } $self->readings() ) { - next if $r->name eq $source->name; - my @colocated = grep { $_->label eq $target->label } - $self->same_position_as( $r ); - if( @colocated ) { - warn "Multiple readings with same label at same position!" - if @colocated > 1; - my $colo = $colocated[0]; - next if $colo->edges_to( $r ) || $r->edges_to( $colo ); - $options->{'primary_relation'} = $options->{'this_relation'}; - $options->{'this_relation'} = [ $r, $colocated[0] ]; - my $dup_rel = Text::Tradition::Collation::Relationship->new( %$options ); - $self->graph->add_edge( $r, $colocated[0], $dup_rel ); - push( @joined, [ $r->name, $colocated[0]->name ] ); - } - } - } + + # TODO Handle global relationship setting + return( 1, @joined ); } +sub relationship_valid { + my( $source, $target ) = @_; + # Check that linking the source and target in a relationship won't lead + # to a path loop for any witness. + my @proposed_related = ( $source, $target ); + push( @proposed_related, $source->related_readings ); + push( @proposed_related, $target->related_readings ); + my %pr_ids; + map { $pr_ids{ $_->name } = 1 } @proposed_related; + # The lists of 'in' and 'out' should not have any element that appears + # in 'proposed_related'. + foreach my $pr ( @proposed_related ) { + foreach my $e ( $pr->incoming ) { + if( exists $pr_ids{ $e->from->name } ) { + return 0; + } + } + foreach my $e ( $pr->outgoing ) { + if( exists $pr_ids{ $e->to->name } ) { + return 0; + } + } + } + return 1; +} + =head2 Output method(s) =over @@ -277,8 +295,12 @@ sub as_svg { my @cmd = qw/dot -Tsvg/; my( $svg, $err ); - my $in = $self->as_dot(); - run( \@cmd, \$in, ">", binary(), \$svg ); + my $dotfile = File::Temp->new(); + binmode $dotfile, ':utf8'; + print $dotfile $self->as_dot(); + push( @cmd, $dotfile->filename ); + run( \@cmd, ">", binary(), \$svg ); + $svg = decode_utf8( $svg ); $self->_save_svg( $svg ); $self->expand_graph_paths(); return $svg; @@ -305,25 +327,25 @@ sub as_dot { $dot .= "\tedge [ arrowhead=open ];\n"; $dot .= "\tgraph [ rankdir=LR ];\n"; $dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n", - 11, "white", "filled", $self->graph->get_attribute( 'node', 'shape' ) ); + 11, "white", "filled", $self->graph->get_attribute( 'node', 'shape' ) ); foreach my $reading ( $self->readings ) { - # Need not output nodes without separate labels - next if $reading->name eq $reading->label; - # TODO output readings or segments, but not both - next if $reading->class eq 'node.segment'; - $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->name, $reading->label ); + # Need not output nodes without separate labels + next if $reading->name eq $reading->label; + # TODO output readings or segments, but not both + next if $reading->class eq 'node.segment'; + $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->name, $reading->label ); } my @edges = $view eq 'relationship' ? $self->relationships : $self->paths; foreach my $edge ( @edges ) { - my %variables = ( 'color' => '#000000', - 'fontcolor' => '#000000', - 'label' => $edge->label, - ); - my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables ); - $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n", - $edge->from->name, $edge->to->name, $varopts ); + my %variables = ( 'color' => '#000000', + 'fontcolor' => '#000000', + 'label' => $edge->label, + ); + my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables ); + $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n", + $edge->from->name, $edge->to->name, $varopts ); } $dot .= "}\n"; return $dot; @@ -348,7 +370,7 @@ sub as_graphml { my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns'; my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance'; my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' . - 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'; + 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd'; # Create the document and root node my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" ); @@ -363,24 +385,24 @@ sub as_graphml { my %node_data_keys; my $ndi = 0; foreach my $datum ( qw/ name reading identical position class / ) { - $node_data_keys{$datum} = 'dn'.$ndi++; - my $key = $root->addNewChild( $graphml_ns, 'key' ); - $key->setAttribute( 'attr.name', $datum ); - $key->setAttribute( 'attr.type', 'string' ); - $key->setAttribute( 'for', 'node' ); - $key->setAttribute( 'id', $node_data_keys{$datum} ); + $node_data_keys{$datum} = 'dn'.$ndi++; + my $key = $root->addNewChild( $graphml_ns, 'key' ); + $key->setAttribute( 'attr.name', $datum ); + $key->setAttribute( 'attr.type', 'string' ); + $key->setAttribute( 'for', 'node' ); + $key->setAttribute( 'id', $node_data_keys{$datum} ); } # Add the data keys for edges, i.e. witnesses my $edi = 0; my %edge_data_keys; foreach my $edge_key( qw/ witness_main witness_ante_corr relationship class / ) { - $edge_data_keys{$edge_key} = 'de'.$edi++; - my $key = $root->addNewChild( $graphml_ns, 'key' ); - $key->setAttribute( 'attr.name', $edge_key ); - $key->setAttribute( 'attr.type', 'string' ); - $key->setAttribute( 'for', 'edge' ); - $key->setAttribute( 'id', $edge_data_keys{$edge_key} ); + $edge_data_keys{$edge_key} = 'de'.$edi++; + my $key = $root->addNewChild( $graphml_ns, 'key' ); + $key->setAttribute( 'attr.name', $edge_key ); + $key->setAttribute( 'attr.type', 'string' ); + $key->setAttribute( 'for', 'edge' ); + $key->setAttribute( 'id', $edge_data_keys{$edge_key} ); } # Add the graph, its nodes, and its edges @@ -397,55 +419,55 @@ sub as_graphml { my %node_hash; # Add our readings to the graph foreach my $n ( sort { $a->name cmp $b->name } $self->readings ) { - my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); - my $node_xmlid = 'n' . $node_ctr++; - $node_hash{ $n->name } = $node_xmlid; - $node_el->setAttribute( 'id', $node_xmlid ); - _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); - _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label ); - _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference ) - if $n->has_position; - _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); - _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name ) - if $n->has_primary; + my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); + my $node_xmlid = 'n' . $node_ctr++; + $node_hash{ $n->name } = $node_xmlid; + $node_el->setAttribute( 'id', $node_xmlid ); + _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); + _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label ); + _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference ) + if $n->has_position; + _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); + _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name ) + if $n->has_primary; } # Add any segments we have foreach my $n ( sort { $a->name cmp $b->name } $self->segments ) { - my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); - my $node_xmlid = 'n' . $node_ctr++; - $node_hash{ $n->name } = $node_xmlid; - $node_el->setAttribute( 'id', $node_xmlid ); - _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); - _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); + my $node_el = $graph->addNewChild( $graphml_ns, 'node' ); + my $node_xmlid = 'n' . $node_ctr++; + $node_hash{ $n->name } = $node_xmlid; + $node_el->setAttribute( 'id', $node_xmlid ); + _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); + _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); } # Add the path, relationship, and segment edges my $edge_ctr = 0; foreach my $e ( sort { $a->from->name cmp $b->from->name } $self->graph->edges() ) { - my( $name, $from, $to ) = ( 'e'.$edge_ctr++, - $node_hash{ $e->from->name() }, - $node_hash{ $e->to->name() } ); - my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' ); - $edge_el->setAttribute( 'source', $from ); - $edge_el->setAttribute( 'target', $to ); - $edge_el->setAttribute( 'id', $name ); - # Add the edge class - _add_graphml_data( $edge_el, $edge_data_keys{'class'}, $e->sub_class ); - if( $e->sub_class eq 'path' ) { - # It's a witness path, so add the witness - my $base = $e->label; - my $key = $edge_data_keys{'witness_main'}; - # TODO kind of hacky - if( $e->label =~ /^(.*?)\s+(\(a\.c\.\))$/ ) { - $base = $1; - $key = $edge_data_keys{'witness_ante_corr'}; - } - _add_graphml_data( $edge_el, $key, $base ); - } elsif( $e->sub_class eq 'relationship' ) { - # It's a relationship - _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $e->label ); - } # else a segment, nothing to record but source, target, class + my( $name, $from, $to ) = ( 'e'.$edge_ctr++, + $node_hash{ $e->from->name() }, + $node_hash{ $e->to->name() } ); + my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' ); + $edge_el->setAttribute( 'source', $from ); + $edge_el->setAttribute( 'target', $to ); + $edge_el->setAttribute( 'id', $name ); + # Add the edge class + _add_graphml_data( $edge_el, $edge_data_keys{'class'}, $e->sub_class ); + if( $e->sub_class eq 'path' ) { + # It's a witness path, so add the witness + my $base = $e->label; + my $key = $edge_data_keys{'witness_main'}; + # TODO kind of hacky + if( $e->label =~ /^(.*?)\s+(\(a\.c\.\))$/ ) { + $base = $1; + $key = $edge_data_keys{'witness_ante_corr'}; + } + _add_graphml_data( $edge_el, $key, $base ); + } elsif( $e->sub_class eq 'relationship' ) { + # It's a relationship + _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $e->label ); + } # else a segment, nothing to record but source, target, class } # Return the thing @@ -461,6 +483,81 @@ sub _add_graphml_data { $data_el->appendText( $value ); } +=item B + +print $graph->as_csv( $recalculate ) + +Returns a CSV alignment table representation of the collation graph, one +row per witness (or witness uncorrected.) Unless $recalculate is passed +(and is a true value), the method will return a cached copy of the CSV +after the first call to the method. + +=cut + +sub as_csv { + my( $self, $recalc ) = @_; + return $self->csv if $self->has_csv; + my $table = $self->make_alignment_table; + my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } ); + my @result; + foreach my $row ( @$table ) { + $csv->combine( @$row ); + push( @result, decode_utf8( $csv->string ) ); + } + $self->_save_csv( join( "\n", @result ) ); + return $self->csv; +} + +sub make_alignment_table { + my $self = shift; + unless( $self->linear ) { + warn "Need a linear graph in order to make an alignment table"; + return; + } + my $table; + my @all_pos = sort { $a <=> $b } $self->possible_positions; + foreach my $wit ( $self->tradition->witnesses ) { + my @row = _make_witness_row( $wit->path, \@all_pos ); + unshift( @row, $wit->sigil ); + push( @$table, \@row ); + if( $wit->has_ante_corr ) { + my @ac_row = _make_witness_row( $wit->uncorrected_path, \@all_pos ); + unshift( @ac_row, $wit->sigil . $self->ac_label ); + push( @$table, \@ac_row ); + } + } + # Return a table where the witnesses read in columns rather than rows. + my $turned = _turn_table( $table ); + return $turned; +} + +sub _make_witness_row { + my( $path, $positions ) = @_; + my %char_hash; + map { $char_hash{$_} = undef } @$positions; + foreach my $rdg ( @$path ) { + $char_hash{$rdg->rank} = $rdg->text; + } + my @row = map { $char_hash{$_} } @$positions; + return @row; +} + +# Helper to turn the witnesses along columns rather than rows. Assumes +# equal-sized rows. +sub _turn_table { + my( $table ) = @_; + my $result = []; + return $result unless scalar @$table; + my $nrows = scalar @{$table->[0]}; + foreach my $idx ( 0 .. $nrows - 1 ) { + foreach my $wit ( 0 .. $#{$table} ) { + $result->[$idx]->[$wit] = $table->[$wit]->[$idx]; + } + } + return $result; +} + + sub collapse_graph_paths { my $self = shift; # Our collation graph has an path per witness. This is great for @@ -472,42 +569,35 @@ sub collapse_graph_paths { print STDERR "Collapsing witness paths in graph...\n"; # Don't list out every witness if we have more than half to list. - my $majority = int( scalar( @{$self->tradition->witnesses} ) / 2 ) + 1; + my $majority = int( scalar( $self->tradition->witnesses ) / 2 ) + 1; # But don't compress if there are only a few witnesses. $majority = 4 if $majority < 4; foreach my $node ( $self->readings ) { - my $newlabels = {}; - # We will visit each node, so we only look ahead. - foreach my $edge ( $node->outgoing() ) { - next unless $edge->class eq 'edge.path'; - add_hash_entry( $newlabels, $edge->to->name, $edge->name ); - $self->del_path( $edge ); - } - - foreach my $newdest ( keys %$newlabels ) { - my $label; - my @compressed_wits = (); - if( @{$newlabels->{$newdest}} < $majority ) { - $label = join( ', ', sort( @{$newlabels->{$newdest}} ) ); - } else { - ## TODO FIX THIS HACK - my @aclabels; - foreach my $wit ( @{$newlabels->{$newdest}} ) { - if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ) { - push( @aclabels, $wit ); - } else { - push( @compressed_wits, $wit ); - } - } - $label = join( ', ', 'majority', sort( @aclabels ) ); - } - - my $newpath = - $self->add_path( $node, $self->reading( $newdest ), $label ); - if( @compressed_wits ) { - $newpath->hidden_witnesses( \@compressed_wits ); - } - } + my $newlabels = {}; + # We will visit each node, so we only look ahead. + foreach my $edge ( $node->outgoing() ) { + next unless $edge->class eq 'edge.path'; + add_hash_entry( $newlabels, $edge->to->name, $edge->name ); + $self->del_path( $edge ); + } + + foreach my $newdest ( keys %$newlabels ) { + my $label; + my @compressed_wits = @{$newlabels->{$newdest}}; + if( @compressed_wits < $majority ) { + $label = join( ', ', sort( @{$newlabels->{$newdest}} ) ); + } else { + ## TODO FIX THIS HACK + my @aclabels; + foreach my $wit ( @compressed_wits ) { + push( @aclabels, $wit ) if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ); + } + $label = join( ', ', 'majority', sort( @aclabels ) ); + } + + my $newpath = $self->add_path( $node, $self->reading( $newdest ), $label ); + $newpath->hidden_witnesses( \@compressed_wits ); + } } $self->collapsed( 1 ); @@ -523,16 +613,15 @@ sub expand_graph_paths { print STDERR "Expanding witness paths in graph...\n"; foreach my $path( $self->paths ) { - my $from = $path->from; - my $to = $path->to; - my @wits = split( /, /, $path->label ); - if( $path->has_hidden_witnesses ) { - push( @wits, @{$path->hidden_witnesses} ); - } - $self->del_path( $path ); - foreach ( @wits ) { - $self->add_path( $from, $to, $_ ); - } + my $from = $path->from; + my $to = $path->to; + warn sprintf( "No hidden witnesses on %s -> %s ?", $from->name, $to->name ) + unless $path->has_hidden_witnesses; + my @wits = @{$path->hidden_witnesses}; + $self->del_path( $path ); + foreach ( @wits ) { + $self->add_path( $from, $to, $_ ); + } } $self->collapsed( 0 ); } @@ -556,12 +645,34 @@ sub start { my $self = shift; my( $new_start ) = @_; if( $new_start ) { - $self->del_reading( '#START#' ); - $self->graph->rename_node( $new_start, '#START#' ); + $self->del_reading( '#START#' ); + $self->graph->rename_node( $new_start, '#START#' ); + } + # Make sure the start node has a start position. + unless( $self->reading( '#START#' )->has_position ) { + $self->reading( '#START#' )->position( '0,0' ); } return $self->reading('#START#'); } +=item B + +my $end = $collation->end(); + +Returns the end of the collation, a meta-reading with label '#END#'. + +=cut + +sub end { + my $self = shift; + my( $new_end ) = @_; + if( $new_end ) { + $self->del_reading( '#END#' ); + $self->graph->rename_node( $new_end, '#END#' ); + } + return $self->reading('#END#'); +} + =item B my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] ); @@ -572,6 +683,8 @@ assume that the path is that of the base text (if any.) =cut +# TODO Think about returning some lazy-eval iterator. + sub reading_sequence { my( $self, $start, $end, $witness, $backup ) = @_; @@ -580,23 +693,23 @@ sub reading_sequence { my %seen; my $n = $start; while( $n && $n ne $end ) { - if( exists( $seen{$n->name()} ) ) { - warn "Detected loop at " . $n->name(); - last; - } - $seen{$n->name()} = 1; - - my $next = $self->next_reading( $n, $witness, $backup ); - warn "Did not find any path for $witness from reading " . $n->name - unless $next; - push( @readings, $next ); - $n = $next; + if( exists( $seen{$n->name()} ) ) { + warn "Detected loop at " . $n->name(); + last; + } + $seen{$n->name()} = 1; + + my $next = $self->next_reading( $n, $witness, $backup ); + warn "Did not find any path for $witness from reading " . $n->name + unless $next; + push( @readings, $next ); + $n = $next; } # Check that the last reading is our end reading. my $last = $readings[$#readings]; warn "Last reading found from " . $start->label() . - " for witness $witness is not the end!" - unless $last eq $end; + " for witness $witness is not the end!" + unless $last eq $end; return @readings; } @@ -634,7 +747,7 @@ sub prior_reading { sub _find_linked_reading { my( $self, $direction, $node, $path, $alt_path ) = @_; my @linked_paths = $direction eq 'next' - ? $node->outgoing() : $node->incoming(); + ? $node->outgoing() : $node->incoming(); return undef unless scalar( @linked_paths ); # We have to find the linked path that contains all of the @@ -645,29 +758,29 @@ sub _find_linked_reading { my $base_le; my $alt_le; foreach my $le ( @linked_paths ) { - if( $le->name eq $self->baselabel ) { - $base_le = $le; - } else { - my @le_wits = $self->witnesses_of_label( $le->name ); - if( _is_within( \@path_wits, \@le_wits ) ) { - # This is the right path. - return $direction eq 'next' ? $le->to() : $le->from(); - } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) { - $alt_le = $le; - } - } + if( $le->name eq $self->baselabel ) { + $base_le = $le; + } else { + my @le_wits = $self->witnesses_of_label( $le->name ); + if( _is_within( \@path_wits, \@le_wits ) ) { + # This is the right path. + return $direction eq 'next' ? $le->to() : $le->from(); + } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) { + $alt_le = $le; + } + } } # Got this far? Return the alternate path if it exists. return $direction eq 'next' ? $alt_le->to() : $alt_le->from() - if $alt_le; + if $alt_le; # Got this far? Return the base path if it exists. return $direction eq 'next' ? $base_le->to() : $base_le->from() - if $base_le; + if $base_le; # Got this far? We have no appropriate path. warn "Could not find $direction node from " . $node->label - . " along path $path"; + . " along path $path"; return undef; } @@ -676,7 +789,7 @@ sub _is_within { my( $set1, $set2 ) = @_; my $ret = @$set1; # will be 0, i.e. false, if set1 is empty foreach my $el ( @$set1 ) { - $ret = 0 unless grep { /^\Q$el\E$/ } @$set2; + $ret = 0 unless grep { /^\Q$el\E$/ } @$set2; } return $ret; } @@ -695,24 +808,24 @@ sub walk_witness_paths { # text in the collation. my $paths = {}; my @common_readings; - foreach my $wit ( @{$self->tradition->witnesses} ) { - my $curr_reading = $self->start; - my @wit_path = $self->reading_sequence( $self->start, $end, - $wit->sigil ); - $wit->path( \@wit_path ); - - # Detect the common readings. - @common_readings = _find_common( \@common_readings, \@wit_path ); + foreach my $wit ( $self->tradition->witnesses ) { + my $curr_reading = $self->start; + my @wit_path = $self->reading_sequence( $self->start, $end, + $wit->sigil ); + $wit->path( \@wit_path ); + + # Detect the common readings. + @common_readings = _find_common( \@common_readings, \@wit_path ); } # Mark all the nodes as either common or not. foreach my $cn ( @common_readings ) { - print STDERR "Setting " . $cn->name . " / " . $cn->label - . " as common node\n"; - $cn->make_common; + print STDERR "Setting " . $cn->name . " / " . $cn->label + . " as common node\n"; + $cn->make_common; } foreach my $n ( $self->readings() ) { - $n->make_variant unless $n->is_common; + $n->make_variant unless $n->is_common; } # Return an array of the common nodes in order. return @common_readings; @@ -722,11 +835,11 @@ sub _find_common { my( $common_readings, $new_path ) = @_; my @cr; if( @$common_readings ) { - foreach my $n ( @$new_path ) { - push( @cr, $n ) if grep { $_ eq $n } @$common_readings; - } + foreach my $n ( @$new_path ) { + push( @cr, $n ) if grep { $_ eq $n } @$common_readings; + } } else { - push( @cr, @$new_path ); + push( @cr, @$new_path ); } return @cr; } @@ -737,7 +850,7 @@ sub _remove_common { my %diverged; map { $diverged{$_->name} = 1 } @$divergence; foreach( @$common_readings ) { - push( @cr, $_ ) unless $diverged{$_->name}; + push( @cr, $_ ) unless $diverged{$_->name}; } return @cr; } @@ -751,11 +864,11 @@ sub make_witness_paths { my( $self ) = @_; my @common_readings; - foreach my $wit ( @{$self->tradition->witnesses} ) { - print STDERR "Making path for " . $wit->sigil . "\n"; - $self->make_witness_path( $wit ); - @common_readings = _find_common( \@common_readings, $wit->path ); - @common_readings = _find_common( \@common_readings, $wit->uncorrected_path ); + foreach my $wit ( $self->tradition->witnesses ) { + print STDERR "Making path for " . $wit->sigil . "\n"; + $self->make_witness_path( $wit ); + @common_readings = _find_common( \@common_readings, $wit->path ); + @common_readings = _find_common( \@common_readings, $wit->uncorrected_path ); } map { $_->make_common } @common_readings; return @common_readings; @@ -766,14 +879,14 @@ sub make_witness_path { my @chain = @{$wit->path}; my $sig = $wit->sigil; foreach my $idx ( 0 .. $#chain-1 ) { - $self->add_path( $chain[$idx], $chain[$idx+1], $sig ); + $self->add_path( $chain[$idx], $chain[$idx+1], $sig ); } @chain = @{$wit->uncorrected_path}; foreach my $idx( 0 .. $#chain-1 ) { - my $source = $chain[$idx]; - my $target = $chain[$idx+1]; - $self->add_path( $source, $target, $sig.$self->ac_label ) - unless $self->has_path( $source, $target, $sig ); + my $source = $chain[$idx]; + my $target = $chain[$idx+1]; + $self->add_path( $source, $target, $sig.$self->ac_label ) + unless $self->has_path( $source, $target, $sig ); } } @@ -783,165 +896,104 @@ sub common_readings { return sort { $a->position->cmp_with( $b->position ) } @common; } -# Calculate the relative positions of nodes in the graph, if they -# were not given to us. -sub calculate_positions { - my( $self, @ordered_common ) = @_; - - # First assign positions to all the common nodes. - my $l = 1; - foreach my $oc ( @ordered_common ) { - $oc->position( $l++, 1 ); +sub calculate_ranks { + my $self = shift; + # Walk a version of the graph where every node linked by a relationship + # edge is fundamentally the same node, and do a topological ranking on + # the nodes in this graph. + my $topo_graph = Graph::Easy->new(); + my %rel_containers; + my $rel_ctr = 0; + # Add the nodes + foreach my $r ( $self->readings ) { + next if exists $rel_containers{$r->name}; + my @rels = $r->related_readings( 'colocated' ); + if( @rels ) { + # Make a relationship container. + push( @rels, $r ); + my $rn = $topo_graph->add_node( 'rel_container_' . $rel_ctr++ ); + foreach( @rels ) { + $rel_containers{$_->name} = $rn; + } + } else { + # Add a new node to mirror the old node. + $rel_containers{$r->name} = $topo_graph->add_node( $r->name ); + } } - if( $self->linear ) { - # For the space between each common node, we have to find all the chains - # from all the witnesses. The longest chain gives us our max, and the - # others get min/max ranges to fit. - my $first = shift @ordered_common; - while( @ordered_common ) { - my %paths; - my $next = shift @ordered_common; - my $longest = 0; - foreach my $wit ( @{$self->tradition->witnesses} ) { - # Key to the path is not important; we just have to get - # all unique paths. - my $length = $self->_track_paths( \%paths, $first, $next, $wit->sigil ); - $longest = $length unless $longest > $length; - if( $wit->has_ante_corr ) { - my $length = $self->_track_paths( \%paths, $first, $next, - $wit->sigil.$self->ac_label, $wit->sigil ); - $longest = $length unless $longest > $length; - } - } - - # Transform the path values from unique strings to arrays. - my @all_paths; - foreach my $k ( keys %paths ) { - my @v = split( /\s+/, $k ); - push( @all_paths, \@v ); - } - @all_paths = sort { scalar @$b <=> scalar @$a } @all_paths; - - # Now %paths has all the unique paths, and we know how long the - # longest of these is. Assign positions, starting with the - # longest. All non-common positions start at 2. - foreach my $path ( @all_paths ) { - # Initially each element has a minimum position of 2 - # plus its position in the array (1 is the common - # node), and a max position of the longest array - # length minus its position in the array. - my $range = $longest - scalar @$path; - my $min = 2; - foreach my $i ( 0 .. $#{$path} ) { - my $rdg = $self->reading( $path->[$i] ); - if( $rdg->has_position ) { - # This reading has already had a more specific - # position set, so we need to take that into - # account when calculating the min and max for - # the next reading. - my $rangeminus = $rdg->position->min - $min; - $min = $rdg->position->min + 1; - $range = $range - $rangeminus; - if( $range < 0 ) { - print STDERR "Negative range for position! " . $rdg->name . "\n"; # May remove this warning - $range = 0; - } - } else { - $rdg->position( $first->position->common, $min, $min+$range ); - $min++; - $longest = $min+$range-2 unless $longest+2 > $min+$range; # min starts at 2 but longest assumes 0 start - } - } - } - # Now go through again and make sure the positions are - # monotonic. Do this until they are. - my $monotonic = 0; - my $counter = 0; - until( $monotonic ) { - $monotonic = 1; - $counter++; - foreach my $path ( @all_paths ) { - foreach my $i ( 0 .. $#{$path} ) { - my $rdg = $self->reading( $path->[$i] ); - my $prior = $self->reading( $path->[$i-1] ) if $i > 0; - my $next = $self->reading( $path->[$i+1] ) if $i < $#{$path}; - if( $prior && $rdg->position->min <= $prior->position->min ) { - $monotonic = 0; - $rdg->position->min( $prior->position->min + 1 ); - } - if( $next && $rdg->position->max >= $next->position->max ) { - $monotonic = 0; - if( $next->position->max - 1 >= $rdg->position->min ) { - # If moving rdg/max down would not send it below - # rdg/min, do that. - $rdg->position->max( $next->position->max - 1 ); - } else { - # Otherwise increase next/max. - $next->position->max( $rdg->position->max + 1 ); - # ...min will be fixed on the next pass. - } - } - } - } - if( $counter > $#all_paths + 1 && !$monotonic ) { - # We risk an infinite loop. Get out of here. - warn "Still not monotonic after $counter passes at common point " - . $first->position->common; - last; - } - } - print STDERR "Took $counter passes for monotonicity at " . $first->position->common. "\n" - if $counter > 1; - - $first = $next; - } - - } else { - - # Non-linear positions are pretty much impossible to pin down. - # Any reading might appear anywhere in the graph. I guess we - # can do positions where there aren't transpositions... - + # Add the edges. Need only one edge between any pair of nodes. + foreach my $r ( $self->readings ) { + foreach my $n ( $r->neighbor_readings( 'forward' ) ) { + $topo_graph->add_edge_once( $rel_containers{$r->name}, + $rel_containers{$n->name} ); + } + } + + # Now do the rankings, starting with the start node. + my $topo_start = $rel_containers{$self->start->name}; + my $node_ranks = { $topo_start->name => 0 }; + my @curr_origin = ( $topo_start ); + # A little iterative function. + while( @curr_origin ) { + @curr_origin = _assign_rank( $node_ranks, @curr_origin ); + } + # Transfer our rankings from the topological graph to the real one. + foreach my $r ( $self->readings ) { + $r->rank( $node_ranks->{$rel_containers{$r->name}->name} ); } - $self->init_lemmata(); } -# Helper function for the guts of calculate_positions. -sub _track_paths { - my $self = shift; - my $track_hash = shift; - # Args are first, last, wit, backup - my @path = $self->reading_sequence( @_ ); - # Top and tail the array - shift @path; - pop @path; - $track_hash->{join( ' ', map { $_->name } @path )} = $_[2] - if @path; - return @path; +sub _assign_rank { + my( $node_ranks, @current_nodes ) = @_; + # Look at each of the children of @current_nodes. If all the child's + # parents have a rank, assign it the highest rank + 1 and add it to + # @next_nodes. Otherwise skip it. + my @next_nodes; + foreach my $c ( @current_nodes ) { + warn "Current reading " . $c->name . "has no rank!" + unless exists $node_ranks->{$c->name}; + # print STDERR "Looking at child of node " . $c->name . ", rank " + # . $node_ranks->{$c->name} . "\n"; + my @children = map { $_->to } $c->outgoing; + foreach my $child ( @children ) { + next if exists $node_ranks->{$child->name}; + my $highest_rank = -1; + my $skip = 0; + my @parents = map { $_->from } $child->incoming; + foreach my $parent ( @parents ) { + if( exists $node_ranks->{$parent->name} ) { + $highest_rank = $node_ranks->{$parent->name} + if $highest_rank <= $node_ranks->{$parent->name}; + } else { + $skip = 1; + last; + } + } + next if $skip; + # print STDERR "Assigning rank " . ( $highest_rank + 1 ) . " to node " . $child->name . "\n"; + $node_ranks->{$child->name} = $highest_rank + 1; + push( @next_nodes, $child ); + } + } + return @next_nodes; } - + sub possible_positions { my $self = shift; - my @answer; - my %positions = (); - foreach my $r ( $self->readings ) { - next unless $r->has_position; - $positions{$r->position->maxref} = 1; - } - @answer = keys %positions; - return @answer; + my %all_pos; + map { $all_pos{ $_->rank } = 1 } $self->readings; + return keys %all_pos; } # TODO think about indexing this. sub readings_at_position { my( $self, $position, $strict ) = @_; unless( ref( $position ) eq 'Text::Tradition::Collation::Position' ) { - $position = Text::Tradition::Collation::Position->new( $position ); + $position = Text::Tradition::Collation::Position->new( $position ); } my @answer; foreach my $r ( $self->readings ) { - push( @answer, $r ) if $r->is_at_position( $position, $strict ); + push( @answer, $r ) if $r->is_at_position( $position, $strict ); } return @answer; } @@ -952,11 +1004,11 @@ sub init_lemmata { my $self = shift; foreach my $position ( $self->possible_positions ) { - $self->lemmata->{$position} = undef; + $self->lemmata->{$position} = undef; } foreach my $cr ( $self->common_readings ) { - $self->lemmata->{$cr->position->maxref} = $cr->name; + $self->lemmata->{$cr->position->maxref} = $cr->name; } } @@ -1007,37 +1059,37 @@ sub lemma_readings { map { $fixed_positions{$_} = 0 } keys %{$positions_off}; map { $fixed_positions{$_} = 1 } $self->possible_positions; foreach my $pos ( sort { Text::Tradition::Collation::Position::str_cmp( $a, $b ) } keys %fixed_positions ) { - # Find the state of this position. If there is an active node, - # its name will be the state; otherwise the state will be 0 - # (nothing at this position) or undef (ellipsis at this position) - my $active = undef; - $active = $self->lemmata->{$pos} if exists $self->lemmata->{$pos}; - - # Is there a formerly active node that was toggled off? - if( exists( $positions_off->{$pos} ) ) { - my $off_node = $positions_off->{$pos}; - if( $active && $active ne $off_node) { - push( @answer, [ $off_node, 0 ], [ $active, 1 ] ); - } else { - unless( $fixed_positions{$pos} ) { - $active = 0; - delete $self->lemmata->{$pos}; - } - push( @answer, [ $off_node, $active ] ); - } - - # No formerly active node, so we just see if there is a currently - # active one. - } elsif( $active ) { - # Push the active node, whatever it is. - push( @answer, [ $active, 1 ] ); - } else { - # Push the state that is there. Arbitrarily use the first node - # at that position. - my @pos_nodes = $self->readings_at_position( $pos ); - push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] ); - delete $self->lemmata->{$pos} unless $fixed_positions{$pos}; - } + # Find the state of this position. If there is an active node, + # its name will be the state; otherwise the state will be 0 + # (nothing at this position) or undef (ellipsis at this position) + my $active = undef; + $active = $self->lemmata->{$pos} if exists $self->lemmata->{$pos}; + + # Is there a formerly active node that was toggled off? + if( exists( $positions_off->{$pos} ) ) { + my $off_node = $positions_off->{$pos}; + if( $active && $active ne $off_node) { + push( @answer, [ $off_node, 0 ], [ $active, 1 ] ); + } else { + unless( $fixed_positions{$pos} ) { + $active = 0; + delete $self->lemmata->{$pos}; + } + push( @answer, [ $off_node, $active ] ); + } + + # No formerly active node, so we just see if there is a currently + # active one. + } elsif( $active ) { + # Push the active node, whatever it is. + push( @answer, [ $active, 1 ] ); + } else { + # Push the state that is there. Arbitrarily use the first node + # at that position. + my @pos_nodes = $self->readings_at_position( $pos ); + push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] ); + delete $self->lemmata->{$pos} unless $fixed_positions{$pos}; + } } return @answer; @@ -1059,8 +1111,8 @@ sub toggle_reading { return unless $rname; my $reading = $self->reading( $rname ); if( !$reading || $reading->is_common() ) { - # Do nothing, it's a common node. - return; + # Do nothing, it's a common node. + return; } my $pos = $reading->position; @@ -1069,43 +1121,43 @@ sub toggle_reading { my @readings_off; if( $old_state && $old_state eq $rname ) { - # Turn off the node. We turn on no others by default. - push( @readings_off, $reading ); + # Turn off the node. We turn on no others by default. + push( @readings_off, $reading ); } else { - # Turn on the node. - $self->lemmata->{$pos->reference} = $rname; - # Any other 'on' readings in the same position should be off - # if we have a fixed position. - push( @readings_off, $self->same_position_as( $reading, 1 ) ) - if $pos->fixed; - # Any node that is an identical transposed one should be off. - push( @readings_off, $reading->identical_readings ); + # Turn on the node. + $self->lemmata->{$pos->reference} = $rname; + # Any other 'on' readings in the same position should be off + # if we have a fixed position. + push( @readings_off, $self->same_position_as( $reading, 1 ) ) + if $pos->fixed; + # Any node that is an identical transposed one should be off. + push( @readings_off, $reading->identical_readings ); } @readings_off = unique_list( @readings_off ); - + # Turn off the readings that need to be turned off. my @readings_delemmatized; foreach my $n ( @readings_off ) { - my $npos = $n->position; - my $state = undef; - $state = $self->lemmata->{$npos->reference} - if defined $self->lemmata->{$npos->reference}; - if( $state && $state eq $n->name ) { - # this reading is still on, so turn it off - push( @readings_delemmatized, $n ); - my $new_state = undef; - if( $npos->fixed && $n eq $reading ) { - # This is the reading that was clicked, so if there are no - # other readings there and this is a fixed position, turn off - # the position. In all other cases, restore the ellipsis. - my @other_n = $self->same_position_as( $n ); # TODO do we need strict? - $new_state = 0 unless @other_n; - } - $self->lemmata->{$npos->reference} = $new_state; - } elsif( $old_state && $old_state eq $n->name ) { - # another reading has already been turned on here - push( @readings_delemmatized, $n ); - } # else some other reading was on anyway, so pass. + my $npos = $n->position; + my $state = undef; + $state = $self->lemmata->{$npos->reference} + if defined $self->lemmata->{$npos->reference}; + if( $state && $state eq $n->name ) { + # this reading is still on, so turn it off + push( @readings_delemmatized, $n ); + my $new_state = undef; + if( $npos->fixed && $n eq $reading ) { + # This is the reading that was clicked, so if there are no + # other readings there and this is a fixed position, turn off + # the position. In all other cases, restore the ellipsis. + my @other_n = $self->same_position_as( $n ); # TODO do we need strict? + $new_state = 0 unless @other_n; + } + $self->lemmata->{$npos->reference} = $new_state; + } elsif( $old_state && $old_state eq $n->name ) { + # another reading has already been turned on here + push( @readings_delemmatized, $n ); + } # else some other reading was on anyway, so pass. } return @readings_delemmatized; } @@ -1147,9 +1199,9 @@ sub unique_list { sub add_hash_entry { my( $hash, $key, $entry ) = @_; if( exists $hash->{$key} ) { - push( @{$hash->{$key}}, $entry ); + push( @{$hash->{$key}}, $entry ); } else { - $hash->{$key} = [ $entry ]; + $hash->{$key} = [ $entry ]; } } diff --git a/lib/Text/Tradition/Collation/Position.pm b/lib/Text/Tradition/Collation/Position.pm index f226e2f..3fd77bd 100644 --- a/lib/Text/Tradition/Collation/Position.pm +++ b/lib/Text/Tradition/Collation/Position.pm @@ -35,18 +35,18 @@ around BUILDARGS => sub { # single argument to be parsed out into a position. my %args; if( @_ == 1 ) { - my( $common, $min, $max ) = parse_reference( $_[0] ); - %args = ( 'common' => $common, - 'min' => $min, - 'max' => $max ); + my( $common, $min, $max ) = parse_reference( $_[0] ); + %args = ( 'common' => $common, + 'min' => $min, + 'max' => $max ); } elsif ( 2 <= @_ && @_ <= 3 ) { - my( $common, $min, $max ) = @_; - $max = $min unless $max; - %args = ( 'common' => $common, - 'min' => $min, - 'max' => $max ); + my( $common, $min, $max ) = @_; + $max = $min unless $max; + %args = ( 'common' => $common, + 'min' => $min, + 'max' => $max ); } else { - %args = @_; + %args = @_; } return $class->$orig( %args ); @@ -55,19 +55,19 @@ around BUILDARGS => sub { sub BUILD { my $self = shift; if( $self->min > $self->max ) { - die "Position minimum cannot be higher than maximum"; + die "Position minimum cannot be higher than maximum"; } } sub parse_reference { my( $ref ) = @_; if( $ref =~ /^(\d+),(\d+)(\-(\d+))?$/ ) { - my( $common, $min, $max ) = ( $1, $2, $4 ); - $max = $min unless defined $max; - return( $common, $min, $max ); + my( $common, $min, $max ) = ( $1, $2, $4 ); + $max = $min unless defined $max; + return( $common, $min, $max ); } else { - warn "Bad argument $ref passed to Position constructor"; - return undef; + warn "Bad argument $ref passed to Position constructor"; + return undef; } } @@ -75,7 +75,7 @@ sub parse_reference { sub cmp_with { my( $self, $other ) = @_; return _cmp_bits( [ $self->common, $self->min, $self->max ], - [ $other->common, $other->min, $other->max ] ); + [ $other->common, $other->min, $other->max ] ); } # Class method @@ -89,9 +89,9 @@ sub str_cmp { sub _cmp_bits { my( $a, $b ) = @_; return $a->[0] <=> $b->[0] - unless $a->[0] == $b->[0]; + unless $a->[0] == $b->[0]; return $a->[1] <=> $b->[1] - unless $a->[1] == $b->[1]; + unless $a->[1] == $b->[1]; return $a->[2] <=> $b->[2]; } @@ -120,13 +120,13 @@ sub fixed { sub is_colocated { my( $self, $other, $strict ) = @_; if( $strict ) { - return $self->common == $other->common - && $self->min == $other->min - && $self->max == $other->max; + return $self->common == $other->common + && $self->min == $other->min + && $self->max == $other->max; } else { - return $self->common == $other->common - && $self->min <= $other->max - && $self->max >= $other->min; + return $self->common == $other->common + && $self->min <= $other->max + && $self->max >= $other->min; } } diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 4debb57..5c1d866 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -11,6 +11,12 @@ has 'position' => ( isa => 'Text::Tradition::Collation::Position', predicate => 'has_position', ); + +has 'rank' => ( + is => 'rw', + isa => 'Int', + predicate => 'has_rank', + ); # This contains an array of reading objects; the array is a pool, # shared by the reading objects inside the pool. When a reading is @@ -62,6 +68,33 @@ sub text { return $self->label; } +sub witnessed_by { + my( $self, $sigil, $backup ) = @_; + my @wits = $self->witnesses; + return 1 if grep { $_ eq $sigil } @wits; + if( $backup ) { + return 1 if grep { $_ eq $backup } @wits; + } + return 0; +} + +sub witnesses { + my( $self ) = @_; + my @paths = grep { $_->get_attribute( 'class' ) eq 'path' } $self->outgoing; + push( @paths, grep { $_->get_attribute( 'class' ) eq 'path' } $self->incoming ); + my %wits; + foreach my $p ( @paths ) { + if( $p->has_hidden_witnesses ) { + foreach ( @{$p->hidden_witnesses} ) { + $wits{$_} = 1; + } + } else { + $wits{$p->label} = 1; + } + } + return keys %wits; +} + sub merge_from { my( $self, $merged_node ) = @_; # Adopt the identity pool of the other node. @@ -69,7 +102,7 @@ sub merge_from { my $new_pool = _merge_array_pool( \@now_identical, $self->same_as ) if @now_identical; - # TODO Adopt the relationship attributes of the other node. + # TODO Adopt the relationship attributes and segment memberships of the other node. } ## Dealing with transposed readings. These methods are only really @@ -141,71 +174,19 @@ sub neighbor_readings { return values( %connected ); } -sub adjust_neighbor_position { - my $self = shift; - return unless $self->position->fixed; - - # TODO This is a naive and repetitive implementation and - # I don't like it. - foreach my $neighbor ( $self->neighbor_readings('forward') ) { - next unless !$neighbor->is_common && - $neighbor->position->common == $self->position->common; - if( $neighbor->position->fixed && - $neighbor->position->min == $self->position->min ) { - warn sprintf( "Readings %s and %s are at the same position!", - $neighbor->name, $self->name ); - } - next if $neighbor->position->fixed || $neighbor->position->matched; - $neighbor->position->min( $self->position->min + 1 ); - # Recurse if necessary. - $neighbor->adjust_neighbor_position() - unless $neighbor->position->fixed; - } - foreach my $neighbor ( $self->neighbor_readings('back') ) { - next unless !$neighbor->is_common && - $neighbor->position->common == $self->position->common; - if( $neighbor->position->fixed && - $neighbor->position->min == $self->position->min ) { - warn sprintf( "Readings %s and %s are at the same position!", - $neighbor->name, $self->name ); - } - next if $neighbor->position->fixed || $neighbor->position->matched; - $neighbor->position->max( $self->position->max - 1 ); - # Recurse if necessary. - $neighbor->adjust_neighbor_position() - unless $neighbor->position->fixed; +# Returns all readings related to the one we've got. +sub related_readings { + my( $self, $colocated ) = @_; + my @related; + foreach my $e ( $self->edges ) { + next unless $e->isa( 'Text::Tradition::Collation::Relationship' ); + next if $colocated && $e->type eq 'repetition'; + push( @related, $e->from eq $self ? $e->to : $e->from ); } - return; -} - -sub match_position { - my( $self, $other ) = @_; - # Adjust the position of both these nodes to be as restrictive as possible. - unless( $self->position->is_colocated( $other->position ) ) { - warn "Cannot match positions of non-colocated readings"; - return; - } - my $sp = $self->position; - my $op = $other->position; - my $newmin = $sp->min > $op->min ? $sp->min : $op->min; - my $newmax = $sp->max < $op->max ? $sp->max : $op->max; - my $newpos = Text::Tradition::Collation::Position->new( - 'common' => $sp->common, - 'min' => $newmin, - 'max' => $newmax, - 'matched' => 1, - ); - # We are setting the positions to be the same object. I don't - # think that actually matters. We may eventually want unique - # objects for each position. - $self->position( $newpos ); - $other->position( $newpos ); - $self->adjust_neighbor_position(); - $other->adjust_neighbor_position(); + return @related; } ## Keep track of which readings are unchanged across witnesses. - sub is_common { my( $self ) = shift; return $self->get_attribute( 'class' ) eq 'common'; diff --git a/lib/Text/Tradition/Collation/Relationship.pm b/lib/Text/Tradition/Collation/Relationship.pm index 4efab5e..14e00fb 100644 --- a/lib/Text/Tradition/Collation/Relationship.pm +++ b/lib/Text/Tradition/Collation/Relationship.pm @@ -59,6 +59,11 @@ has 'non_independent' => ( is => 'rw', isa => 'Bool', ); + +has 'equal_rank' => ( + is => 'rw', + isa => 'Bool', + ); sub FOREIGNBUILDARGS { my $class = shift; diff --git a/lib/Text/Tradition/Collation/Segment.pm b/lib/Text/Tradition/Collation/Segment.pm index b3a6204..ddfec9e 100644 --- a/lib/Text/Tradition/Collation/Segment.pm +++ b/lib/Text/Tradition/Collation/Segment.pm @@ -2,6 +2,7 @@ package Text::Tradition::Collation::Segment; use Moose; use MooseX::NonMoose; +use Text::Tradition::Collation::Position; extends 'Graph::Easy::Node'; @@ -15,6 +16,12 @@ has 'members' => ( required => 1, ); +has 'position' => ( + is => 'rw', + isa => 'Text::Tradition::Collation::Position', + predicate => 'has_position', +); + sub FOREIGNBUILDARGS { my $class = shift; my %args = @_; @@ -27,17 +34,63 @@ sub FOREIGNBUILDARGS { sub BUILD { my( $self, $args ) = @_; $self->set_attribute( 'class', 'segment' ); + my $ctr = 1; + foreach my $r ( @{$args->{members}} ) { + my $seg_edge = $r->parent->add_edge( $r, $self, $ctr++ ); + $seg_edge->set_attribute( 'class', 'segment' ); + } + unless ( grep { !$_->has_position } @{$args->{members}} ) { + $self->set_position; + } +} +# We use our 'members' array for the initialization, but afterward we +# go by graph edges. This ensures that merged nodes stay merged. +around 'members' => sub { + my $orig = shift; + my $self = shift; + my @members; + foreach my $sl ( sort { $a->name <=> $b->name } + grep { $_->sub_class eq 'segment' } $self->incoming ) { + push( @members, $sl->from ); + } + return \@members; +}; + +sub set_position { + my $self = shift; + my( $common, $min, $max ); + my $readings = $self->members; foreach my $r ( @{$self->members} ) { - my $seg_edge = $r->parent->add_edge( $r, $self, 'segment' ); - $seg_edge->set_attribute( 'class', 'segment' ); + if( $r->has_position ) { + if( $common && $common != $r->position->common ) { + warn "Segment adding node with position skew"; + } elsif( !$common ) { + $common = $r->position->common; + } + $min = $r->position->min unless $min && $min < $r->position->min; + $max = $r->position->max unless $max && $max > $r->position->max; + } else { + warn "Called set_position on segment which has an unpositioned reading"; + } } + $self->position( Text::Tradition::Collation::Position->new( + common => $common, min => $min, max => $max + ) ); } - -# For now, a segment has no position in the graph. Eventually it might -# have the position of its first member. -sub has_position { - return undef; +sub neighbor_readings { + my( $self, $direction ) = @_; + $direction = 'both' unless $direction; + my @answer; + if( $direction !~ /^back/ ) { + # We want forward readings. + push( @answer, $self->members->[0]->neighbor_readings( 'forward' ) ); + } + if( $direction ne 'forward' ) { + # We want backward readings. + push( @answer, $self->members->[0]->neighbor_readings( 'back' ) ); + } + return @answer; } no Moose; diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index eedaed9..a8ea38f 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -3,7 +3,7 @@ package Text::Tradition::Parser::BaseText; use strict; use warnings; use Module::Load; -use Algorithm::Diff; +use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry ); =head1 NAME @@ -90,253 +90,270 @@ sub merge_base { my %all_witnesses; my @unwitnessed_lemma_nodes; foreach my $app ( @app_entries ) { - my( $line, $num ) = split( /\./, $app->{_id} ); - # DEBUG with a short graph - last if $SHORTEND && $line > $SHORTEND; - # DEBUG for problematic entries - my $scrutinize = ''; - my $first_line_reading = $base_line_starts[ $line ]; - my $too_far = $base_line_starts[ $line+1 ]; - - my $lemma = $app->{rdg_0}; - my $seq = 1; - # Is this the Nth occurrence of this reading in the line? - if( $lemma =~ s/(_)?(\d)$// ) { - $seq = $2; - } - my @lemma_words = split( /\s+/, $lemma ); - - # Now search for the lemma words within this line. - my $lemma_start = $first_line_reading; - my $lemma_end; - my %seen; - while( $lemma_start ne $too_far ) { - # Loop detection - if( $seen{ $lemma_start->name() } ) { - warn "Detected loop at " . $lemma_start->name() . - ", ref $line,$num"; - last; - } - $seen{ $lemma_start->name() } = 1; - - # Try to match the lemma. - my $unmatch = 0; - print STDERR "Matching " . cmp_str( $lemma_start) . " against " . - $lemma_words[0] . "...\n" - if "$line.$num" eq $scrutinize; - if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { - # Skip it if we need a match that is not the first. - if( --$seq < 1 ) { - # Now we have to compare the rest of the words here. - if( scalar( @lemma_words ) > 1 ) { - my $next_reading = - $collation->next_reading( $lemma_start ); - foreach my $w ( @lemma_words[1..$#lemma_words] ) { - printf STDERR "Now matching %s against %s\n", - cmp_str($next_reading), $w - if "$line.$num" eq $scrutinize; - if( $w ne cmp_str($next_reading) ) { - $unmatch = 1; - last; - } else { - $lemma_end = $next_reading; - $next_reading = - $collation->next_reading( $lemma_end ); - } - } - } else { - $lemma_end = $lemma_start; - } - } else { - $unmatch = 1; - } - } - last unless ( $unmatch || !defined( $lemma_end ) ); - $lemma_end = undef; - $lemma_start = $collation->next_reading( $lemma_start ); - } - - unless( $lemma_end ) { - warn "No match found for @lemma_words at $line.$num"; - next; - } - - # Now we have found the lemma; we will record an 'edit', in - # terms of a splice operation, for each subsequent reading. - # We also note which witnesses take the given edit. - - my @lemma_set = $collation->reading_sequence( $lemma_start, - $lemma_end ); - my @reading_sets = [ @lemma_set ]; - - # For each reading that is not rdg_0, we create the variant - # reading nodes, and store the range as an edit operation on - # the base text. - my $variant_objects; - my %pc_seen; # Keep track of mss with explicit post-corr data - foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { - my @mss = grep { $app->{$_} eq $k } keys( %$app ); - - # Keep track of lemma nodes that don't actually appear in - # any MSS; we will want to remove them from the collation. - push( @unwitnessed_lemma_nodes, @lemma_set ) - if !@mss && $k eq 'rdg_0'; - - # Keep track of what witnesses we have seen. - @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); - # Keep track of which witnesses bear corrected readings here. - foreach my $m ( @mss ) { - my $base = _is_post_corr( $m ); - next unless $base; - $pc_seen{$base} = 1; - } - next if $k eq 'rdg_0'; - - # Parse the variant into reading tokens. - # TODO don't hardcode the reading split operation - my @variant = split( /\s+/, $app->{$k} ); - @variant = () if $app->{$k} eq '/'; # This is an omission. - - my @variant_readings; - my $ctr = 0; - foreach my $vw ( @variant ) { - my $vwname = "$k/$line.$num.$ctr"; $ctr++; - my $vwreading = $collation->add_reading( $vwname ); - $vwreading->text( $vw ); - push( @variant_readings, $vwreading ); - } - - $variant_objects->{$k} = { 'mss' => \@mss, - 'reading' => \@variant_readings, - }; - push( @reading_sets, \@variant_readings ); - } - - # Now collate and collapse the identical readings within the - # collated sets. Modifies the reading sets that were passed. - collate_variants( $collation, @reading_sets ); - - # Record any stated relationships between the nodes and the lemma. - set_relationships( $collation, $app, \@lemma_set, $variant_objects ); - - # Now create the splice-edit objects that will be used - # to reconstruct each witness. - - foreach my $rkey ( keys %$variant_objects ) { - # Object is argument list for splice, so: - # offset, length, replacements - my $edit_object = [ $lemma_start->name, - scalar( @lemma_set ), - $variant_objects->{$rkey}->{reading} ]; - foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) { - # Is this a p.c. entry? - my $base = _is_post_corr( $ms ); - if( $base ) { # this is a post-corr witness - my $pc_key = $base . "_post"; - _add_hash_entry( $edits_required, $pc_key, $edit_object ); - } else { # this is an ante-corr witness - my $pc_key = $ms . "_post"; - _add_hash_entry( $edits_required, $ms, $edit_object ); - unless( $pc_seen{$ms} ) { - # If this witness carries no correction, add this - # same object to its post-corrected state. - _add_hash_entry( $edits_required, $pc_key, - $edit_object ); - } - } - } - } + my( $line, $num ) = split( /\./, $app->{_id} ); + # DEBUG with a short graph + last if $SHORTEND && $line > $SHORTEND; + # DEBUG for problematic entries + my $scrutinize = ''; + my $first_line_reading = $base_line_starts[ $line ]; + my $too_far = $base_line_starts[ $line+1 ]; + + my $lemma = $app->{rdg_0}; + my $seq = 1; + # Is this the Nth occurrence of this reading in the line? + if( $lemma =~ s/(_)?(\d)$// ) { + $seq = $2; + } + my @lemma_words = split( /\s+/, $lemma ); + + # Now search for the lemma words within this line. + my $lemma_start = $first_line_reading; + my $lemma_end; + my %seen; + while( $lemma_start ne $too_far ) { + # Loop detection + if( $seen{ $lemma_start->name() } ) { + warn "Detected loop at " . $lemma_start->name() . + ", ref $line,$num"; + last; + } + $seen{ $lemma_start->name() } = 1; + + # Try to match the lemma. + my $unmatch = 0; + print STDERR "Matching " . cmp_str( $lemma_start) . " against " . + $lemma_words[0] . "...\n" + if "$line.$num" eq $scrutinize; + if( cmp_str( $lemma_start ) eq $lemma_words[0] ) { + # Skip it if we need a match that is not the first. + if( --$seq < 1 ) { + # Now we have to compare the rest of the words here. + if( scalar( @lemma_words ) > 1 ) { + my $next_reading = + $collation->next_reading( $lemma_start ); + foreach my $w ( @lemma_words[1..$#lemma_words] ) { + printf STDERR "Now matching %s against %s\n", + cmp_str($next_reading), $w + if "$line.$num" eq $scrutinize; + if( $w ne cmp_str($next_reading) ) { + $unmatch = 1; + last; + } else { + $lemma_end = $next_reading; + $next_reading = + $collation->next_reading( $lemma_end ); + } + } + } else { + $lemma_end = $lemma_start; + } + } else { + $unmatch = 1; + } + } + last unless ( $unmatch || !defined( $lemma_end ) ); + $lemma_end = undef; + $lemma_start = $collation->next_reading( $lemma_start ); + } + + unless( $lemma_end ) { + warn "No match found for @lemma_words at $line.$num"; + next; + } + + # Now we have found the lemma; we will record an 'edit', in + # terms of a splice operation, for each subsequent reading. + # We also note which witnesses take the given edit. + + my @lemma_set = $collation->reading_sequence( $lemma_start, + $lemma_end ); + my @reading_sets = [ @lemma_set ]; + + # For each reading that is not rdg_0, we create the variant + # reading nodes, and store the range as an edit operation on + # the base text. + my $variant_objects; + my %pc_seen; # Keep track of mss with explicit post-corr data + foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { + my @mss = grep { $app->{$_} eq $k } keys( %$app ); + + # Keep track of lemma nodes that don't actually appear in + # any MSS; we will want to remove them from the collation. + push( @unwitnessed_lemma_nodes, @lemma_set ) + if !@mss && $k eq 'rdg_0'; + + # Keep track of what witnesses we have seen. + @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); + # Keep track of which witnesses bear corrected readings here. + foreach my $m ( @mss ) { + my $base = _is_post_corr( $m ); + next unless $base; + $pc_seen{$base} = 1; + } + next if $k eq 'rdg_0'; + + # Parse the variant into reading tokens. + # TODO don't hardcode the reading split operation + my @variant = split( /\s+/, $app->{$k} ); + @variant = () if $app->{$k} eq '/'; # This is an omission. + + my @variant_readings; + my $ctr = 0; + foreach my $vw ( @variant ) { + my $vwname = "$k/$line.$num.$ctr"; $ctr++; + my $vwreading = $collation->add_reading( $vwname ); + $vwreading->text( $vw ); + push( @variant_readings, $vwreading ); + } + + $variant_objects->{$k} = { 'mss' => \@mss, + 'reading' => \@variant_readings, + }; + push( @reading_sets, \@variant_readings ); + } + + # Now collate and collapse the identical readings within the + # collated sets. Modifies the reading sets that were passed. + collate_variants( $collation, @reading_sets ); + + # Record any stated relationships between the nodes and the lemma. + set_relationships( $collation, $app, \@lemma_set, $variant_objects ); + + # Now create the splice-edit objects that will be used + # to reconstruct each witness. + + foreach my $rkey ( keys %$variant_objects ) { + # Object is argument list for splice, so: + # offset, length, replacements + my $edit_object = [ $lemma_start->name, + scalar( @lemma_set ), + $variant_objects->{$rkey}->{reading} ]; + foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) { + # Is this a p.c. entry? + my $base = _is_post_corr( $ms ); + if( $base ) { # this is a post-corr witness + my $pc_key = $base . "_post"; + add_hash_entry( $edits_required, $pc_key, $edit_object ); + } else { # this is an ante-corr witness + my $pc_key = $ms . "_post"; + add_hash_entry( $edits_required, $ms, $edit_object ); + unless( $pc_seen{$ms} ) { + # If this witness carries no correction, add this + # same object to its post-corrected state. + add_hash_entry( $edits_required, $pc_key, + $edit_object ); + } + } + } + } } # Finished going through the apparatus entries # Now make the witness objects, and create their text sequences foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) { - print STDERR "Creating witness $w\n"; - my $witness_obj = $collation->tradition->add_witness( sigil => $w ); - my $debug; # = $w eq 'Vb11'; - my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug ); - my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug ) - if exists( $edits_required->{$w."_post"} ); - - my @repeated = _check_for_repeated( @ante_corr_seq ); - warn "Repeated elements @repeated in $w a.c." - if @repeated; - @repeated = _check_for_repeated( @post_corr_seq ); - warn "Repeated elements @repeated in $w p.c." - if @repeated; - - # Now save these paths in my witness object - if( @post_corr_seq ) { - $witness_obj->path( \@post_corr_seq ); - $witness_obj->uncorrected_path( \@ante_corr_seq ); - } else { - $witness_obj->path( \@ante_corr_seq ); - } + print STDERR "Creating witness $w\n"; + my $witness_obj = $collation->tradition->add_witness( sigil => $w ); + my $debug; # = $w eq 'Vb11'; + my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug ); + my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug ) + if exists( $edits_required->{$w."_post"} ); + + my @repeated = check_for_repeated( @ante_corr_seq ); + warn "Repeated elements @repeated in $w a.c." + if @repeated; + @repeated = check_for_repeated( @post_corr_seq ); + warn "Repeated elements @repeated in $w p.c." + if @repeated; + + # Now save these paths in my witness object + if( @post_corr_seq ) { + $witness_obj->path( \@post_corr_seq ); + $witness_obj->uncorrected_path( \@ante_corr_seq ); + } else { + $witness_obj->path( \@ante_corr_seq ); + } } # Now remove our 'base text' edges, which is to say, the only # ones we have created so far. Also remove any unwitnessed # lemma nodes (TODO unless we are treating base as witness) foreach ( $collation->paths() ) { - $collation->del_path( $_ ); + $collation->del_path( $_ ); } foreach( @unwitnessed_lemma_nodes ) { - $collation->del_reading( $_ ); + $collation->del_reading( $_ ); + # TODO do we need to delete any relationship paths here? } ### HACKY HACKY Do some one-off path corrections here. if( $collation->linear ) { - my $c = $collation; - my $end = $SHORTEND ? $SHORTEND : 155; - my $path = $c->tradition->witness('Vb11')->path; - if( $end > 16 ) { - $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); - splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); - } - # What else? + my $c = $collation; + my $end = $SHORTEND ? $SHORTEND : 155; + # Vb11 + my $path; + if( $end > 16 ) { + $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); + $path = $c->tradition->witness('Vb11')->path; + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); + $path = $c->tradition->witness('Vb11')->uncorrected_path; + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); + } + # What else? + # Vb26: + $path = $c->tradition->witness('Vb26')->path; + splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46; + # Vb13: + $path = $c->tradition->witness('Vb13')->path; + splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; + $path = $c->tradition->witness('Vb13')->uncorrected_path; + splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58; + # Vb20 a.c.: + $path = $c->tradition->witness('Vb20')->uncorrected_path; + splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94; + # Vb5: + $path = $c->tradition->witness('Vb5')->path; + splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106; + # extraneous: + $c->del_reading( 'rdg_2/147.6.13' ); + $c->del_reading( 'rdg_2/147.6.14' ); + $c->del_reading( 'rdg_2/147.6.15' ); + } else { - my $c = $collation; - my $end = $SHORTEND ? $SHORTEND : 155; - # Vb5: - my $path = $c->tradition->witness('Vb5')->path; - splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106; - # Vb11: - $path = $c->tradition->witness('Vb11')->path; - if( $end > 16 ) { - $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); - splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) ); - } - # Vb12 a.c.: - $path = $c->tradition->witness('Vb12')->uncorrected_path; - splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137; - # Vb13: - $path = $c->tradition->witness('Vb13')->path; - splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; - # Vb20 a.c.: - $path = $c->tradition->witness('Vb20')->uncorrected_path; - splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94; - # Vb26: - $path = $c->tradition->witness('Vb26')->path; - splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46; + my $c = $collation; + my $end = $SHORTEND ? $SHORTEND : 155; + # Vb5: + my $path = $c->tradition->witness('Vb5')->path; + splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106; + # Vb11: + $path = $c->tradition->witness('Vb11')->path; + if( $end > 16 ) { + $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); + splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) ); + } + # Vb13: + $path = $c->tradition->witness('Vb13')->path; + splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; + $path = $c->tradition->witness('Vb13')->uncorrected_path; + splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58; + # Vb20 a.c.: + $path = $c->tradition->witness('Vb20')->uncorrected_path; + splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94; + # Vb26: + $path = $c->tradition->witness('Vb26')->path; + splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46; } - # Now walk paths and calculate positions. - my @common_readings = - $collation->make_witness_paths(); - $collation->calculate_positions( @common_readings ); -} - -sub _check_for_repeated { - my @seq = @_; - my %unique; - my @repeated; - foreach ( @seq ) { - if( exists $unique{$_->name} ) { - push( @repeated, $_->name ); - } else { - $unique{$_->name} = 1; - } - } - return @repeated; + # Now walk paths and calculate positional rank. + my @common_readings = $collation->make_witness_paths(); + # Have to check relationship validity at this point, because before that + # we had no paths. +# foreach my $rel ( $collation->relationships ) { +# next unless $rel->equal_rank; +# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) { +# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting", +# $rel->type, $rel->from->name, $rel->to->name ); +# } +# } + $collation->calculate_ranks(); } =item B @@ -363,308 +380,174 @@ sub read_base { open( BASE, $base_file ) or die "Could not open file $base_file: $!"; my $i = 1; while() { - # Make the readings, and connect them up for the base, but - # also save the first reading of each line in an array for the - # purpose. - # TODO use configurable reading separator - chomp; - my @words = split; - my $started = 0; - my $wordref = 0; - my $lineref = scalar @$lineref_array; - last if $SHORTEND && $lineref > $SHORTEND; - foreach my $w ( @words ) { - my $readingref = join( ',', $lineref, ++$wordref ); - my $reading = $collation->add_reading( $readingref ); - $reading->text( $w ); - unless( $started ) { - push( @$lineref_array, $reading ); - $started = 1; - } - # Add edge paths in the graph, for easier tracking when - # we start applying corrections. These paths will be - # removed when we're done. - my $path = $collation->add_path( $last_reading, $reading, - $collation->baselabel ); - $last_reading = $reading; - - # Note an array index for the reading, for later correction splices. - $base_text_index{$readingref} = $i++; - } + # Make the readings, and connect them up for the base, but + # also save the first reading of each line in an array for the + # purpose. + # TODO use configurable reading separator + chomp; + my @words = split; + my $started = 0; + my $wordref = 0; + my $lineref = scalar @$lineref_array; + last if $SHORTEND && $lineref > $SHORTEND; + foreach my $w ( @words ) { + my $readingref = join( ',', $lineref, ++$wordref ); + my $reading = $collation->add_reading( $readingref ); + $reading->text( $w ); + unless( $started ) { + push( @$lineref_array, $reading ); + $started = 1; + } + # Add edge paths in the graph, for easier tracking when + # we start applying corrections. These paths will be + # removed when we're done. + my $path = $collation->add_path( $last_reading, $reading, + $collation->baselabel ); + $last_reading = $reading; + + # Note an array index for the reading, for later correction splices. + $base_text_index{$readingref} = $i++; + } } close BASE; # Ending point for all texts - my $endpoint = $collation->add_reading( '#END#' ); - $collation->add_path( $last_reading, $endpoint, $collation->baselabel ); - push( @$lineref_array, $endpoint ); - $base_text_index{$endpoint->name} = $i; + $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); + push( @$lineref_array, $collation->end ); + $base_text_index{$collation->end->name} = $i; return( @$lineref_array ); } -=item B - -collate_variants( $collation, @reading_ranges ) - -Given a set of readings in the form -( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) -walks through each to identify those readings that are identical. The -collation is a Text::Tradition::Collation object; the elements of -@readings are Text::Tradition::Collation::Reading objects that appear -on the collation graph. - -TODO: Handle collapsed and non-collapsed transpositions. - -=cut - -sub collate_variants { - my( $collation, @reading_sets ) = @_; - - # Two different ways to do this, depending on whether we want - # transposed reading nodes to be merged into one (producing a - # nonlinear, bidirectional graph) or not (producing a relatively - # linear, unidirectional graph.) - return $collation->linear ? collate_linearly( @_ ) - : collate_nonlinearly( @_ ); -} - -sub collate_linearly { - my( $collation, $lemma_set, @variant_sets ) = @_; - - my @unique; - push( @unique, @$lemma_set ); - while( @variant_sets ) { - my $variant_set = shift @variant_sets; - # Use diff to do this job - my $diff = Algorithm::Diff->new( \@unique, $variant_set, - {'keyGen' => \&_collation_hash} ); - my @new_unique; - my %merged; - while( $diff->Next ) { - if( $diff->Same ) { - # merge the nodes - my @l = $diff->Items( 1 ); - my @v = $diff->Items( 2 ); - foreach my $i ( 0 .. $#l ) { - if( !$merged{$l[$i]->name} ) { - $collation->merge_readings( $l[$i], $v[$i] ); - $merged{$l[$i]->name} = 1; - } else { - print STDERR "Would have double merged " . $l[$i]->name . "\n"; - } - } - # splice the lemma nodes into the variant set - my( $offset ) = $diff->Get( 'min2' ); - splice( @$variant_set, $offset, scalar( @l ), @l ); - push( @new_unique, @l ); - } else { - # Keep the old unique readings - push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 ); - # Add the new readings to the 'unique' list - push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 ); - } - } - @unique = @new_unique; - } -} - -sub collate_nonlinearly { - my( $collation, $lemma_set, @variant_sets ) = @_; - - my @unique; - push( @unique, @$lemma_set ); - while( @variant_sets ) { - my $variant_set = shift @variant_sets; - # Simply match the first reading that carries the same word, so - # long as that reading has not yet been used to match another - # word in this variant. That way lies loopy madness. - my @distinct; - my %merged; - foreach my $idx ( 0 .. $#{$variant_set} ) { - my $vw = $variant_set->[$idx]; - my @same = grep { cmp_str( $_ ) eq $vw->label } @unique; - my $matched; - if( @same ) { - foreach my $i ( 0 .. $#same ) { - unless( $merged{$same[$i]->name} ) { - #print STDERR sprintf( "Merging %s into %s\n", - # $vw->name, - # $same[$i]->name ); - $collation->merge_readings( $same[$i], $vw ); - $merged{$same[$i]->name} = 1; - $matched = $i; - $variant_set->[$idx] = $same[$i]; - } - } - } - unless( @same && defined($matched) ) { - push( @distinct, $vw ); - } - } - push( @unique, @distinct ); - } -} - - - -sub _collation_hash { - my $node = shift; - return cmp_str( $node ); -} - sub set_relationships { my( $collation, $app, $lemma, $variants ) = @_; foreach my $rkey ( keys %$variants ) { - my $var = $variants->{$rkey}->{'reading'}; - my $type = $app->{sprintf( "_%s_type", $rkey )}; - my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )}; - my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )}; - - my %rel_options = (); - $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/; - $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/; - - if( $type =~ /^(inv|tr|rep)$/i ) { - # Transposition or repetition: look for nodes with the - # same label but different IDs and mark them. - $type = 'repetition' if $type =~ /^rep/i; - $rel_options{'type'} = $type; - my %labels; - foreach my $r ( @$lemma ) { - $labels{cmp_str( $r )} = $r; - } - foreach my $r( @$var ) { - if( exists $labels{$r->label} && - $r->name ne $labels{$r->label}->name ) { - if( $type eq 'repetition' ) { - # Repetition - $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); - } else { - # Transposition - $r->set_identical( $labels{$r->label} ); - } - } - } - } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) { - - # Grammar/spelling/lexical: this can be a one-to-one or - # one-to-many mapping. We should think about merging - # readings if it is one-to-many. - - $type = 'grammatical' if $type =~ /gr/i; - $type = 'spelling' if $type =~ /sp/i; - $type = 'repetition' if $type =~ /rep/i; - $type = 'lexical' if $type =~ /lex/i; - $rel_options{'type'} = $type; - if( @$lemma == @$var ) { - foreach my $i ( 0 .. $#{$lemma} ) { - $collation->add_relationship( $var->[$i], $lemma->[$i], - \%rel_options ); - } - } else { - # An uneven many-to-many mapping. Make a segment out of - # whatever we have. - my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0]; - my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0]; - $collation->add_relationship( $varseg, $lemseg, \%rel_options ); - } - } elsif( $type !~ /^(add|om)$/i ) { - warn "Unrecognized type $type"; - } + my $var = $variants->{$rkey}->{'reading'}; + my $type = $app->{sprintf( "_%s_type", $rkey )}; + my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )}; + my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )}; + + my %rel_options = (); + $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/; + $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/; + + if( $type =~ /^(inv|tr|rep)$/i ) { + # Transposition or repetition: look for nodes with the + # same label but different IDs and mark them. + $type = 'repetition' if $type =~ /^rep/i; + $rel_options{'type'} = $type; + $rel_options{'equal_rank'} = undef; + my %labels; + foreach my $r ( @$lemma ) { + $labels{cmp_str( $r )} = $r; + } + foreach my $r( @$var ) { + if( exists $labels{$r->label} && + $r->name ne $labels{$r->label}->name ) { + if( $type eq 'repetition' ) { + # Repetition + $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); + } else { + # Transposition + $r->set_identical( $labels{$r->label} ); + } + } + } + } elsif( $type =~ /^(gr|sp(el)?)$/i ) { + + # Grammar/spelling/lexical: this can be a one-to-one or + # one-to-many mapping. We should think about merging + # readings if it is one-to-many. + + $type = 'grammatical' if $type =~ /gr/i; + $type = 'spelling' if $type =~ /sp/i; + $type = 'repetition' if $type =~ /rep/i; + # $type = 'lexical' if $type =~ /lex/i; + $rel_options{'type'} = $type; + $rel_options{'equal_rank'} = 1; + if( @$lemma == @$var ) { + foreach my $i ( 0 .. $#{$lemma} ) { + $collation->add_relationship( $var->[$i], $lemma->[$i], + \%rel_options ); + } + } else { + # An uneven many-to-many mapping. Skip for now. + # We really want to make a segment out of whatever we have. + # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0]; + # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0]; + # $collation->add_relationship( $varseg, $lemseg, \%rel_options ); + if( @$lemma == 1 && @$var == 1 ) { + $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options ); + } + } + } elsif( $type !~ /^(add|om|lex)$/i ) { + warn "Unrecognized type $type"; + } } } - + sub apply_edits { my( $collation, $edit_sequence, $debug ) = @_; my @lemma_text = $collation->reading_sequence( $collation->start, - $collation->reading( '#END#' ) ); + $collation->reading( '#END#' ) ); my $drift = 0; foreach my $correction ( @$edit_sequence ) { - my( $lemma_start, $length, $items ) = @$correction; - my $offset = $base_text_index{$lemma_start}; - my $realoffset = $offset + $drift; - if( $debug || - $lemma_text[$realoffset]->name ne $lemma_start ) { - my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; - my @base_phrase; - my $i = $realoffset; - my $l = $collation->reading( $lemma_start ); - while( $i < $realoffset+$length ) { - push( @base_phrase, $l ); - $l = $collation->next_reading( $l ); - $i++; - } - - print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . - "with %s (%s) with drift %d\n", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), - $realoffset, - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), - $drift, - ) if $debug; - - if( $lemma_text[$realoffset]->name ne $lemma_start ) { - warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . - "but %s (%s) is there instead", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), - join( ' ', map {$_->label} @this_phrase ), - join( ' ', map {$_->name} @this_phrase ), - ) ); - # next; - } - } - splice( @lemma_text, $realoffset, $length, @$items ); - $drift += @$items - $length; + my( $lemma_start, $length, $items ) = @$correction; + my $offset = $base_text_index{$lemma_start}; + my $realoffset = $offset + $drift; + if( $debug || + $lemma_text[$realoffset]->name ne $lemma_start ) { + my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; + my @base_phrase; + my $i = $realoffset; + my $l = $collation->reading( $lemma_start ); + while( $i < $realoffset+$length ) { + push( @base_phrase, $l ); + $l = $collation->next_reading( $l ); + $i++; + } + + print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . + "with %s (%s) with drift %d\n", + join( ' ', map {$_->label} @base_phrase ), + join( ' ', map {$_->name} @base_phrase ), + $realoffset, + join( ' ', map {$_->label} @$items ), + join( ' ', map {$_->name} @$items ), + $drift, + ) if $debug; + + if( $lemma_text[$realoffset]->name ne $lemma_start ) { + warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . + "but %s (%s) is there instead", + join( ' ', map {$_->label} @base_phrase ), + join( ' ', map {$_->name} @base_phrase ), + join( ' ', map {$_->label} @$items ), + join( ' ', map {$_->name} @$items ), + join( ' ', map {$_->label} @this_phrase ), + join( ' ', map {$_->name} @this_phrase ), + ) ); + # next; + } + } + splice( @lemma_text, $realoffset, $length, @$items ); + $drift += @$items - $length; } return @lemma_text; } - + # Helper function. Given a witness sigil, if it is a post-correctione # sigil,return the base witness. If not, return a false value. sub _is_post_corr { my( $sigil ) = @_; if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) { - return $1; + return $1; } return undef; } -sub _add_hash_entry { - my( $hash, $key, $entry ) = @_; - if( exists $hash->{$key} ) { - push( @{$hash->{$key}}, $entry ); - } else { - $hash->{$key} = [ $entry ]; - } -} - - -=item B - -Pretend you never saw this method. Really it needs to not be hardcoded. - -=cut - -sub cmp_str { - my( $reading ) = @_; - my $word = $reading->label(); - $word = lc( $word ); - $word =~ s/\W//g; - $word =~ s/v/u/g; - $word =~ s/j/i/g; - $word =~ s/cha/ca/g; - $word =~ s/quatuor/quattuor/g; - $word =~ s/ioannes/iohannes/g; - return $word; -} =back diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 7775a75..63a5635 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -43,80 +43,81 @@ sub parse { # Add the nodes to the graph. First delete the start node, because # GraphML graphs will have their own start nodes. $collation->del_reading( $collation->start() ); + $collation->del_reading( $collation->end() ); my $extra_data = {}; # Keep track of info to be processed after all # nodes have been created foreach my $n ( @{$graph_data->{'nodes'}} ) { - my %node_data = %$n; - my $nodeid = delete $node_data{$IDKEY}; - my $token = delete $node_data{$CONTENTKEY}; - unless( defined $nodeid && defined $token ) { - $DB::single = 1; - warn "Did not find an ID or token for graph node, can't add it"; - next; - } - my $gnode = $collation->add_reading( $nodeid ); - $gnode->text( $token ); - - # Whatever is left is extra info to be processed later. - if( keys %node_data ) { - $extra_data->{$nodeid} = \%node_data; - } + my %node_data = %$n; + my $nodeid = delete $node_data{$IDKEY}; + my $token = delete $node_data{$CONTENTKEY}; + unless( defined $nodeid && defined $token ) { + warn "Did not find an ID or token for graph node, can't add it"; + next; + } + my $gnode = $collation->add_reading( $nodeid ); + $gnode->text( $token ); + + # Whatever is left is extra info to be processed later. + if( keys %node_data ) { + $extra_data->{$nodeid} = \%node_data; + } } - + # Now add the edges. foreach my $e ( @{$graph_data->{'edges'}} ) { - my %edge_data = %$e; - my $from = delete $edge_data{'source'}; - my $to = delete $edge_data{'target'}; - - # In CollateX, we have a distinct witness data ID per witness, - # so that we can have multiple witnesses per edge. We want to - # translate this to one witness per edge in our own - # representation. - foreach my $ekey ( keys %edge_data ) { - my $wit = $edge_data{$ekey}; - # Create the witness object if it does not yet exist. - unless( $witnesses{$wit} ) { - $tradition->add_witness( 'sigil' => $wit ); - $witnesses{$wit} = 1; - } - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); - } + my %edge_data = %$e; + my $from = delete $edge_data{'source'}; + my $to = delete $edge_data{'target'}; + + # In CollateX, we have a distinct witness data ID per witness, + # so that we can have multiple witnesses per edge. We want to + # translate this to one witness per edge in our own + # representation. + foreach my $ekey ( keys %edge_data ) { + my $wit = $edge_data{$ekey}; + # Create the witness object if it does not yet exist. + unless( $witnesses{$wit} ) { + $tradition->add_witness( 'sigil' => $wit ); + $witnesses{$wit} = 1; + } + $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit ); + } } # Process the extra node data if it exists. foreach my $nodeid ( keys %$extra_data ) { - my $ed = $extra_data->{$nodeid}; - if( exists $ed->{$TRANSKEY} ) { - - my $tn_reading = $collation->reading( $nodeid ); - my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); - if( $collation->linear ) { - $tn_reading->set_identical( $main_reading ); - } else { - $collation->merge_readings( $main_reading, $tn_reading ); - } - } # else we don't have any other tags to process yet. + my $ed = $extra_data->{$nodeid}; + if( exists $ed->{$TRANSKEY} ) { + + my $tn_reading = $collation->reading( $nodeid ); + my $main_reading = $collation->reading( $ed->{$TRANSKEY} ); + if( $collation->linear ) { + $tn_reading->set_identical( $main_reading ); + } else { + $collation->merge_readings( $main_reading, $tn_reading ); + } + } # else we don't have any other tags to process yet. } # Find the beginning and end nodes of the graph. The beginning node # has no incoming edges; the end node has no outgoing edges. my( $begin_node, $end_node ); foreach my $gnode ( $collation->readings() ) { - # print STDERR "Checking node " . $gnode->name . "\n"; - my @outgoing = $gnode->outgoing(); - my @incoming = $gnode->incoming(); - - unless( scalar @incoming ) { - warn "Already have a beginning node" if $begin_node; - $begin_node = $gnode; - $collation->start( $gnode ); - } - unless( scalar @outgoing ) { - warn "Already have an ending node" if $end_node; - $end_node = $gnode; - } + # print STDERR "Checking node " . $gnode->name . "\n"; + my @outgoing = $gnode->outgoing(); + my @incoming = $gnode->incoming(); + + unless( scalar @incoming ) { + warn "Already have a beginning node" if $begin_node; + $begin_node = $gnode; + $collation->start( $gnode ); + } + unless( scalar @outgoing ) { + warn "Already have an ending node" if $end_node; + $end_node = $gnode; + $collation->end( $gnode ); + } } # Record for each witness its sequence of readings, and determine diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index 4e191a4..7807189 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -39,7 +39,7 @@ sub parse { my( $graphml_str ) = @_; my $graph_hash = { 'nodes' => [], - 'edges' => [] }; + 'edges' => [] }; my $parser = XML::LibXML->new(); my $doc = $parser->parse_string( $graphml_str ); @@ -49,18 +49,18 @@ sub parse { # First get the ID keys, for witnesses and for collation data foreach my $k ( $xpc->findnodes( '//g:key' ) ) { - # Each key has a 'for' attribute; the edge keys are witnesses, and - # the node keys contain an ID and string for each node. - my $keyid = $k->getAttribute( 'id' ); - my $keyname = $k->getAttribute( 'attr.name' ); - - if( $k->getAttribute( 'for' ) eq 'node' ) { - # Keep track of the XML identifiers for the data carried - # in each node element. - $nodedata->{$keyid} = $keyname - } else { - $witnesses->{$keyid} = $keyname; - } + # Each key has a 'for' attribute; the edge keys are witnesses, and + # the node keys contain an ID and string for each node. + my $keyid = $k->getAttribute( 'id' ); + my $keyname = $k->getAttribute( 'attr.name' ); + + if( $k->getAttribute( 'for' ) eq 'node' ) { + # Keep track of the XML identifiers for the data carried + # in each node element. + $nodedata->{$keyid} = $keyname + } else { + $witnesses->{$keyid} = $keyname; + } } my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0]; @@ -70,36 +70,36 @@ sub parse { # Add the nodes to the graph hash. my @nodes = $xpc->findnodes( '//g:node' ); foreach my $n ( @nodes ) { - # Could use a better way of registering these - my $node_hash = {}; - foreach my $dkey ( keys %$nodedata ) { - my $keyname = $nodedata->{$dkey}; - my $keyvalue = _lookup_node_data( $n, $dkey ); - $node_hash->{$keyname} = $keyvalue if defined $keyvalue; - } - $node_reg->{$n->getAttribute( 'id' )} = $node_hash; - push( @{$graph_hash->{'nodes'}}, $node_hash ); + # Could use a better way of registering these + my $node_hash = {}; + foreach my $dkey ( keys %$nodedata ) { + my $keyname = $nodedata->{$dkey}; + my $keyvalue = _lookup_node_data( $n, $dkey ); + $node_hash->{$keyname} = $keyvalue if defined $keyvalue; + } + $node_reg->{$n->getAttribute( 'id' )} = $node_hash; + push( @{$graph_hash->{'nodes'}}, $node_hash ); } - + # Now add the edges, and cross-ref with the node objects. my @edges = $xpc->findnodes( '//g:edge' ); foreach my $e ( @edges ) { - my $from = $e->getAttribute('source'); - my $to = $e->getAttribute('target'); - - # We don't know whether the edge data is one per witness - # or one per witness type, or something else. So we just - # save it and let our calling parser decide. - my $edge_hash = { - 'source' => $node_reg->{$from}, - 'target' => $node_reg->{$to}, - }; - foreach my $wkey( keys %$witnesses ) { - my $wname = $witnesses->{$wkey}; - my $wlabel = _lookup_node_data( $e, $wkey ); - $edge_hash->{$wname} = $wlabel if $wlabel; - } - push( @{$graph_hash->{'edges'}}, $edge_hash ); + my $from = $e->getAttribute('source'); + my $to = $e->getAttribute('target'); + + # We don't know whether the edge data is one per witness + # or one per witness type, or something else. So we just + # save it and let our calling parser decide. + my $edge_hash = { + 'source' => $node_reg->{$from}, + 'target' => $node_reg->{$to}, + }; + foreach my $wkey( keys %$witnesses ) { + my $wname = $witnesses->{$wkey}; + my $wlabel = _lookup_node_data( $e, $wkey ); + $edge_hash->{$wname} = $wlabel if $wlabel; + } + push( @{$graph_hash->{'edges'}}, $edge_hash ); } return $graph_hash; } diff --git a/lib/Text/Tradition/Parser/CSV.pm b/lib/Text/Tradition/Parser/KUL.pm similarity index 98% rename from lib/Text/Tradition/Parser/CSV.pm rename to lib/Text/Tradition/Parser/KUL.pm index 003936c..05c3a12 100644 --- a/lib/Text/Tradition/Parser/CSV.pm +++ b/lib/Text/Tradition/Parser/KUL.pm @@ -1,4 +1,4 @@ -package Text::Tradition::Parser::CSV; +package Text::Tradition::Parser::KUL; use strict; use warnings; diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 6baca63..8c5c391 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -32,74 +32,74 @@ my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY, $CLASS_KEY ) sub parse { my( $tradition, $graphml_str ) = @_; - $DB::single = 1; my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); my $collation = $tradition->collation; my %witnesses; # Add the nodes to the graph. + # TODO Are we adding extra start/end nodes? my $extra_data = {}; # Keep track of data that needs to be processed # after the nodes & edges are created. print STDERR "Adding graph nodes\n"; foreach my $n ( @{$graph_data->{'nodes'}} ) { - # Each node is either a segment or a reading, depending on - # its class. Readings have text, segments don't. - my %node_data = %$n; - my $nodeid = delete $node_data{$IDKEY}; - my $reading = delete $node_data{$TOKENKEY}; - my $class = $node_data{$CLASS_KEY} || ''; - # TODO this is a hack, fix it? - $class = 'reading' unless $class eq 'segment'; - my $method = $class eq 'segment' ? "add_$class" : "add_reading"; - my $gnode = $collation->$method( $nodeid ); - $gnode->label( $reading ); - $gnode->set_common if $class eq 'common'; - - # Now save the rest of the data, i.e. not the ID or label, - # if it exists. - if ( keys %node_data ) { - $extra_data->{$nodeid} = \%node_data; - } + # Each node is either a segment or a reading, depending on + # its class. Readings have text, segments don't. + my %node_data = %$n; + my $nodeid = delete $node_data{$IDKEY}; + my $reading = delete $node_data{$TOKENKEY}; + my $class = $node_data{$CLASS_KEY} || ''; + # TODO this is a hack, fix it? + $class = 'reading' unless $class eq 'segment'; + my $method = $class eq 'segment' ? "add_$class" : "add_reading"; + my $gnode = $collation->$method( $nodeid ); + $gnode->label( $reading ); + $gnode->set_common if $class eq 'common'; + + # Now save the rest of the data, i.e. not the ID or label, + # if it exists. + if ( keys %node_data ) { + $extra_data->{$nodeid} = \%node_data; + } } - + # Now add the edges. print STDERR "Adding graph edges\n"; foreach my $e ( @{$graph_data->{'edges'}} ) { - my %edge_data = %$e; - my $from = delete $edge_data{'source'}; - my $to = delete $edge_data{'target'}; - my $class = delete $edge_data{'class'}; - - # Whatever is left tells us what kind of edge it is. - foreach my $wkey ( keys %edge_data ) { - if( $wkey =~ /^witness/ ) { - unless( $class eq 'path' ) { - warn "Cannot add witness label to a $class edge"; - next; - } - my $wit = $edge_data{$wkey}; - unless( $witnesses{$wit} ) { - $tradition->add_witness( sigil => $wit ); - $witnesses{$wit} = 1; - } - my $label = $wkey eq 'witness_ante_corr' - ? $wit . $collation->ac_label : $wit; - $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); - } elsif( $wkey eq 'relationship' ) { - unless( $class eq 'relationship' ) { - warn "Cannot add relationship label to a $class edge"; - next; - } - my $rel = $edge_data{$wkey}; - # TODO handle global relationships - $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} ); - } else { - my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} ); - $seg_edge->set_attribute( 'class', 'segment' ); - } - } + my %edge_data = %$e; + my $from = delete $edge_data{'source'}; + my $to = delete $edge_data{'target'}; + my $class = delete $edge_data{'class'}; + + # Whatever is left tells us what kind of edge it is. + foreach my $wkey ( keys %edge_data ) { + if( $wkey =~ /^witness/ ) { + unless( $class eq 'path' ) { + warn "Cannot add witness label to a $class edge"; + next; + } + my $wit = $edge_data{$wkey}; + unless( $witnesses{$wit} ) { + $tradition->add_witness( sigil => $wit ); + $witnesses{$wit} = 1; + } + my $label = $wkey eq 'witness_ante_corr' + ? $wit . $collation->ac_label : $wit; + $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); + } elsif( $wkey eq 'relationship' ) { + unless( $class eq 'relationship' ) { + warn "Cannot add relationship label to a $class edge"; + next; + } + my $rel = $edge_data{$wkey}; + # TODO handle global relationships + $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} ); + } else { + my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} ); + $seg_edge->set_attribute( 'class', 'segment' ); + } + } } ## Deal with node information (transposition, relationships, etc.) that @@ -107,19 +107,19 @@ sub parse { print STDERR "Adding second-pass data\n"; my $linear = undef; foreach my $nkey ( keys %$extra_data ) { - foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { - my $this_reading = $collation->reading( $nkey ); - if( $edkey eq $TRANSPOS_KEY ) { - my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); - # We evidently have a linear graph. - $linear = 1; - $this_reading->set_identical( $other_reading ); - } elsif ( $edkey eq $POSITION_KEY ) { - $this_reading->position( $extra_data->{$nkey}->{$edkey} ); - } else { - warn "Unfamiliar reading node data $edkey for $nkey"; - } - } + foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { + my $this_reading = $collation->reading( $nkey ); + if( $edkey eq $TRANSPOS_KEY ) { + my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); + # We evidently have a linear graph. + $linear = 1; + $this_reading->set_identical( $other_reading ); + } elsif ( $edkey eq $POSITION_KEY ) { + $this_reading->position( $extra_data->{$nkey}->{$edkey} ); + } else { + warn "Unfamiliar reading node data $edkey for $nkey"; + } + } } $collation->linear( $linear ); diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 59ee42c..03b76cf 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -2,6 +2,7 @@ package Text::Tradition::Parser::TEI; use strict; use warnings; +use Text::Tradition::Parser::Util qw( collate_variants ); use XML::LibXML; use XML::LibXML::XPathContext; @@ -28,6 +29,33 @@ the appropriate witness objects. =cut +my $text = {}; # Hash of arrays, one per eventual witness we find. +my @common_readings; +my $substitutions = {}; # Keep track of merged readings +my $app_anchors = {}; # Track apparatus references +my $app_ac = {}; # Save a.c. readings + +# Create the package variables for tag names. + +# Would really like to do this with varname variables, but apparently this +# is considered a bad idea. The long way round then. +my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM ) + = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' ); +sub make_tagnames { + my( $ns ) = @_; + if( $ns ) { + $LISTWIT = "$ns:$LISTWIT"; + $WITNESS = "$ns:$WITNESS"; + $TEXT = "$ns:$TEXT"; + $W = "$ns:$W"; + $SEG = "$ns:$SEG"; + $APP = "$ns:$APP"; + $RDG = "$ns:$RDG"; + $LEM = "$ns:$LEM"; + } +} + +# Parse the TEI file. sub parse { my( $tradition, $xml_str ) = @_; @@ -36,108 +64,324 @@ sub parse { my $doc = $parser->parse_string( $xml_str ); my $tei = $doc->documentElement(); my $xpc = XML::LibXML::XPathContext->new( $tei ); - $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' ); - + my $ns; + if( $tei->namespaceURI ) { + $ns = 'tei'; + $xpc->registerNs( $ns, $tei->namespaceURI ); + } + make_tagnames( $ns ); + # Then get the witnesses and create the witness objects. - foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) { - my $sig = $wit_el->getAttribute( 'xml:id' ); - my $source = $wit_el->toString(); - $tradition->add_witness( sigil => $sig, source => $source ); + foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) { + my $sig = $wit_el->getAttribute( 'xml:id' ); + my $source = $wit_el->toString(); + $tradition->add_witness( sigil => $sig, source => $source ); } - # Now go through the text and make the tokens. - # Assume for now that each word is tokenized in the XML. - my $text = {}; - map { $text->{$_->sigil} = [] } @{$tradition->witnesses}; - my $word_ctr = 0; - my %used_word_ids; - foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) { - # If it is contained within a lem or a rdg, look at those witnesses. - # Otherwise it is common to all witnesses. - # Also common if it is the only lem/rdg within its app. - # Thus we are assuming non-nested apps. - - my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el ); - my @wits = get_sigla( $parent_rdg ); - @wits = map { $_->sigil } @{$tradition->witnesses} unless @wits; - - # Create the node - my $reading = make_reading( $tradition->collation, $word_el ); - - # Figure out if it is a common node, that is, if it is outside an apparatus - # or the only rdg in an apparatus - my $common = 1; - if( $xpc->findnodes( 'ancestor::tei:app', $word_el ) ) { - # If we are in an app we are not a common node... - $common = 0; - if( $xpc->findnodes( 'ancestor::tei:app/tei:rdg' )->size == 1 ) { - # unless we are the only reading in the app. - $common = 1; - } - } - $reading->make_common if $common; - - foreach my $sig ( @wits ) { - push( @{$text->{$sig}}, $reading ); - } + map { $text->{$_->sigil} = [] } $tradition->witnesses; + # Look for all word/seg node IDs and note their pre-existence. + my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); + save_preexisting_nodeids( @attrs ); + + # Now go through the children of the text element and pull out the + # actual text. + foreach my $xml_el ( $xpc->findnodes( "//$TEXT" ) ) { + foreach my $xn ( $xml_el->childNodes ) { + _get_readings( $tradition, $xn ); + } + } + # Our $text global now has lists of readings, one per witness. + # Join them up. + my $c = $tradition->collation; + foreach my $sig ( keys %$text ) { + next if $sig eq 'base'; # Skip base text readings with no witnesses. + # Determine the list of readings for + my $sequence = $text->{$sig}; + my @real_sequence = ( $c->start ); + push( @$sequence, $c->end ); + my $source = $c->start; + foreach( _clean_sequence( $sig, $sequence ) ) { + my $rdg = _return_rdg( $_ ); + push( @real_sequence, $rdg ); + $c->add_path( $source, $rdg, $sig ); + $source = $rdg; + } + $tradition->witness( $sig )->path( \@real_sequence ); + # See if we need to make an a.c. version of the witness. + if( exists $app_ac->{$sig} ) { + my @uncorrected; + push( @uncorrected, @real_sequence ); + foreach my $app ( keys %{$app_ac->{$sig}} ) { + my $start = _return_rdg( $app_anchors->{$app}->{$sig}->{'start'} ); + my $end = _return_rdg( $app_anchors->{$app}->{$sig}->{'end'} ); + my @new = map { _return_rdg( $_ ) } @{$app_ac->{$sig}->{$app}}; + _replace_sequence( \@uncorrected, $start, $end, @new ); + } + my $source = $c->start; + foreach my $rdg ( @uncorrected ) { + my $has_base = grep { $_->label eq $sig } $source->edges_to( $rdg ); + if( $rdg ne $c->start && !$has_base ) { + print STDERR sprintf( "Adding path %s from %s -> %s\n", + $sig.$c->ac_label, $source->name, $rdg->name ); + $c->add_path( $source, $rdg, $sig.$c->ac_label ); + } + $source = $rdg; + } + $tradition->witness( $sig )->uncorrected_path( \@uncorrected ); + } + } + # Delete readings that are no longer part of the graph. + # TODO think this is useless actually + foreach ( keys %$substitutions ) { + $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); + } + $tradition->collation->calculate_positions( @common_readings ); +} + +sub _clean_sequence { + my( $wit, $sequence ) = @_; + my @clean_sequence; + foreach my $rdg ( @$sequence ) { + if( $rdg =~ /^PH-(.*)$/ ) { + # It is a placeholder. Keep it only if we need it. + my $app_id = $1; + if( exists $app_ac->{$wit}->{$app_id} ) { + print STDERR "Retaining empty placeholder for $app_id\n"; + push( @clean_sequence, $rdg ); + } + } else { + push( @clean_sequence, $rdg ); + } } + return @clean_sequence; +} - $DB::single = 1; - # Now we have the text paths through the witnesses, so we can make - # the edges. - my $end = $tradition->collation->add_reading( '#END#' ); - foreach my $sigil ( keys %$text ) { - my @nodes = @{$text->{$sigil}}; - my $source = $tradition->collation->start; - foreach my $n ( @nodes ) { - # print STDERR sprintf( "Joining %s -> %s for wit %s\n", $source->text, $n->text, $sigil ); - $tradition->collation->add_path( $source, $n, $sigil ); - $source = $n; - } - $tradition->collation->add_path( $source, $end, $sigil ); +sub _replace_sequence { + my( $arr, $start, $end, @new ) = @_; + my( $start_idx, $end_idx ); + foreach my $i ( 0 .. $#{$arr} ) { + $start_idx = $i if( $arr->[$i]->name eq $start ); + if( $arr->[$i]->name eq $end ) { + $end_idx = $i; + last; + } + } + unless( $start_idx && $end_idx ) { + warn "Could not find start and end"; + return; } + my $length = $end_idx - $start_idx + 1; + splice( @$arr, $start_idx, $length, @new ); +} - # TODO think about relationships, transpositions, etc. +sub _return_rdg { + my( $rdg ) = @_; + # If we were passed a reading name, return the name. If we were + # passed a reading object, return the object. + my $wantobj = ref( $rdg ) eq 'Text::Tradition::Collation::Reading'; + my $real = $rdg; + if( exists $substitutions->{ $wantobj ? $rdg->name : $rdg } ) { + $real = $substitutions->{ $wantobj ? $rdg->name : $rdg }; + $real = $real->name unless $wantobj; + } + return $real; } +## Recursive helper function to help us navigate through nested XML, +## picking out the text. $tradition is the tradition, needed for +## making readings; $xn is the XML node currently being looked at, +## $in_var is a flag to say that we are inside a variant, $ac is a +## flag to say that we are inside an ante-correctionem reading, and +## @cur_wits is the list of witnesses to which this XML node applies. +## Returns the list of readings, if any, created on the run. + +{ + my @active_wits; + my $current_app; + + sub _get_readings { + my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_; + @cur_wits = @active_wits unless $in_var; + + my @new_readings; + if( $xn->nodeType == XML_TEXT_NODE ) { + # Some words, thus make some readings. + my $str = $xn->data; + return unless $str =~ /\S/; # skip whitespace-only text nodes + #print STDERR "Handling text node " . $str . "\n"; + # Check that all the witnesses we have are active. + foreach my $c ( @cur_wits ) { + warn "Could not find $c in active wits" + unless grep { $c eq $_ } @active_wits; + } + $str =~ s/^\s+//; + my $final = $str =~ s/\s+$//; + foreach my $w ( split( /\s+/, $str ) ) { + # For now, skip punctuation. + next if $w !~ /[[:alnum:]]/; + my $rdg = make_reading( $tradition->collation, $w ); + push( @new_readings, $rdg ); + unless( $in_var ) { + push( @common_readings, $rdg ); + $rdg->make_common; + } + foreach ( @cur_wits ) { + warn "Empty wit!" unless $_; + warn "Empty reading!" unless $rdg; + push( @{$text->{$_}}, $rdg ) unless $ac; + } + } + } elsif( $xn->nodeName eq 'w' ) { + # Everything in this tag is one word. Also save any original XML ID. + #print STDERR "Handling word " . $xn->toString . "\n"; + # Check that all the witnesses we have are active. + foreach my $c ( @cur_wits ) { + warn "Could not find $c in active wits" + unless grep { $c eq $_ } @active_wits; + } + my $xml_id = $xn->getAttribute( 'xml:id' ); + my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); + push( @new_readings, $rdg ); + unless( $in_var ) { + push( @common_readings, $rdg ); + $rdg->make_common; + } + foreach( @cur_wits ) { + warn "Empty wit!" unless $_; + warn "Empty reading!" unless $rdg; + push( @{$text->{$_}}, $rdg ) unless $ac; + } + } elsif ( $xn->nodeName eq 'app' ) { + $current_app = $xn->getAttribute( 'xml:id' ); + # print STDERR "Handling app $current_app\n"; + # Keep the reading sets in this app. + my @sets; + # Recurse through all children (i.e. rdgs) for sets of words. + foreach ( $xn->childNodes ) { + my @rdg_set = _get_readings( $tradition, $_, $in_var, $ac, @cur_wits ); + push( @sets, \@rdg_set ) if @rdg_set; + } + # Now collate these sets if we have more than one. + my $subs = collate_variants( $tradition->collation, @sets ) if @sets > 1; + map { $substitutions->{$_} = $subs->{$_} } keys %$subs; + # TODO Look through substitutions to see if we can make anything common now. + # Return the entire set of unique readings. + my %unique; + foreach my $s ( @sets ) { + map { $unique{$_->name} = $_ } @$s; + } + push( @new_readings, values( %unique ) ); + # Exit the current app. + $current_app = ''; + } elsif ( $xn->nodeName eq 'lem' || $xn->nodeName eq 'rdg' ) { + # Alter the current witnesses and recurse. + #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; + $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; + my @rdg_wits = get_sigla( $xn ); + @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings + my @words; + foreach ( $xn->childNodes ) { + my @rdg_set = _get_readings( $tradition, $_, 1, $ac, @rdg_wits ); + push( @words, @rdg_set ) if @rdg_set; + } + # If we have more than one word in a reading, it should become a segment. + # $tradition->collation->add_segment( @words ) if @words > 1; + + if( $ac ) { + # Add the reading set to the a.c. readings. + foreach ( @rdg_wits ) { + $app_ac->{$_}->{$current_app} = \@words; + } + } else { + # Add the reading set to the app anchors for each witness + # or put in placeholders for empty p.c. readings + foreach ( @rdg_wits ) { + my $start = @words ? $words[0]->name : "PH-$current_app"; + my $end = @words ? $words[-1]->name : "PH-$current_app"; + $app_anchors->{$current_app}->{$_}->{'start'} = $start; + $app_anchors->{$current_app}->{$_}->{'end'} = $end; + push( @{$text->{$_}}, $start ) unless @words; + } + } + push( @new_readings, @words ); + } elsif( $xn->nodeName eq 'witStart' ) { + # Add the relevant wit(s) to the active list. + #print STDERR "Handling witStart\n"; + push( @active_wits, @cur_wits ); + } elsif( $xn->nodeName eq 'witEnd' ) { + # Take the relevant wit(s) out of the list. + #print STDERR "Handling witEnd\n"; + my $regexp = '^(' . join( '|', @cur_wits ) . ')$'; + @active_wits = grep { $_ !~ /$regexp/ } @active_wits; + } elsif( $xn->nodeName eq 'witDetail' ) { + # Ignore these for now. + return; + } else { + # Recurse as if this tag weren't there. + #print STDERR "Recursing on tag " . $xn->nodeName . "\n"; + foreach( $xn->childNodes ) { + push( @new_readings, _get_readings( $tradition, $_, $in_var, $ac, @cur_wits ) ); + } + } + return @new_readings; + } + +} + +# Helper to extract a list of witness sigla from a reading element. sub get_sigla { my( $rdg ) = @_; # Cope if we have been handed a NodeList. There is only # one reading here. if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) { - $rdg = $rdg->shift; + $rdg = $rdg->shift; } my @wits; if( ref( $rdg ) eq 'XML::LibXML::Element' ) { - @wits = split( /\s+/, $rdg->getAttribute( 'wit' ) ); - map { $_ =~ s/^\#// } @wits; + my $witstr = $rdg->getAttribute( 'wit' ); + $witstr =~ s/^\s+//; + $witstr =~ s/\s+$//; + @wits = split( /\s+/, $witstr ); + map { $_ =~ s/^\#// } @wits; } return @wits; } +# Helper with its counters to actually make the readings. { my $word_ctr = 0; my %used_nodeids; + sub save_preexisting_nodeids { + foreach( @_ ) { + $used_nodeids{$_->getValue()} = 1; + } + } + sub make_reading { - my( $graph, $word_el) = @_; - my $xml_id = $word_el->getAttribute( 'xml:id' ); - if( $xml_id && exists $used_nodeids{$xml_id} ) { - warn "Already used assigned ID $xml_id"; - $xml_id = undef; - } - if( !$xml_id ) { - until( $xml_id ) { - my $try_id = 'w'.$word_ctr++; - next if exists $used_nodeids{$try_id}; - $xml_id = $try_id; - } - } - my $rdg = $graph->add_reading( $xml_id ); - $rdg->text( $word_el->textContent() ); - $used_nodeids{$xml_id} = $rdg; - return $rdg; + my( $graph, $word, $xml_id ) = @_; + if( $xml_id ) { + if( exists $used_nodeids{$xml_id} ) { + if( $used_nodeids{$xml_id} != 1 ) { + warn "Already used assigned XML ID somewhere else!"; + $xml_id = undef; + } + } else { + warn "Undetected pre-existing XML ID"; + } + } + if( !$xml_id ) { + until( $xml_id ) { + my $try_id = 'w'.$word_ctr++; + next if exists $used_nodeids{$try_id}; + $xml_id = $try_id; + } + } + my $rdg = $graph->add_reading( $xml_id ); + $rdg->text( $word ); + $used_nodeids{$xml_id} = $rdg; + return $rdg; } } diff --git a/lib/Text/Tradition/Parser/Util.pm b/lib/Text/Tradition/Parser/Util.pm new file mode 100644 index 0000000..6d9dab1 --- /dev/null +++ b/lib/Text/Tradition/Parser/Util.pm @@ -0,0 +1,194 @@ +package Text::Tradition::Parser::Util; + +use strict; +use warnings; +use Algorithm::Diff; +use Exporter 'import'; +use vars qw/ @EXPORT_OK /; +@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /; + +=item B + +collate_variants( $collation, @reading_ranges ) + +Given a set of readings in the form +( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) +walks through each to identify those readings that are identical. The +collation is a Text::Tradition::Collation object; the elements of +@readings are Text::Tradition::Collation::Reading objects that appear +on the collation graph. + +TODO: Handle collapsed and non-collapsed transpositions. + +=cut + +sub collate_variants { + my( $collation, @reading_sets ) = @_; + + # Two different ways to do this, depending on whether we want + # transposed reading nodes to be merged into one (producing a + # nonlinear, bidirectional graph) or not (producing a relatively + # linear, unidirectional graph.) + return $collation->linear ? collate_linearly( @_ ) + : collate_nonlinearly( @_ ); +} + +sub collate_linearly { + my( $collation, $lemma_set, @variant_sets ) = @_; + + my @unique; + my $substitutions = {}; + push( @unique, @$lemma_set ); + while( @variant_sets ) { + my $variant_set = shift @variant_sets; + # Use diff to do this job + my $diff = Algorithm::Diff->new( \@unique, $variant_set, + {'keyGen' => \&_collation_hash} ); + my @new_unique; + my %merged; + while( $diff->Next ) { + if( $diff->Same ) { + # merge the nodes + my @l = $diff->Items( 1 ); + my @v = $diff->Items( 2 ); + foreach my $i ( 0 .. $#l ) { + if( !$merged{$l[$i]->name} ) { + print STDERR sprintf( "Merging %s into %s\n", + $v[$i]->name, + $l[$i]->name ); + $collation->merge_readings( $l[$i], $v[$i] ); + $merged{$l[$i]->name} = 1; + $substitutions->{$v[$i]->name} = $l[$i]; + } else { + print STDERR "Would have double merged " . $l[$i]->name . "\n"; + } + } + # splice the lemma nodes into the variant set + my( $offset ) = $diff->Get( 'min2' ); + splice( @$variant_set, $offset, scalar( @l ), @l ); + push( @new_unique, @l ); + } else { + # Keep the old unique readings + push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 ); + # Add the new readings to the 'unique' list + push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 ); + } + } + @unique = @new_unique; + } + return $substitutions; +} + +sub collate_nonlinearly { + my( $collation, $lemma_set, @variant_sets ) = @_; + + my @unique; + my $substitutions = {}; + push( @unique, @$lemma_set ); + while( @variant_sets ) { + my $variant_set = shift @variant_sets; + # Simply match the first reading that carries the same word, so + # long as that reading has not yet been used to match another + # word in this variant. That way lies loopy madness. + my @distinct; + my %merged; + foreach my $idx ( 0 .. $#{$variant_set} ) { + my $vw = $variant_set->[$idx]; + my @same = grep { cmp_str( $_ ) eq $vw->label } @unique; + my $matched; + if( @same ) { + foreach my $i ( 0 .. $#same ) { + unless( $merged{$same[$i]->name} ) { + #print STDERR sprintf( "Merging %s into %s\n", + # $vw->name, + # $same[$i]->name ); + $collation->merge_readings( $same[$i], $vw ); + $merged{$same[$i]->name} = 1; + $matched = $i; + $variant_set->[$idx] = $same[$i]; + $substitutions->{$vw->name} = $same[$i]; + } + } + } + unless( @same && defined($matched) ) { + push( @distinct, $vw ); + } + } + push( @unique, @distinct ); + } + return $substitutions; +} + +sub _collation_hash { + my $node = shift; + return cmp_str( $node ); +} + +=item B + +Pretend you never saw this method. Really it needs to not be hardcoded. + +=cut + +sub cmp_str { + my( $reading ) = @_; + my $word = $reading->label(); + $word = lc( $word ); + $word =~ s/\W//g; + $word =~ s/v/u/g; + $word =~ s/j/i/g; + $word =~ s/cha/ca/g; + $word =~ s/quatuor/quattuor/g; + $word =~ s/ioannes/iohannes/g; + return $word; +} + +=item B + +my @rep = check_for_repeated( @readings ) + +Given an array of items, returns any items that appear in the array more +than once. + +=cut + +sub check_for_repeated { + my @seq = @_; + my %unique; + my @repeated; + foreach ( @seq ) { + if( exists $unique{$_->name} ) { + push( @repeated, $_->name ); + } else { + $unique{$_->name} = 1; + } + } + return @repeated; +} + +sub add_hash_entry { + my( $hash, $key, $entry ) = @_; + if( exists $hash->{$key} ) { + push( @{$hash->{$key}}, $entry ); + } else { + $hash->{$key} = [ $entry ]; + } +} + +sub is_monotonic { + my( @readings ) = @_; + my( $common, $min, $max ) = ( -1, -1, -1 ); + foreach my $rdg ( @readings ) { +# print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - " +# . $rdg->position->reference ."\n"; + return 0 if $rdg->position->common < $common; + if( $rdg->position->common == $common ) { + return 0 if $rdg->position->min <= $min; + return 0 if $rdg->position->max <= $max; + } + $common = $rdg->position->common; + $min = $rdg->position->min; + $max = $rdg->position->max; + } + return 1; +} \ No newline at end of file diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index 0f3a881..d4466aa 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -22,60 +22,51 @@ has character_matrix => ( sub make_character_matrix { my $self = shift; unless( $self->collation->linear ) { - warn "Need a linear graph in order to make an alignment table"; - return; + warn "Need a linear graph in order to make an alignment table"; + return; } - my @all_pos = sort { Text::Tradition::Collation::Position::str_cmp( $a, $b ) } - $self->collation->possible_positions; - my $table = []; - my $characters = {}; - map { $characters->{$_} = {} } @all_pos; - foreach my $wit ( @{$self->collation->tradition->witnesses} ) { - # First implementation: make dumb alignment table, caring about - # nothing except which reading is in which position. - my $sigilfield = sprintf( "%-10s", $wit->sigil ); - push( @$table, [ $sigilfield, make_witness_row( $characters, $wit->path, - \@all_pos ) ] ); - if( $wit->has_ante_corr ) { - $sigilfield = sprintf( "%-10s", $wit->sigil . "_ac" ); - push( @$table, [ $sigilfield, - make_witness_row( $characters, $wit->uncorrected_path, - \@all_pos ) ] ); - } + my $table = $self->collation->make_alignment_table; + # Push the names of the witnesses to initialize the rows of the matrix. + my @matrix = map { [ $self->_normalize_ac( $_ ) ] } @{$table->[0]}; + $DB::single = 1; + foreach my $token_index ( 1 .. $#{$table} ) { + # First implementation: make dumb alignment table, caring about + # nothing except which reading is in which position. + my @chars = convert_characters( $table->[$token_index] ); + foreach my $idx ( 0 .. $#matrix ) { + push( @{$matrix[$idx]}, $chars[$idx] ); + } } - $self->_save_character_matrix( $table ); -} + $self->_save_character_matrix( \@matrix ); +} -sub make_witness_row { - my( $characters, $path, $positions ) = @_; - my %char_hash; - map { $char_hash{$_} = 'X' } @$positions; - foreach my $rdg( @$path ) { - $char_hash{$rdg->position->minref} = get_character( $rdg, $characters ); +sub _normalize_ac { + my( $self, $witname ) = @_; + my $ac = $self->collation->ac_label; + if( $witname =~ /(.*)\Q$ac\E$/ ) { + $witname = $1 . '_ac'; } - my @row = map { $char_hash{$_} } @$positions; - return @row; + return sprintf( "%-10s", $witname ); } - -sub get_character { - my( $reading, $characters ) = @_; - my $this_pos = $characters->{$reading->position->minref}; +sub convert_characters { + my $row = shift; # This is a simple algorithm that treats every reading as different. # Eventually we will want to be able to specify how relationships # affect the character matrix. - my $text = $reading->text; - unless( exists $this_pos->{$text} ) { - # We need to find what the next character is here, and record it. - my @all_chr = sort { $a <=> $b } values( %$this_pos ); - if( @all_chr == 8 ) { - warn "Already have eight variants at position " - . $reading->position->minref . "; not adding " . $reading->text; - return '?'; - } - $this_pos->{$text} = scalar @all_chr; + my %unique = ( '__UNDEF__' => 'X' ); + my $ctr = 0; + foreach my $word ( @$row ) { + if( $word && !exists $unique{$word} ) { + $unique{$word} = chr( 65 + $ctr ); + $ctr++; + } } - return $this_pos->{$text}; + if( scalar( keys %unique ) > 8 ) { + warn "Have more than 8 variants on this location; pars will break"; + } + my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row; + return @chars; } sub pars_input { @@ -86,7 +77,7 @@ sub pars_input { my $columns = scalar @{$self->character_matrix->[0]} - 1; $matrix .= "\t$rows\t$columns\n"; foreach my $row ( @{$self->character_matrix} ) { - $matrix .= join( '', @$row ) . "\n"; + $matrix .= join( '', @$row ) . "\n"; } return $matrix; } @@ -96,7 +87,6 @@ sub run_pars { # Set up a temporary directory for all the default Phylip files. my $phylip_dir = File::Temp->newdir(); - $DB::single = 1; # We need an infile, and we need a command input file. open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile"; print MATRIX $self->pars_input(); @@ -128,36 +118,36 @@ sub run_pars { my $PHYLIP_PATH = '/Users/tla/Projects/phylip-3.69/exe'; my $program = "pars"; if( $^O eq 'darwin' ) { - $program = "$PHYLIP_PATH/$program.app/Contents/MacOS/$program"; + $program = "$PHYLIP_PATH/$program.app/Contents/MacOS/$program"; } else { - $program = "$PHYLIP_PATH/$program"; + $program = "$PHYLIP_PATH/$program"; } { - # We need to run it in our temporary directory where we have created - # all the expected files. - local $CWD = $phylip_dir; - my @cmd = ( $program ); - run \@cmd, '<', 'cmdfile', '>', '/dev/null'; + # We need to run it in our temporary directory where we have created + # all the expected files. + local $CWD = $phylip_dir; + my @cmd = ( $program ); + run \@cmd, '<', 'cmdfile', '>', '/dev/null'; } # Now our output should be in 'outfile' and our tree in 'outtree', # both in the temp directory. my @outtree; if( -f "$phylip_dir/outtree" ) { - open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read"; - @outtree = ; - close TREE; + open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read"; + @outtree = ; + close TREE; } return( 1, join( '', @outtree ) ) if @outtree; my @error; if( -f "$phylip_dir/outfile" ) { - open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read"; - @error = ; - close OUTPUT; + open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read"; + @error = ; + close OUTPUT; } else { - push( @error, "Neither outtree nor output file was produced!" ); + push( @error, "Neither outtree nor output file was produced!" ); } return( undef, join( '', @error ) ); } diff --git a/make_tradition.pl b/make_tradition.pl new file mode 100644 index 0000000..14acdef --- /dev/null +++ b/make_tradition.pl @@ -0,0 +1,109 @@ +#!/usr/bin/env perl + +use lib 'lib'; +use strict; +use warnings; +use Getopt::Long; +use Text::Tradition; +use Text::Tradition::Stemma; + +binmode STDERR, ":utf8"; +binmode STDOUT, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my( $informat, $inbase, $outformat, $help, $linear, $HACK ) + = ( '', '', '', '', 1, 0 ); + +GetOptions( 'i|in=s' => \$informat, + 'b|base=s' => \$inbase, + 'o|out=s' => \$outformat, + 'l|linear!' => \$linear, + 'h|help' => \$help, + 'hack' => \$HACK, + ); + +if( $help ) { + help(); +} + +unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX)$/i ) { + help( "Input format must be one of CollateX, CSV, CTE, Self, TEI" ); +} +$informat = 'CollateX' if $informat =~ /^c(ollate)?x$/i; +$informat = 'KUL' if $informat =~ /^kul$/i; +$informat = 'CTE' if $informat =~ /^cte$/i; +$informat = 'Self' if $informat =~ /^self$/i; +$informat = 'TEI' if $informat =~ /^tei$/i; + +unless( $outformat =~ /^(graphml|svg|dot|stemma|csv)$/ ) { + help( "Output format must be one of graphml, svg, csv, stemma, or dot" ); +} + +# Do we have a base if we need it? +if( $informat eq 'KUL' && !$inbase ) { + help( "$informat input needs a base text" ); +} + +# CSV parsing requires a filename; XML parsing requires a string. +my $input = $ARGV[0]; +unless( $informat eq 'KUL' || $informat eq 'CSV' ) { + my @lines; + open( INFILE, "$input" ) or die "Could not read $input"; + @lines = ; + close INFILE; + $input = join( '', @lines ); +} + +# First: read the base. Make a graph, but also note which +# nodes represent line beginnings. +my %args = ( $informat => $input, + 'linear' => $linear ); +$args{'base'} = $inbase if $inbase; +my $tradition = Text::Tradition->new( %args ); + +### Custom hacking +# Remove witnesses C, E, G in the Matthew text +if( $HACK ) { + foreach( $tradition->collation->paths() ) { + $tradition->collation->del_path( $_ ) if $_->label =~ /^[ceg]$/i; + } + foreach( $tradition->collation->readings() ) { + if( !$_->outgoing() && !$_->incoming() ) { + print STDERR "Deleting reading " . $_->label . "\n"; + $tradition->collation->del_reading( $_ ); + } + } +} + +# Now output what we have been asked to. +if( $outformat eq 'stemma' ) { + my $stemma = Text::Tradition::Stemma->new( + 'collation' => $tradition->collation ); + my( $result, $tree ) = $stemma->run_pars(); + if( $result ) { + print $tree; + } else { + print STDERR "Bad result: $tree"; + } +} else { + my $output = "as_$outformat"; + print $tradition->collation->$output(); +} + +sub help { + my( $msg ) = @_; + print STDERR << "EOF" +Usage: $0 -i [format] -o [format] (--base [filename]) (--(no)linear) [inputfile] + i, input: Format of the input file. Must be one of CollateX, CSV, CTE, Self, TEI. + o, output: Format of the output. Must be one of svg, dot, graphml, csv, stemma. + b, base: Filename that contains a base text. Needed for CSV input. + l, linear: Treat transposed readings separately, producing a linear graph. + If nolinear, treat transposed readings as the same node. + h, help: Print this message. +EOF + ; + if( $msg ) { + print STDERR "$msg\n"; + } + exit ($msg ? 1 : 0 ); +}