has 'witnesses' => (
traits => ['Array'],
- is => 'rw',
isa => 'ArrayRef[Text::Tradition::Witness]',
handles => {
- all => 'elements',
- add => 'push',
+ witnesses => 'elements',
+ add_witness => 'push',
},
default => sub { [] },
);
isa => 'Str',
default => 'Tradition',
);
+
+around 'add_witness' => sub {
+ my $orig = shift;
+ my $self = shift;
+ my $new_wit = Text::Tradition::Witness->new( @_ );
+ $self->$orig( $new_wit );
+ return $new_wit;
+};
+
sub BUILD {
my( $self, $init_args ) = @_;
if( exists $init_args->{'witnesses'} ) {
- # We got passed an uncollated list of witnesses. Make a
- # witness object for each witness, and then send them to the
- # collator.
- my $autosigil = 0;
- foreach my $wit ( %{$init_args->{'witnesses'}} ) {
- # Each item in the list is either a string or an arrayref.
- # If it's a string, it is a filename; if it's an arrayref,
- # it is a tuple of 'sigil, file'. Handle either case.
- my $args;
- if( ref( $wit ) eq 'ARRAY' ) {
- $args = { 'sigil' => $wit->[0],
- 'file' => $wit->[1] };
- } else {
- $args = { 'sigil' => chr( $autosigil+65 ),
- 'file' => $wit };
- $autosigil++;
- }
- $self->witnesses->push( Text::Tradition::Witness->new( $args ) );
- # TODO Now how to collate these?
- }
+ # We got passed an uncollated list of witnesses. Make a
+ # witness object for each witness, and then send them to the
+ # collator.
+ my $autosigil = 0;
+ foreach my $wit ( %{$init_args->{'witnesses'}} ) {
+ # Each item in the list is either a string or an arrayref.
+ # If it's a string, it is a filename; if it's an arrayref,
+ # it is a tuple of 'sigil, file'. Handle either case.
+ my $args;
+ if( ref( $wit ) eq 'ARRAY' ) {
+ $args = { 'sigil' => $wit->[0],
+ 'file' => $wit->[1] };
+ } else {
+ $args = { 'sigil' => chr( $autosigil+65 ),
+ 'file' => $wit };
+ $autosigil++;
+ }
+ $self->witnesses->add_witness( $args );
+ # TODO Now how to collate these?
+ }
} else {
- # Else we need to parse some collation data. Make a Collation object
- my $collation = Text::Tradition::Collation->new( %$init_args,
- 'tradition' => $self );
- $self->_save_collation( $collation );
+ # Else we need to parse some collation data. Make a Collation object
+ my $collation = Text::Tradition::Collation->new( %$init_args,
+ 'tradition' => $self );
+ $self->_save_collation( $collation );
- # Call the appropriate parser on the given data
- my @formats = grep { /^(Self|CollateX|CSV|CTE|TEI)$/ } keys( %$init_args );
- my $format = shift( @formats );
- unless( $format ) {
- warn "No data given to create a collation; will initialize an empty one";
- }
- if( $format && $format =~ /^(CSV|CTE)$/ &&
- !exists $init_args->{'base'} ) {
- warn "Cannot make a collation from $format without a base text";
- return;
- }
+ # Call the appropriate parser on the given data
+ my @formats = grep { /^(Self|CollateX|CSV|CTE|KUL|TEI)$/ } keys( %$init_args );
+ my $format = shift( @formats );
+ unless( $format ) {
+ warn "No data given to create a collation; will initialize an empty one";
+ }
+ if( $format && $format =~ /^(KUL|CTE)$/ &&
+ !exists $init_args->{'base'} ) {
+ warn "Cannot make a collation from $format without a base text";
+ return;
+ }
- # Starting point for all texts
- my $last_node = $collation->add_reading( '#START#' );
+ # Start and end points for all texts
+ $collation->add_reading( '#START#' );
+ $collation->add_reading( '#END#' );
+
- # Now do the parsing.
- my @sigla;
- if( $format ) {
- my @parseargs;
- if( $format =~ /^(CSV|CTE)$/ ) {
- $init_args->{'data'} = $init_args->{$format};
- $init_args->{'format'} = $format;
- $format = 'BaseText';
- @parseargs = %$init_args;
- } else {
- @parseargs = ( $init_args->{ $format } );
- }
- my $mod = "Text::Tradition::Parser::$format";
- load( $mod );
- $mod->can('parse')->( $self, @parseargs );
- }
+ # Now do the parsing.
+ my @sigla;
+ if( $format ) {
+ my @parseargs;
+ if( $format =~ /^(KUL|CTE)$/ ) {
+ $init_args->{'data'} = $init_args->{$format};
+ $init_args->{'format'} = $format;
+ $format = 'BaseText';
+ @parseargs = %$init_args;
+ } else {
+ @parseargs = ( $init_args->{ $format } );
+ }
+ my $mod = "Text::Tradition::Parser::$format";
+ load( $mod );
+ $mod->can('parse')->( $self, @parseargs );
+ }
}
}
sub witness {
my( $self, $sigil ) = @_;
my $requested_wit;
- foreach my $wit ( @{$self->witnesses} ) {
- $requested_wit = $wit if $wit->sigil eq $sigil;
+ foreach my $wit ( $self->witnesses ) {
+ $requested_wit = $wit if $wit->sigil eq $sigil;
}
# We depend on an undef return value for no such witness.
# warn "No such witness $sigil" unless $requested_wit;
return $requested_wit;
}
-
-
-sub add_witness {
- my $self = shift;
- my $new_wit = Text::Tradition::Witness->new( @_ );
- push( @{$self->witnesses}, $new_wit );
- return $new_wit;
-}
+
# The user will usually be instantiating a Tradition object, and
# examining its collation. The information about the tradition can
package Text::Tradition::Collation;
+use Encode qw( decode_utf8 );
+use File::Temp;
use Graph::Easy;
use IPC::Run qw( run binary );
+use Text::CSV_XS;
use Text::Tradition::Collation::Path;
use Text::Tradition::Collation::Position;
use Text::Tradition::Collation::Reading;
is => 'ro',
isa => 'Graph::Easy',
handles => {
- add_reading => 'add_node',
- del_reading => 'del_node',
- add_path => 'add_edge',
- del_path => 'del_edge',
- reading => 'node',
- path => 'edge',
- readings => 'nodes',
- segments => 'nodes',
- paths => 'edges',
- relationships => 'edges',
+ add_reading => 'add_node',
+ del_reading => 'del_node',
+ del_segment => 'del_node',
+ add_path => 'add_edge',
+ del_path => 'del_edge',
+ reading => 'node',
+ path => 'edge',
+ readings => 'nodes',
+ segments => 'nodes',
+ paths => 'edges',
+ relationships => 'edges',
},
default => sub { Graph::Easy->new( undirected => 0 ) },
);
-
+
has 'tradition' => ( # TODO should this not be ro?
is => 'rw',
predicate => 'has_graphml',
);
+has 'csv' => (
+ is => 'ro',
+ isa => 'Str',
+ writer => '_save_csv',
+ predicate => 'has_csv',
+ );
+
# Keeps track of the lemmas within the collation. At most one lemma
# per position in the graph.
has 'lemmata' => (
# Make sure there are three arguments
unless( @_ == 3 ) {
- warn "Call add_path with args source, target, witness";
- return;
+ warn "Call add_path with args source, target, witness";
+ return;
}
# Make sure the proposed path does not yet exist
# NOTE 'reading' will currently return readings and segments
my( $source, $target, $wit ) = @_;
$source = $self->reading( $source )
- unless ref( $source ) eq 'Text::Tradition::Collation::Reading';
+ unless ref( $source ) eq 'Text::Tradition::Collation::Reading';
$target = $self->reading( $target )
- unless ref( $target ) eq 'Text::Tradition::Collation::Reading';
+ unless ref( $target ) eq 'Text::Tradition::Collation::Reading';
foreach my $path ( $source->edges_to( $target ) ) {
- if( $path->label eq $wit && $path->class eq 'edge.path' ) {
- return;
- }
+ if( $path->label eq $wit && $path->class eq 'edge.path' ) {
+ return;
+ }
}
# Do the deed
$self->$orig( @_ );
# Make sure there is not another relationship between these two
# readings or segments already
$source = $self->reading( $source )
- unless ref( $source ) && $source->isa( 'Graph::Easy::Node' );
+ unless ref( $source ) && $source->isa( 'Graph::Easy::Node' );
$target = $self->reading( $target )
- unless ref( $target ) && $target->isa( 'Graph::Easy::Node' );
+ unless ref( $target ) && $target->isa( 'Graph::Easy::Node' );
foreach my $rel ( $source->edges_to( $target ), $target->edges_to( $source ) ) {
- if( $rel->class eq 'edge.relationship' ) {
- return ( undef, "Relationship already exists between these readings" );
- } else {
- return ( undef, "There is a witness path between these readings" );
- }
+ if( $rel->class eq 'edge.relationship' ) {
+ return ( undef, "Relationship already exists between these readings" );
+ }
}
-
- if( $source->has_position && $target->has_position ) {
- unless( grep { $_ eq $target } $self->same_position_as( $source ) ) {
- return( undef, "Cannot set relationship at different positions" );
- }
+ if( $options->{'equal_rank'} && !relationship_valid( $source, $target ) ) {
+ return ( undef, 'Relationship creates witness loop' );
}
+ # TODO Think about positional hilarity if relationships are added after positions
+ # are assigned.
+
my @joined = ( [ $source->name, $target->name ] ); # Keep track of the nodes we join.
$options->{'this_relation'} = [ $source, $target ];
return ( undef, $@ );
}
$self->graph->add_edge( $source, $target, $rel );
- if( $options->{'global'} ) {
- # Look for all readings with the source label, and if there are
- # colocated readings with the target label, join them too.
- foreach my $r ( grep { $_->label eq $source->label } $self->readings() ) {
- next if $r->name eq $source->name;
- my @colocated = grep { $_->label eq $target->label }
- $self->same_position_as( $r );
- if( @colocated ) {
- warn "Multiple readings with same label at same position!"
- if @colocated > 1;
- my $colo = $colocated[0];
- next if $colo->edges_to( $r ) || $r->edges_to( $colo );
- $options->{'primary_relation'} = $options->{'this_relation'};
- $options->{'this_relation'} = [ $r, $colocated[0] ];
- my $dup_rel = Text::Tradition::Collation::Relationship->new( %$options );
- $self->graph->add_edge( $r, $colocated[0], $dup_rel );
- push( @joined, [ $r->name, $colocated[0]->name ] );
- }
- }
- }
+
+ # TODO Handle global relationship setting
+
return( 1, @joined );
}
+sub relationship_valid {
+ my( $source, $target ) = @_;
+ # Check that linking the source and target in a relationship won't lead
+ # to a path loop for any witness.
+ my @proposed_related = ( $source, $target );
+ push( @proposed_related, $source->related_readings );
+ push( @proposed_related, $target->related_readings );
+ my %pr_ids;
+ map { $pr_ids{ $_->name } = 1 } @proposed_related;
+ # The lists of 'in' and 'out' should not have any element that appears
+ # in 'proposed_related'.
+ foreach my $pr ( @proposed_related ) {
+ foreach my $e ( $pr->incoming ) {
+ if( exists $pr_ids{ $e->from->name } ) {
+ return 0;
+ }
+ }
+ foreach my $e ( $pr->outgoing ) {
+ if( exists $pr_ids{ $e->to->name } ) {
+ return 0;
+ }
+ }
+ }
+ return 1;
+}
+
=head2 Output method(s)
=over
my @cmd = qw/dot -Tsvg/;
my( $svg, $err );
- my $in = $self->as_dot();
- run( \@cmd, \$in, ">", binary(), \$svg );
+ my $dotfile = File::Temp->new();
+ binmode $dotfile, ':utf8';
+ print $dotfile $self->as_dot();
+ push( @cmd, $dotfile->filename );
+ run( \@cmd, ">", binary(), \$svg );
+ $svg = decode_utf8( $svg );
$self->_save_svg( $svg );
$self->expand_graph_paths();
return $svg;
$dot .= "\tedge [ arrowhead=open ];\n";
$dot .= "\tgraph [ rankdir=LR ];\n";
$dot .= sprintf( "\tnode [ fontsize=%d, fillcolor=%s, style=%s, shape=%s ];\n",
- 11, "white", "filled", $self->graph->get_attribute( 'node', 'shape' ) );
+ 11, "white", "filled", $self->graph->get_attribute( 'node', 'shape' ) );
foreach my $reading ( $self->readings ) {
- # Need not output nodes without separate labels
- next if $reading->name eq $reading->label;
- # TODO output readings or segments, but not both
- next if $reading->class eq 'node.segment';
- $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->name, $reading->label );
+ # Need not output nodes without separate labels
+ next if $reading->name eq $reading->label;
+ # TODO output readings or segments, but not both
+ next if $reading->class eq 'node.segment';
+ $dot .= sprintf( "\t\"%s\" [ label=\"%s\" ];\n", $reading->name, $reading->label );
}
my @edges = $view eq 'relationship' ? $self->relationships : $self->paths;
foreach my $edge ( @edges ) {
- my %variables = ( 'color' => '#000000',
- 'fontcolor' => '#000000',
- 'label' => $edge->label,
- );
- my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
- $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
- $edge->from->name, $edge->to->name, $varopts );
+ my %variables = ( 'color' => '#000000',
+ 'fontcolor' => '#000000',
+ 'label' => $edge->label,
+ );
+ my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
+ $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ];\n",
+ $edge->from->name, $edge->to->name, $varopts );
}
$dot .= "}\n";
return $dot;
my $graphml_ns = 'http://graphml.graphdrawing.org/xmlns';
my $xsi_ns = 'http://www.w3.org/2001/XMLSchema-instance';
my $graphml_schema = 'http://graphml.graphdrawing.org/xmlns ' .
- 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
+ 'http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd';
# Create the document and root node
my $graphml = XML::LibXML->createDocument( "1.0", "UTF-8" );
my %node_data_keys;
my $ndi = 0;
foreach my $datum ( qw/ name reading identical position class / ) {
- $node_data_keys{$datum} = 'dn'.$ndi++;
- my $key = $root->addNewChild( $graphml_ns, 'key' );
- $key->setAttribute( 'attr.name', $datum );
- $key->setAttribute( 'attr.type', 'string' );
- $key->setAttribute( 'for', 'node' );
- $key->setAttribute( 'id', $node_data_keys{$datum} );
+ $node_data_keys{$datum} = 'dn'.$ndi++;
+ my $key = $root->addNewChild( $graphml_ns, 'key' );
+ $key->setAttribute( 'attr.name', $datum );
+ $key->setAttribute( 'attr.type', 'string' );
+ $key->setAttribute( 'for', 'node' );
+ $key->setAttribute( 'id', $node_data_keys{$datum} );
}
# Add the data keys for edges, i.e. witnesses
my $edi = 0;
my %edge_data_keys;
foreach my $edge_key( qw/ witness_main witness_ante_corr relationship class / ) {
- $edge_data_keys{$edge_key} = 'de'.$edi++;
- my $key = $root->addNewChild( $graphml_ns, 'key' );
- $key->setAttribute( 'attr.name', $edge_key );
- $key->setAttribute( 'attr.type', 'string' );
- $key->setAttribute( 'for', 'edge' );
- $key->setAttribute( 'id', $edge_data_keys{$edge_key} );
+ $edge_data_keys{$edge_key} = 'de'.$edi++;
+ my $key = $root->addNewChild( $graphml_ns, 'key' );
+ $key->setAttribute( 'attr.name', $edge_key );
+ $key->setAttribute( 'attr.type', 'string' );
+ $key->setAttribute( 'for', 'edge' );
+ $key->setAttribute( 'id', $edge_data_keys{$edge_key} );
}
# Add the graph, its nodes, and its edges
my %node_hash;
# Add our readings to the graph
foreach my $n ( sort { $a->name cmp $b->name } $self->readings ) {
- my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
- my $node_xmlid = 'n' . $node_ctr++;
- $node_hash{ $n->name } = $node_xmlid;
- $node_el->setAttribute( 'id', $node_xmlid );
- _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
- _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label );
- _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference )
- if $n->has_position;
- _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
- _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name )
- if $n->has_primary;
+ my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
+ my $node_xmlid = 'n' . $node_ctr++;
+ $node_hash{ $n->name } = $node_xmlid;
+ $node_el->setAttribute( 'id', $node_xmlid );
+ _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
+ _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label );
+ _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference )
+ if $n->has_position;
+ _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
+ _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name )
+ if $n->has_primary;
}
# Add any segments we have
foreach my $n ( sort { $a->name cmp $b->name } $self->segments ) {
- my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
- my $node_xmlid = 'n' . $node_ctr++;
- $node_hash{ $n->name } = $node_xmlid;
- $node_el->setAttribute( 'id', $node_xmlid );
- _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
- _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
+ my $node_el = $graph->addNewChild( $graphml_ns, 'node' );
+ my $node_xmlid = 'n' . $node_ctr++;
+ $node_hash{ $n->name } = $node_xmlid;
+ $node_el->setAttribute( 'id', $node_xmlid );
+ _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
+ _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
}
# Add the path, relationship, and segment edges
my $edge_ctr = 0;
foreach my $e ( sort { $a->from->name cmp $b->from->name } $self->graph->edges() ) {
- my( $name, $from, $to ) = ( 'e'.$edge_ctr++,
- $node_hash{ $e->from->name() },
- $node_hash{ $e->to->name() } );
- my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
- $edge_el->setAttribute( 'source', $from );
- $edge_el->setAttribute( 'target', $to );
- $edge_el->setAttribute( 'id', $name );
- # Add the edge class
- _add_graphml_data( $edge_el, $edge_data_keys{'class'}, $e->sub_class );
- if( $e->sub_class eq 'path' ) {
- # It's a witness path, so add the witness
- my $base = $e->label;
- my $key = $edge_data_keys{'witness_main'};
- # TODO kind of hacky
- if( $e->label =~ /^(.*?)\s+(\(a\.c\.\))$/ ) {
- $base = $1;
- $key = $edge_data_keys{'witness_ante_corr'};
- }
- _add_graphml_data( $edge_el, $key, $base );
- } elsif( $e->sub_class eq 'relationship' ) {
- # It's a relationship
- _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $e->label );
- } # else a segment, nothing to record but source, target, class
+ my( $name, $from, $to ) = ( 'e'.$edge_ctr++,
+ $node_hash{ $e->from->name() },
+ $node_hash{ $e->to->name() } );
+ my $edge_el = $graph->addNewChild( $graphml_ns, 'edge' );
+ $edge_el->setAttribute( 'source', $from );
+ $edge_el->setAttribute( 'target', $to );
+ $edge_el->setAttribute( 'id', $name );
+ # Add the edge class
+ _add_graphml_data( $edge_el, $edge_data_keys{'class'}, $e->sub_class );
+ if( $e->sub_class eq 'path' ) {
+ # It's a witness path, so add the witness
+ my $base = $e->label;
+ my $key = $edge_data_keys{'witness_main'};
+ # TODO kind of hacky
+ if( $e->label =~ /^(.*?)\s+(\(a\.c\.\))$/ ) {
+ $base = $1;
+ $key = $edge_data_keys{'witness_ante_corr'};
+ }
+ _add_graphml_data( $edge_el, $key, $base );
+ } elsif( $e->sub_class eq 'relationship' ) {
+ # It's a relationship
+ _add_graphml_data( $edge_el, $edge_data_keys{'relationship'}, $e->label );
+ } # else a segment, nothing to record but source, target, class
}
# Return the thing
$data_el->appendText( $value );
}
+=item B<as_csv>
+
+print $graph->as_csv( $recalculate )
+
+Returns a CSV alignment table representation of the collation graph, one
+row per witness (or witness uncorrected.) Unless $recalculate is passed
+(and is a true value), the method will return a cached copy of the CSV
+after the first call to the method.
+
+=cut
+
+sub as_csv {
+ my( $self, $recalc ) = @_;
+ return $self->csv if $self->has_csv;
+ my $table = $self->make_alignment_table;
+ my $csv = Text::CSV_XS->new( { binary => 1, quote_null => 0 } );
+ my @result;
+ foreach my $row ( @$table ) {
+ $csv->combine( @$row );
+ push( @result, decode_utf8( $csv->string ) );
+ }
+ $self->_save_csv( join( "\n", @result ) );
+ return $self->csv;
+}
+
+sub make_alignment_table {
+ my $self = shift;
+ unless( $self->linear ) {
+ warn "Need a linear graph in order to make an alignment table";
+ return;
+ }
+ my $table;
+ my @all_pos = sort { $a <=> $b } $self->possible_positions;
+ foreach my $wit ( $self->tradition->witnesses ) {
+ my @row = _make_witness_row( $wit->path, \@all_pos );
+ unshift( @row, $wit->sigil );
+ push( @$table, \@row );
+ if( $wit->has_ante_corr ) {
+ my @ac_row = _make_witness_row( $wit->uncorrected_path, \@all_pos );
+ unshift( @ac_row, $wit->sigil . $self->ac_label );
+ push( @$table, \@ac_row );
+ }
+ }
+ # Return a table where the witnesses read in columns rather than rows.
+ my $turned = _turn_table( $table );
+ return $turned;
+}
+
+sub _make_witness_row {
+ my( $path, $positions ) = @_;
+ my %char_hash;
+ map { $char_hash{$_} = undef } @$positions;
+ foreach my $rdg ( @$path ) {
+ $char_hash{$rdg->rank} = $rdg->text;
+ }
+ my @row = map { $char_hash{$_} } @$positions;
+ return @row;
+}
+
+# Helper to turn the witnesses along columns rather than rows. Assumes
+# equal-sized rows.
+sub _turn_table {
+ my( $table ) = @_;
+ my $result = [];
+ return $result unless scalar @$table;
+ my $nrows = scalar @{$table->[0]};
+ foreach my $idx ( 0 .. $nrows - 1 ) {
+ foreach my $wit ( 0 .. $#{$table} ) {
+ $result->[$idx]->[$wit] = $table->[$wit]->[$idx];
+ }
+ }
+ return $result;
+}
+
+
sub collapse_graph_paths {
my $self = shift;
# Our collation graph has an path per witness. This is great for
print STDERR "Collapsing witness paths in graph...\n";
# Don't list out every witness if we have more than half to list.
- my $majority = int( scalar( @{$self->tradition->witnesses} ) / 2 ) + 1;
+ my $majority = int( scalar( $self->tradition->witnesses ) / 2 ) + 1;
# But don't compress if there are only a few witnesses.
$majority = 4 if $majority < 4;
foreach my $node ( $self->readings ) {
- my $newlabels = {};
- # We will visit each node, so we only look ahead.
- foreach my $edge ( $node->outgoing() ) {
- next unless $edge->class eq 'edge.path';
- add_hash_entry( $newlabels, $edge->to->name, $edge->name );
- $self->del_path( $edge );
- }
-
- foreach my $newdest ( keys %$newlabels ) {
- my $label;
- my @compressed_wits = ();
- if( @{$newlabels->{$newdest}} < $majority ) {
- $label = join( ', ', sort( @{$newlabels->{$newdest}} ) );
- } else {
- ## TODO FIX THIS HACK
- my @aclabels;
- foreach my $wit ( @{$newlabels->{$newdest}} ) {
- if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ ) {
- push( @aclabels, $wit );
- } else {
- push( @compressed_wits, $wit );
- }
- }
- $label = join( ', ', 'majority', sort( @aclabels ) );
- }
-
- my $newpath =
- $self->add_path( $node, $self->reading( $newdest ), $label );
- if( @compressed_wits ) {
- $newpath->hidden_witnesses( \@compressed_wits );
- }
- }
+ my $newlabels = {};
+ # We will visit each node, so we only look ahead.
+ foreach my $edge ( $node->outgoing() ) {
+ next unless $edge->class eq 'edge.path';
+ add_hash_entry( $newlabels, $edge->to->name, $edge->name );
+ $self->del_path( $edge );
+ }
+
+ foreach my $newdest ( keys %$newlabels ) {
+ my $label;
+ my @compressed_wits = @{$newlabels->{$newdest}};
+ if( @compressed_wits < $majority ) {
+ $label = join( ', ', sort( @{$newlabels->{$newdest}} ) );
+ } else {
+ ## TODO FIX THIS HACK
+ my @aclabels;
+ foreach my $wit ( @compressed_wits ) {
+ push( @aclabels, $wit ) if( $wit =~ /^(.*?)(\s*\(?a\.\s*c\.\)?)$/ );
+ }
+ $label = join( ', ', 'majority', sort( @aclabels ) );
+ }
+
+ my $newpath = $self->add_path( $node, $self->reading( $newdest ), $label );
+ $newpath->hidden_witnesses( \@compressed_wits );
+ }
}
$self->collapsed( 1 );
print STDERR "Expanding witness paths in graph...\n";
foreach my $path( $self->paths ) {
- my $from = $path->from;
- my $to = $path->to;
- my @wits = split( /, /, $path->label );
- if( $path->has_hidden_witnesses ) {
- push( @wits, @{$path->hidden_witnesses} );
- }
- $self->del_path( $path );
- foreach ( @wits ) {
- $self->add_path( $from, $to, $_ );
- }
+ my $from = $path->from;
+ my $to = $path->to;
+ warn sprintf( "No hidden witnesses on %s -> %s ?", $from->name, $to->name )
+ unless $path->has_hidden_witnesses;
+ my @wits = @{$path->hidden_witnesses};
+ $self->del_path( $path );
+ foreach ( @wits ) {
+ $self->add_path( $from, $to, $_ );
+ }
}
$self->collapsed( 0 );
}
my $self = shift;
my( $new_start ) = @_;
if( $new_start ) {
- $self->del_reading( '#START#' );
- $self->graph->rename_node( $new_start, '#START#' );
+ $self->del_reading( '#START#' );
+ $self->graph->rename_node( $new_start, '#START#' );
+ }
+ # Make sure the start node has a start position.
+ unless( $self->reading( '#START#' )->has_position ) {
+ $self->reading( '#START#' )->position( '0,0' );
}
return $self->reading('#START#');
}
+=item B<end>
+
+my $end = $collation->end();
+
+Returns the end of the collation, a meta-reading with label '#END#'.
+
+=cut
+
+sub end {
+ my $self = shift;
+ my( $new_end ) = @_;
+ if( $new_end ) {
+ $self->del_reading( '#END#' );
+ $self->graph->rename_node( $new_end, '#END#' );
+ }
+ return $self->reading('#END#');
+}
+
=item B<reading_sequence>
my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] );
=cut
+# TODO Think about returning some lazy-eval iterator.
+
sub reading_sequence {
my( $self, $start, $end, $witness, $backup ) = @_;
my %seen;
my $n = $start;
while( $n && $n ne $end ) {
- if( exists( $seen{$n->name()} ) ) {
- warn "Detected loop at " . $n->name();
- last;
- }
- $seen{$n->name()} = 1;
-
- my $next = $self->next_reading( $n, $witness, $backup );
- warn "Did not find any path for $witness from reading " . $n->name
- unless $next;
- push( @readings, $next );
- $n = $next;
+ if( exists( $seen{$n->name()} ) ) {
+ warn "Detected loop at " . $n->name();
+ last;
+ }
+ $seen{$n->name()} = 1;
+
+ my $next = $self->next_reading( $n, $witness, $backup );
+ warn "Did not find any path for $witness from reading " . $n->name
+ unless $next;
+ push( @readings, $next );
+ $n = $next;
}
# Check that the last reading is our end reading.
my $last = $readings[$#readings];
warn "Last reading found from " . $start->label() .
- " for witness $witness is not the end!"
- unless $last eq $end;
+ " for witness $witness is not the end!"
+ unless $last eq $end;
return @readings;
}
sub _find_linked_reading {
my( $self, $direction, $node, $path, $alt_path ) = @_;
my @linked_paths = $direction eq 'next'
- ? $node->outgoing() : $node->incoming();
+ ? $node->outgoing() : $node->incoming();
return undef unless scalar( @linked_paths );
# We have to find the linked path that contains all of the
my $base_le;
my $alt_le;
foreach my $le ( @linked_paths ) {
- if( $le->name eq $self->baselabel ) {
- $base_le = $le;
- } else {
- my @le_wits = $self->witnesses_of_label( $le->name );
- if( _is_within( \@path_wits, \@le_wits ) ) {
- # This is the right path.
- return $direction eq 'next' ? $le->to() : $le->from();
- } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
- $alt_le = $le;
- }
- }
+ if( $le->name eq $self->baselabel ) {
+ $base_le = $le;
+ } else {
+ my @le_wits = $self->witnesses_of_label( $le->name );
+ if( _is_within( \@path_wits, \@le_wits ) ) {
+ # This is the right path.
+ return $direction eq 'next' ? $le->to() : $le->from();
+ } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) {
+ $alt_le = $le;
+ }
+ }
}
# Got this far? Return the alternate path if it exists.
return $direction eq 'next' ? $alt_le->to() : $alt_le->from()
- if $alt_le;
+ if $alt_le;
# Got this far? Return the base path if it exists.
return $direction eq 'next' ? $base_le->to() : $base_le->from()
- if $base_le;
+ if $base_le;
# Got this far? We have no appropriate path.
warn "Could not find $direction node from " . $node->label
- . " along path $path";
+ . " along path $path";
return undef;
}
my( $set1, $set2 ) = @_;
my $ret = @$set1; # will be 0, i.e. false, if set1 is empty
foreach my $el ( @$set1 ) {
- $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
+ $ret = 0 unless grep { /^\Q$el\E$/ } @$set2;
}
return $ret;
}
# text in the collation.
my $paths = {};
my @common_readings;
- foreach my $wit ( @{$self->tradition->witnesses} ) {
- my $curr_reading = $self->start;
- my @wit_path = $self->reading_sequence( $self->start, $end,
- $wit->sigil );
- $wit->path( \@wit_path );
-
- # Detect the common readings.
- @common_readings = _find_common( \@common_readings, \@wit_path );
+ foreach my $wit ( $self->tradition->witnesses ) {
+ my $curr_reading = $self->start;
+ my @wit_path = $self->reading_sequence( $self->start, $end,
+ $wit->sigil );
+ $wit->path( \@wit_path );
+
+ # Detect the common readings.
+ @common_readings = _find_common( \@common_readings, \@wit_path );
}
# Mark all the nodes as either common or not.
foreach my $cn ( @common_readings ) {
- print STDERR "Setting " . $cn->name . " / " . $cn->label
- . " as common node\n";
- $cn->make_common;
+ print STDERR "Setting " . $cn->name . " / " . $cn->label
+ . " as common node\n";
+ $cn->make_common;
}
foreach my $n ( $self->readings() ) {
- $n->make_variant unless $n->is_common;
+ $n->make_variant unless $n->is_common;
}
# Return an array of the common nodes in order.
return @common_readings;
my( $common_readings, $new_path ) = @_;
my @cr;
if( @$common_readings ) {
- foreach my $n ( @$new_path ) {
- push( @cr, $n ) if grep { $_ eq $n } @$common_readings;
- }
+ foreach my $n ( @$new_path ) {
+ push( @cr, $n ) if grep { $_ eq $n } @$common_readings;
+ }
} else {
- push( @cr, @$new_path );
+ push( @cr, @$new_path );
}
return @cr;
}
my %diverged;
map { $diverged{$_->name} = 1 } @$divergence;
foreach( @$common_readings ) {
- push( @cr, $_ ) unless $diverged{$_->name};
+ push( @cr, $_ ) unless $diverged{$_->name};
}
return @cr;
}
my( $self ) = @_;
my @common_readings;
- foreach my $wit ( @{$self->tradition->witnesses} ) {
- print STDERR "Making path for " . $wit->sigil . "\n";
- $self->make_witness_path( $wit );
- @common_readings = _find_common( \@common_readings, $wit->path );
- @common_readings = _find_common( \@common_readings, $wit->uncorrected_path );
+ foreach my $wit ( $self->tradition->witnesses ) {
+ print STDERR "Making path for " . $wit->sigil . "\n";
+ $self->make_witness_path( $wit );
+ @common_readings = _find_common( \@common_readings, $wit->path );
+ @common_readings = _find_common( \@common_readings, $wit->uncorrected_path );
}
map { $_->make_common } @common_readings;
return @common_readings;
my @chain = @{$wit->path};
my $sig = $wit->sigil;
foreach my $idx ( 0 .. $#chain-1 ) {
- $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
+ $self->add_path( $chain[$idx], $chain[$idx+1], $sig );
}
@chain = @{$wit->uncorrected_path};
foreach my $idx( 0 .. $#chain-1 ) {
- my $source = $chain[$idx];
- my $target = $chain[$idx+1];
- $self->add_path( $source, $target, $sig.$self->ac_label )
- unless $self->has_path( $source, $target, $sig );
+ my $source = $chain[$idx];
+ my $target = $chain[$idx+1];
+ $self->add_path( $source, $target, $sig.$self->ac_label )
+ unless $self->has_path( $source, $target, $sig );
}
}
return sort { $a->position->cmp_with( $b->position ) } @common;
}
-# Calculate the relative positions of nodes in the graph, if they
-# were not given to us.
-sub calculate_positions {
- my( $self, @ordered_common ) = @_;
-
- # First assign positions to all the common nodes.
- my $l = 1;
- foreach my $oc ( @ordered_common ) {
- $oc->position( $l++, 1 );
+sub calculate_ranks {
+ my $self = shift;
+ # Walk a version of the graph where every node linked by a relationship
+ # edge is fundamentally the same node, and do a topological ranking on
+ # the nodes in this graph.
+ my $topo_graph = Graph::Easy->new();
+ my %rel_containers;
+ my $rel_ctr = 0;
+ # Add the nodes
+ foreach my $r ( $self->readings ) {
+ next if exists $rel_containers{$r->name};
+ my @rels = $r->related_readings( 'colocated' );
+ if( @rels ) {
+ # Make a relationship container.
+ push( @rels, $r );
+ my $rn = $topo_graph->add_node( 'rel_container_' . $rel_ctr++ );
+ foreach( @rels ) {
+ $rel_containers{$_->name} = $rn;
+ }
+ } else {
+ # Add a new node to mirror the old node.
+ $rel_containers{$r->name} = $topo_graph->add_node( $r->name );
+ }
}
- if( $self->linear ) {
- # For the space between each common node, we have to find all the chains
- # from all the witnesses. The longest chain gives us our max, and the
- # others get min/max ranges to fit.
- my $first = shift @ordered_common;
- while( @ordered_common ) {
- my %paths;
- my $next = shift @ordered_common;
- my $longest = 0;
- foreach my $wit ( @{$self->tradition->witnesses} ) {
- # Key to the path is not important; we just have to get
- # all unique paths.
- my $length = $self->_track_paths( \%paths, $first, $next, $wit->sigil );
- $longest = $length unless $longest > $length;
- if( $wit->has_ante_corr ) {
- my $length = $self->_track_paths( \%paths, $first, $next,
- $wit->sigil.$self->ac_label, $wit->sigil );
- $longest = $length unless $longest > $length;
- }
- }
-
- # Transform the path values from unique strings to arrays.
- my @all_paths;
- foreach my $k ( keys %paths ) {
- my @v = split( /\s+/, $k );
- push( @all_paths, \@v );
- }
- @all_paths = sort { scalar @$b <=> scalar @$a } @all_paths;
-
- # Now %paths has all the unique paths, and we know how long the
- # longest of these is. Assign positions, starting with the
- # longest. All non-common positions start at 2.
- foreach my $path ( @all_paths ) {
- # Initially each element has a minimum position of 2
- # plus its position in the array (1 is the common
- # node), and a max position of the longest array
- # length minus its position in the array.
- my $range = $longest - scalar @$path;
- my $min = 2;
- foreach my $i ( 0 .. $#{$path} ) {
- my $rdg = $self->reading( $path->[$i] );
- if( $rdg->has_position ) {
- # This reading has already had a more specific
- # position set, so we need to take that into
- # account when calculating the min and max for
- # the next reading.
- my $rangeminus = $rdg->position->min - $min;
- $min = $rdg->position->min + 1;
- $range = $range - $rangeminus;
- if( $range < 0 ) {
- print STDERR "Negative range for position! " . $rdg->name . "\n"; # May remove this warning
- $range = 0;
- }
- } else {
- $rdg->position( $first->position->common, $min, $min+$range );
- $min++;
- $longest = $min+$range-2 unless $longest+2 > $min+$range; # min starts at 2 but longest assumes 0 start
- }
- }
- }
- # Now go through again and make sure the positions are
- # monotonic. Do this until they are.
- my $monotonic = 0;
- my $counter = 0;
- until( $monotonic ) {
- $monotonic = 1;
- $counter++;
- foreach my $path ( @all_paths ) {
- foreach my $i ( 0 .. $#{$path} ) {
- my $rdg = $self->reading( $path->[$i] );
- my $prior = $self->reading( $path->[$i-1] ) if $i > 0;
- my $next = $self->reading( $path->[$i+1] ) if $i < $#{$path};
- if( $prior && $rdg->position->min <= $prior->position->min ) {
- $monotonic = 0;
- $rdg->position->min( $prior->position->min + 1 );
- }
- if( $next && $rdg->position->max >= $next->position->max ) {
- $monotonic = 0;
- if( $next->position->max - 1 >= $rdg->position->min ) {
- # If moving rdg/max down would not send it below
- # rdg/min, do that.
- $rdg->position->max( $next->position->max - 1 );
- } else {
- # Otherwise increase next/max.
- $next->position->max( $rdg->position->max + 1 );
- # ...min will be fixed on the next pass.
- }
- }
- }
- }
- if( $counter > $#all_paths + 1 && !$monotonic ) {
- # We risk an infinite loop. Get out of here.
- warn "Still not monotonic after $counter passes at common point "
- . $first->position->common;
- last;
- }
- }
- print STDERR "Took $counter passes for monotonicity at " . $first->position->common. "\n"
- if $counter > 1;
-
- $first = $next;
- }
-
- } else {
-
- # Non-linear positions are pretty much impossible to pin down.
- # Any reading might appear anywhere in the graph. I guess we
- # can do positions where there aren't transpositions...
-
+ # Add the edges. Need only one edge between any pair of nodes.
+ foreach my $r ( $self->readings ) {
+ foreach my $n ( $r->neighbor_readings( 'forward' ) ) {
+ $topo_graph->add_edge_once( $rel_containers{$r->name},
+ $rel_containers{$n->name} );
+ }
+ }
+
+ # Now do the rankings, starting with the start node.
+ my $topo_start = $rel_containers{$self->start->name};
+ my $node_ranks = { $topo_start->name => 0 };
+ my @curr_origin = ( $topo_start );
+ # A little iterative function.
+ while( @curr_origin ) {
+ @curr_origin = _assign_rank( $node_ranks, @curr_origin );
+ }
+ # Transfer our rankings from the topological graph to the real one.
+ foreach my $r ( $self->readings ) {
+ $r->rank( $node_ranks->{$rel_containers{$r->name}->name} );
}
- $self->init_lemmata();
}
-# Helper function for the guts of calculate_positions.
-sub _track_paths {
- my $self = shift;
- my $track_hash = shift;
- # Args are first, last, wit, backup
- my @path = $self->reading_sequence( @_ );
- # Top and tail the array
- shift @path;
- pop @path;
- $track_hash->{join( ' ', map { $_->name } @path )} = $_[2]
- if @path;
- return @path;
+sub _assign_rank {
+ my( $node_ranks, @current_nodes ) = @_;
+ # Look at each of the children of @current_nodes. If all the child's
+ # parents have a rank, assign it the highest rank + 1 and add it to
+ # @next_nodes. Otherwise skip it.
+ my @next_nodes;
+ foreach my $c ( @current_nodes ) {
+ warn "Current reading " . $c->name . "has no rank!"
+ unless exists $node_ranks->{$c->name};
+ # print STDERR "Looking at child of node " . $c->name . ", rank "
+ # . $node_ranks->{$c->name} . "\n";
+ my @children = map { $_->to } $c->outgoing;
+ foreach my $child ( @children ) {
+ next if exists $node_ranks->{$child->name};
+ my $highest_rank = -1;
+ my $skip = 0;
+ my @parents = map { $_->from } $child->incoming;
+ foreach my $parent ( @parents ) {
+ if( exists $node_ranks->{$parent->name} ) {
+ $highest_rank = $node_ranks->{$parent->name}
+ if $highest_rank <= $node_ranks->{$parent->name};
+ } else {
+ $skip = 1;
+ last;
+ }
+ }
+ next if $skip;
+ # print STDERR "Assigning rank " . ( $highest_rank + 1 ) . " to node " . $child->name . "\n";
+ $node_ranks->{$child->name} = $highest_rank + 1;
+ push( @next_nodes, $child );
+ }
+ }
+ return @next_nodes;
}
-
+
sub possible_positions {
my $self = shift;
- my @answer;
- my %positions = ();
- foreach my $r ( $self->readings ) {
- next unless $r->has_position;
- $positions{$r->position->maxref} = 1;
- }
- @answer = keys %positions;
- return @answer;
+ my %all_pos;
+ map { $all_pos{ $_->rank } = 1 } $self->readings;
+ return keys %all_pos;
}
# TODO think about indexing this.
sub readings_at_position {
my( $self, $position, $strict ) = @_;
unless( ref( $position ) eq 'Text::Tradition::Collation::Position' ) {
- $position = Text::Tradition::Collation::Position->new( $position );
+ $position = Text::Tradition::Collation::Position->new( $position );
}
my @answer;
foreach my $r ( $self->readings ) {
- push( @answer, $r ) if $r->is_at_position( $position, $strict );
+ push( @answer, $r ) if $r->is_at_position( $position, $strict );
}
return @answer;
}
my $self = shift;
foreach my $position ( $self->possible_positions ) {
- $self->lemmata->{$position} = undef;
+ $self->lemmata->{$position} = undef;
}
foreach my $cr ( $self->common_readings ) {
- $self->lemmata->{$cr->position->maxref} = $cr->name;
+ $self->lemmata->{$cr->position->maxref} = $cr->name;
}
}
map { $fixed_positions{$_} = 0 } keys %{$positions_off};
map { $fixed_positions{$_} = 1 } $self->possible_positions;
foreach my $pos ( sort { Text::Tradition::Collation::Position::str_cmp( $a, $b ) } keys %fixed_positions ) {
- # Find the state of this position. If there is an active node,
- # its name will be the state; otherwise the state will be 0
- # (nothing at this position) or undef (ellipsis at this position)
- my $active = undef;
- $active = $self->lemmata->{$pos} if exists $self->lemmata->{$pos};
-
- # Is there a formerly active node that was toggled off?
- if( exists( $positions_off->{$pos} ) ) {
- my $off_node = $positions_off->{$pos};
- if( $active && $active ne $off_node) {
- push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
- } else {
- unless( $fixed_positions{$pos} ) {
- $active = 0;
- delete $self->lemmata->{$pos};
- }
- push( @answer, [ $off_node, $active ] );
- }
-
- # No formerly active node, so we just see if there is a currently
- # active one.
- } elsif( $active ) {
- # Push the active node, whatever it is.
- push( @answer, [ $active, 1 ] );
- } else {
- # Push the state that is there. Arbitrarily use the first node
- # at that position.
- my @pos_nodes = $self->readings_at_position( $pos );
- push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] );
- delete $self->lemmata->{$pos} unless $fixed_positions{$pos};
- }
+ # Find the state of this position. If there is an active node,
+ # its name will be the state; otherwise the state will be 0
+ # (nothing at this position) or undef (ellipsis at this position)
+ my $active = undef;
+ $active = $self->lemmata->{$pos} if exists $self->lemmata->{$pos};
+
+ # Is there a formerly active node that was toggled off?
+ if( exists( $positions_off->{$pos} ) ) {
+ my $off_node = $positions_off->{$pos};
+ if( $active && $active ne $off_node) {
+ push( @answer, [ $off_node, 0 ], [ $active, 1 ] );
+ } else {
+ unless( $fixed_positions{$pos} ) {
+ $active = 0;
+ delete $self->lemmata->{$pos};
+ }
+ push( @answer, [ $off_node, $active ] );
+ }
+
+ # No formerly active node, so we just see if there is a currently
+ # active one.
+ } elsif( $active ) {
+ # Push the active node, whatever it is.
+ push( @answer, [ $active, 1 ] );
+ } else {
+ # Push the state that is there. Arbitrarily use the first node
+ # at that position.
+ my @pos_nodes = $self->readings_at_position( $pos );
+ push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] );
+ delete $self->lemmata->{$pos} unless $fixed_positions{$pos};
+ }
}
return @answer;
return unless $rname;
my $reading = $self->reading( $rname );
if( !$reading || $reading->is_common() ) {
- # Do nothing, it's a common node.
- return;
+ # Do nothing, it's a common node.
+ return;
}
my $pos = $reading->position;
my @readings_off;
if( $old_state && $old_state eq $rname ) {
- # Turn off the node. We turn on no others by default.
- push( @readings_off, $reading );
+ # Turn off the node. We turn on no others by default.
+ push( @readings_off, $reading );
} else {
- # Turn on the node.
- $self->lemmata->{$pos->reference} = $rname;
- # Any other 'on' readings in the same position should be off
- # if we have a fixed position.
- push( @readings_off, $self->same_position_as( $reading, 1 ) )
- if $pos->fixed;
- # Any node that is an identical transposed one should be off.
- push( @readings_off, $reading->identical_readings );
+ # Turn on the node.
+ $self->lemmata->{$pos->reference} = $rname;
+ # Any other 'on' readings in the same position should be off
+ # if we have a fixed position.
+ push( @readings_off, $self->same_position_as( $reading, 1 ) )
+ if $pos->fixed;
+ # Any node that is an identical transposed one should be off.
+ push( @readings_off, $reading->identical_readings );
}
@readings_off = unique_list( @readings_off );
-
+
# Turn off the readings that need to be turned off.
my @readings_delemmatized;
foreach my $n ( @readings_off ) {
- my $npos = $n->position;
- my $state = undef;
- $state = $self->lemmata->{$npos->reference}
- if defined $self->lemmata->{$npos->reference};
- if( $state && $state eq $n->name ) {
- # this reading is still on, so turn it off
- push( @readings_delemmatized, $n );
- my $new_state = undef;
- if( $npos->fixed && $n eq $reading ) {
- # This is the reading that was clicked, so if there are no
- # other readings there and this is a fixed position, turn off
- # the position. In all other cases, restore the ellipsis.
- my @other_n = $self->same_position_as( $n ); # TODO do we need strict?
- $new_state = 0 unless @other_n;
- }
- $self->lemmata->{$npos->reference} = $new_state;
- } elsif( $old_state && $old_state eq $n->name ) {
- # another reading has already been turned on here
- push( @readings_delemmatized, $n );
- } # else some other reading was on anyway, so pass.
+ my $npos = $n->position;
+ my $state = undef;
+ $state = $self->lemmata->{$npos->reference}
+ if defined $self->lemmata->{$npos->reference};
+ if( $state && $state eq $n->name ) {
+ # this reading is still on, so turn it off
+ push( @readings_delemmatized, $n );
+ my $new_state = undef;
+ if( $npos->fixed && $n eq $reading ) {
+ # This is the reading that was clicked, so if there are no
+ # other readings there and this is a fixed position, turn off
+ # the position. In all other cases, restore the ellipsis.
+ my @other_n = $self->same_position_as( $n ); # TODO do we need strict?
+ $new_state = 0 unless @other_n;
+ }
+ $self->lemmata->{$npos->reference} = $new_state;
+ } elsif( $old_state && $old_state eq $n->name ) {
+ # another reading has already been turned on here
+ push( @readings_delemmatized, $n );
+ } # else some other reading was on anyway, so pass.
}
return @readings_delemmatized;
}
sub add_hash_entry {
my( $hash, $key, $entry ) = @_;
if( exists $hash->{$key} ) {
- push( @{$hash->{$key}}, $entry );
+ push( @{$hash->{$key}}, $entry );
} else {
- $hash->{$key} = [ $entry ];
+ $hash->{$key} = [ $entry ];
}
}
# single argument to be parsed out into a position.
my %args;
if( @_ == 1 ) {
- my( $common, $min, $max ) = parse_reference( $_[0] );
- %args = ( 'common' => $common,
- 'min' => $min,
- 'max' => $max );
+ my( $common, $min, $max ) = parse_reference( $_[0] );
+ %args = ( 'common' => $common,
+ 'min' => $min,
+ 'max' => $max );
} elsif ( 2 <= @_ && @_ <= 3 ) {
- my( $common, $min, $max ) = @_;
- $max = $min unless $max;
- %args = ( 'common' => $common,
- 'min' => $min,
- 'max' => $max );
+ my( $common, $min, $max ) = @_;
+ $max = $min unless $max;
+ %args = ( 'common' => $common,
+ 'min' => $min,
+ 'max' => $max );
} else {
- %args = @_;
+ %args = @_;
}
return $class->$orig( %args );
sub BUILD {
my $self = shift;
if( $self->min > $self->max ) {
- die "Position minimum cannot be higher than maximum";
+ die "Position minimum cannot be higher than maximum";
}
}
sub parse_reference {
my( $ref ) = @_;
if( $ref =~ /^(\d+),(\d+)(\-(\d+))?$/ ) {
- my( $common, $min, $max ) = ( $1, $2, $4 );
- $max = $min unless defined $max;
- return( $common, $min, $max );
+ my( $common, $min, $max ) = ( $1, $2, $4 );
+ $max = $min unless defined $max;
+ return( $common, $min, $max );
} else {
- warn "Bad argument $ref passed to Position constructor";
- return undef;
+ warn "Bad argument $ref passed to Position constructor";
+ return undef;
}
}
sub cmp_with {
my( $self, $other ) = @_;
return _cmp_bits( [ $self->common, $self->min, $self->max ],
- [ $other->common, $other->min, $other->max ] );
+ [ $other->common, $other->min, $other->max ] );
}
# Class method
sub _cmp_bits {
my( $a, $b ) = @_;
return $a->[0] <=> $b->[0]
- unless $a->[0] == $b->[0];
+ unless $a->[0] == $b->[0];
return $a->[1] <=> $b->[1]
- unless $a->[1] == $b->[1];
+ unless $a->[1] == $b->[1];
return $a->[2] <=> $b->[2];
}
sub is_colocated {
my( $self, $other, $strict ) = @_;
if( $strict ) {
- return $self->common == $other->common
- && $self->min == $other->min
- && $self->max == $other->max;
+ return $self->common == $other->common
+ && $self->min == $other->min
+ && $self->max == $other->max;
} else {
- return $self->common == $other->common
- && $self->min <= $other->max
- && $self->max >= $other->min;
+ return $self->common == $other->common
+ && $self->min <= $other->max
+ && $self->max >= $other->min;
}
}
isa => 'Text::Tradition::Collation::Position',
predicate => 'has_position',
);
+
+has 'rank' => (
+ is => 'rw',
+ isa => 'Int',
+ predicate => 'has_rank',
+ );
# This contains an array of reading objects; the array is a pool,
# shared by the reading objects inside the pool. When a reading is
return $self->label;
}
+sub witnessed_by {
+ my( $self, $sigil, $backup ) = @_;
+ my @wits = $self->witnesses;
+ return 1 if grep { $_ eq $sigil } @wits;
+ if( $backup ) {
+ return 1 if grep { $_ eq $backup } @wits;
+ }
+ return 0;
+}
+
+sub witnesses {
+ my( $self ) = @_;
+ my @paths = grep { $_->get_attribute( 'class' ) eq 'path' } $self->outgoing;
+ push( @paths, grep { $_->get_attribute( 'class' ) eq 'path' } $self->incoming );
+ my %wits;
+ foreach my $p ( @paths ) {
+ if( $p->has_hidden_witnesses ) {
+ foreach ( @{$p->hidden_witnesses} ) {
+ $wits{$_} = 1;
+ }
+ } else {
+ $wits{$p->label} = 1;
+ }
+ }
+ return keys %wits;
+}
+
sub merge_from {
my( $self, $merged_node ) = @_;
# Adopt the identity pool of the other node.
my $new_pool = _merge_array_pool( \@now_identical, $self->same_as )
if @now_identical;
- # TODO Adopt the relationship attributes of the other node.
+ # TODO Adopt the relationship attributes and segment memberships of the other node.
}
## Dealing with transposed readings. These methods are only really
return values( %connected );
}
-sub adjust_neighbor_position {
- my $self = shift;
- return unless $self->position->fixed;
-
- # TODO This is a naive and repetitive implementation and
- # I don't like it.
- foreach my $neighbor ( $self->neighbor_readings('forward') ) {
- next unless !$neighbor->is_common &&
- $neighbor->position->common == $self->position->common;
- if( $neighbor->position->fixed &&
- $neighbor->position->min == $self->position->min ) {
- warn sprintf( "Readings %s and %s are at the same position!",
- $neighbor->name, $self->name );
- }
- next if $neighbor->position->fixed || $neighbor->position->matched;
- $neighbor->position->min( $self->position->min + 1 );
- # Recurse if necessary.
- $neighbor->adjust_neighbor_position()
- unless $neighbor->position->fixed;
- }
- foreach my $neighbor ( $self->neighbor_readings('back') ) {
- next unless !$neighbor->is_common &&
- $neighbor->position->common == $self->position->common;
- if( $neighbor->position->fixed &&
- $neighbor->position->min == $self->position->min ) {
- warn sprintf( "Readings %s and %s are at the same position!",
- $neighbor->name, $self->name );
- }
- next if $neighbor->position->fixed || $neighbor->position->matched;
- $neighbor->position->max( $self->position->max - 1 );
- # Recurse if necessary.
- $neighbor->adjust_neighbor_position()
- unless $neighbor->position->fixed;
+# Returns all readings related to the one we've got.
+sub related_readings {
+ my( $self, $colocated ) = @_;
+ my @related;
+ foreach my $e ( $self->edges ) {
+ next unless $e->isa( 'Text::Tradition::Collation::Relationship' );
+ next if $colocated && $e->type eq 'repetition';
+ push( @related, $e->from eq $self ? $e->to : $e->from );
}
- return;
-}
-
-sub match_position {
- my( $self, $other ) = @_;
- # Adjust the position of both these nodes to be as restrictive as possible.
- unless( $self->position->is_colocated( $other->position ) ) {
- warn "Cannot match positions of non-colocated readings";
- return;
- }
- my $sp = $self->position;
- my $op = $other->position;
- my $newmin = $sp->min > $op->min ? $sp->min : $op->min;
- my $newmax = $sp->max < $op->max ? $sp->max : $op->max;
- my $newpos = Text::Tradition::Collation::Position->new(
- 'common' => $sp->common,
- 'min' => $newmin,
- 'max' => $newmax,
- 'matched' => 1,
- );
- # We are setting the positions to be the same object. I don't
- # think that actually matters. We may eventually want unique
- # objects for each position.
- $self->position( $newpos );
- $other->position( $newpos );
- $self->adjust_neighbor_position();
- $other->adjust_neighbor_position();
+ return @related;
}
## Keep track of which readings are unchanged across witnesses.
-
sub is_common {
my( $self ) = shift;
return $self->get_attribute( 'class' ) eq 'common';
is => 'rw',
isa => 'Bool',
);
+
+has 'equal_rank' => (
+ is => 'rw',
+ isa => 'Bool',
+ );
sub FOREIGNBUILDARGS {
my $class = shift;
use Moose;
use MooseX::NonMoose;
+use Text::Tradition::Collation::Position;
extends 'Graph::Easy::Node';
required => 1,
);
+has 'position' => (
+ is => 'rw',
+ isa => 'Text::Tradition::Collation::Position',
+ predicate => 'has_position',
+);
+
sub FOREIGNBUILDARGS {
my $class = shift;
my %args = @_;
sub BUILD {
my( $self, $args ) = @_;
$self->set_attribute( 'class', 'segment' );
+ my $ctr = 1;
+ foreach my $r ( @{$args->{members}} ) {
+ my $seg_edge = $r->parent->add_edge( $r, $self, $ctr++ );
+ $seg_edge->set_attribute( 'class', 'segment' );
+ }
+ unless ( grep { !$_->has_position } @{$args->{members}} ) {
+ $self->set_position;
+ }
+}
+# We use our 'members' array for the initialization, but afterward we
+# go by graph edges. This ensures that merged nodes stay merged.
+around 'members' => sub {
+ my $orig = shift;
+ my $self = shift;
+ my @members;
+ foreach my $sl ( sort { $a->name <=> $b->name }
+ grep { $_->sub_class eq 'segment' } $self->incoming ) {
+ push( @members, $sl->from );
+ }
+ return \@members;
+};
+
+sub set_position {
+ my $self = shift;
+ my( $common, $min, $max );
+ my $readings = $self->members;
foreach my $r ( @{$self->members} ) {
- my $seg_edge = $r->parent->add_edge( $r, $self, 'segment' );
- $seg_edge->set_attribute( 'class', 'segment' );
+ if( $r->has_position ) {
+ if( $common && $common != $r->position->common ) {
+ warn "Segment adding node with position skew";
+ } elsif( !$common ) {
+ $common = $r->position->common;
+ }
+ $min = $r->position->min unless $min && $min < $r->position->min;
+ $max = $r->position->max unless $max && $max > $r->position->max;
+ } else {
+ warn "Called set_position on segment which has an unpositioned reading";
+ }
}
+ $self->position( Text::Tradition::Collation::Position->new(
+ common => $common, min => $min, max => $max
+ ) );
}
-
-# For now, a segment has no position in the graph. Eventually it might
-# have the position of its first member.
-sub has_position {
- return undef;
+sub neighbor_readings {
+ my( $self, $direction ) = @_;
+ $direction = 'both' unless $direction;
+ my @answer;
+ if( $direction !~ /^back/ ) {
+ # We want forward readings.
+ push( @answer, $self->members->[0]->neighbor_readings( 'forward' ) );
+ }
+ if( $direction ne 'forward' ) {
+ # We want backward readings.
+ push( @answer, $self->members->[0]->neighbor_readings( 'back' ) );
+ }
+ return @answer;
}
no Moose;
use strict;
use warnings;
use Module::Load;
-use Algorithm::Diff;
+use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
=head1 NAME
my %all_witnesses;
my @unwitnessed_lemma_nodes;
foreach my $app ( @app_entries ) {
- my( $line, $num ) = split( /\./, $app->{_id} );
- # DEBUG with a short graph
- last if $SHORTEND && $line > $SHORTEND;
- # DEBUG for problematic entries
- my $scrutinize = '';
- my $first_line_reading = $base_line_starts[ $line ];
- my $too_far = $base_line_starts[ $line+1 ];
-
- my $lemma = $app->{rdg_0};
- my $seq = 1;
- # Is this the Nth occurrence of this reading in the line?
- if( $lemma =~ s/(_)?(\d)$// ) {
- $seq = $2;
- }
- my @lemma_words = split( /\s+/, $lemma );
-
- # Now search for the lemma words within this line.
- my $lemma_start = $first_line_reading;
- my $lemma_end;
- my %seen;
- while( $lemma_start ne $too_far ) {
- # Loop detection
- if( $seen{ $lemma_start->name() } ) {
- warn "Detected loop at " . $lemma_start->name() .
- ", ref $line,$num";
- last;
- }
- $seen{ $lemma_start->name() } = 1;
-
- # Try to match the lemma.
- my $unmatch = 0;
- print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
- $lemma_words[0] . "...\n"
- if "$line.$num" eq $scrutinize;
- if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
- # Skip it if we need a match that is not the first.
- if( --$seq < 1 ) {
- # Now we have to compare the rest of the words here.
- if( scalar( @lemma_words ) > 1 ) {
- my $next_reading =
- $collation->next_reading( $lemma_start );
- foreach my $w ( @lemma_words[1..$#lemma_words] ) {
- printf STDERR "Now matching %s against %s\n",
- cmp_str($next_reading), $w
- if "$line.$num" eq $scrutinize;
- if( $w ne cmp_str($next_reading) ) {
- $unmatch = 1;
- last;
- } else {
- $lemma_end = $next_reading;
- $next_reading =
- $collation->next_reading( $lemma_end );
- }
- }
- } else {
- $lemma_end = $lemma_start;
- }
- } else {
- $unmatch = 1;
- }
- }
- last unless ( $unmatch || !defined( $lemma_end ) );
- $lemma_end = undef;
- $lemma_start = $collation->next_reading( $lemma_start );
- }
-
- unless( $lemma_end ) {
- warn "No match found for @lemma_words at $line.$num";
- next;
- }
-
- # Now we have found the lemma; we will record an 'edit', in
- # terms of a splice operation, for each subsequent reading.
- # We also note which witnesses take the given edit.
-
- my @lemma_set = $collation->reading_sequence( $lemma_start,
- $lemma_end );
- my @reading_sets = [ @lemma_set ];
-
- # For each reading that is not rdg_0, we create the variant
- # reading nodes, and store the range as an edit operation on
- # the base text.
- my $variant_objects;
- my %pc_seen; # Keep track of mss with explicit post-corr data
- foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
- my @mss = grep { $app->{$_} eq $k } keys( %$app );
-
- # Keep track of lemma nodes that don't actually appear in
- # any MSS; we will want to remove them from the collation.
- push( @unwitnessed_lemma_nodes, @lemma_set )
- if !@mss && $k eq 'rdg_0';
-
- # Keep track of what witnesses we have seen.
- @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
- # Keep track of which witnesses bear corrected readings here.
- foreach my $m ( @mss ) {
- my $base = _is_post_corr( $m );
- next unless $base;
- $pc_seen{$base} = 1;
- }
- next if $k eq 'rdg_0';
-
- # Parse the variant into reading tokens.
- # TODO don't hardcode the reading split operation
- my @variant = split( /\s+/, $app->{$k} );
- @variant = () if $app->{$k} eq '/'; # This is an omission.
-
- my @variant_readings;
- my $ctr = 0;
- foreach my $vw ( @variant ) {
- my $vwname = "$k/$line.$num.$ctr"; $ctr++;
- my $vwreading = $collation->add_reading( $vwname );
- $vwreading->text( $vw );
- push( @variant_readings, $vwreading );
- }
-
- $variant_objects->{$k} = { 'mss' => \@mss,
- 'reading' => \@variant_readings,
- };
- push( @reading_sets, \@variant_readings );
- }
-
- # Now collate and collapse the identical readings within the
- # collated sets. Modifies the reading sets that were passed.
- collate_variants( $collation, @reading_sets );
-
- # Record any stated relationships between the nodes and the lemma.
- set_relationships( $collation, $app, \@lemma_set, $variant_objects );
-
- # Now create the splice-edit objects that will be used
- # to reconstruct each witness.
-
- foreach my $rkey ( keys %$variant_objects ) {
- # Object is argument list for splice, so:
- # offset, length, replacements
- my $edit_object = [ $lemma_start->name,
- scalar( @lemma_set ),
- $variant_objects->{$rkey}->{reading} ];
- foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
- # Is this a p.c. entry?
- my $base = _is_post_corr( $ms );
- if( $base ) { # this is a post-corr witness
- my $pc_key = $base . "_post";
- _add_hash_entry( $edits_required, $pc_key, $edit_object );
- } else { # this is an ante-corr witness
- my $pc_key = $ms . "_post";
- _add_hash_entry( $edits_required, $ms, $edit_object );
- unless( $pc_seen{$ms} ) {
- # If this witness carries no correction, add this
- # same object to its post-corrected state.
- _add_hash_entry( $edits_required, $pc_key,
- $edit_object );
- }
- }
- }
- }
+ my( $line, $num ) = split( /\./, $app->{_id} );
+ # DEBUG with a short graph
+ last if $SHORTEND && $line > $SHORTEND;
+ # DEBUG for problematic entries
+ my $scrutinize = '';
+ my $first_line_reading = $base_line_starts[ $line ];
+ my $too_far = $base_line_starts[ $line+1 ];
+
+ my $lemma = $app->{rdg_0};
+ my $seq = 1;
+ # Is this the Nth occurrence of this reading in the line?
+ if( $lemma =~ s/(_)?(\d)$// ) {
+ $seq = $2;
+ }
+ my @lemma_words = split( /\s+/, $lemma );
+
+ # Now search for the lemma words within this line.
+ my $lemma_start = $first_line_reading;
+ my $lemma_end;
+ my %seen;
+ while( $lemma_start ne $too_far ) {
+ # Loop detection
+ if( $seen{ $lemma_start->name() } ) {
+ warn "Detected loop at " . $lemma_start->name() .
+ ", ref $line,$num";
+ last;
+ }
+ $seen{ $lemma_start->name() } = 1;
+
+ # Try to match the lemma.
+ my $unmatch = 0;
+ print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
+ $lemma_words[0] . "...\n"
+ if "$line.$num" eq $scrutinize;
+ if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
+ # Skip it if we need a match that is not the first.
+ if( --$seq < 1 ) {
+ # Now we have to compare the rest of the words here.
+ if( scalar( @lemma_words ) > 1 ) {
+ my $next_reading =
+ $collation->next_reading( $lemma_start );
+ foreach my $w ( @lemma_words[1..$#lemma_words] ) {
+ printf STDERR "Now matching %s against %s\n",
+ cmp_str($next_reading), $w
+ if "$line.$num" eq $scrutinize;
+ if( $w ne cmp_str($next_reading) ) {
+ $unmatch = 1;
+ last;
+ } else {
+ $lemma_end = $next_reading;
+ $next_reading =
+ $collation->next_reading( $lemma_end );
+ }
+ }
+ } else {
+ $lemma_end = $lemma_start;
+ }
+ } else {
+ $unmatch = 1;
+ }
+ }
+ last unless ( $unmatch || !defined( $lemma_end ) );
+ $lemma_end = undef;
+ $lemma_start = $collation->next_reading( $lemma_start );
+ }
+
+ unless( $lemma_end ) {
+ warn "No match found for @lemma_words at $line.$num";
+ next;
+ }
+
+ # Now we have found the lemma; we will record an 'edit', in
+ # terms of a splice operation, for each subsequent reading.
+ # We also note which witnesses take the given edit.
+
+ my @lemma_set = $collation->reading_sequence( $lemma_start,
+ $lemma_end );
+ my @reading_sets = [ @lemma_set ];
+
+ # For each reading that is not rdg_0, we create the variant
+ # reading nodes, and store the range as an edit operation on
+ # the base text.
+ my $variant_objects;
+ my %pc_seen; # Keep track of mss with explicit post-corr data
+ foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
+ my @mss = grep { $app->{$_} eq $k } keys( %$app );
+
+ # Keep track of lemma nodes that don't actually appear in
+ # any MSS; we will want to remove them from the collation.
+ push( @unwitnessed_lemma_nodes, @lemma_set )
+ if !@mss && $k eq 'rdg_0';
+
+ # Keep track of what witnesses we have seen.
+ @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
+ # Keep track of which witnesses bear corrected readings here.
+ foreach my $m ( @mss ) {
+ my $base = _is_post_corr( $m );
+ next unless $base;
+ $pc_seen{$base} = 1;
+ }
+ next if $k eq 'rdg_0';
+
+ # Parse the variant into reading tokens.
+ # TODO don't hardcode the reading split operation
+ my @variant = split( /\s+/, $app->{$k} );
+ @variant = () if $app->{$k} eq '/'; # This is an omission.
+
+ my @variant_readings;
+ my $ctr = 0;
+ foreach my $vw ( @variant ) {
+ my $vwname = "$k/$line.$num.$ctr"; $ctr++;
+ my $vwreading = $collation->add_reading( $vwname );
+ $vwreading->text( $vw );
+ push( @variant_readings, $vwreading );
+ }
+
+ $variant_objects->{$k} = { 'mss' => \@mss,
+ 'reading' => \@variant_readings,
+ };
+ push( @reading_sets, \@variant_readings );
+ }
+
+ # Now collate and collapse the identical readings within the
+ # collated sets. Modifies the reading sets that were passed.
+ collate_variants( $collation, @reading_sets );
+
+ # Record any stated relationships between the nodes and the lemma.
+ set_relationships( $collation, $app, \@lemma_set, $variant_objects );
+
+ # Now create the splice-edit objects that will be used
+ # to reconstruct each witness.
+
+ foreach my $rkey ( keys %$variant_objects ) {
+ # Object is argument list for splice, so:
+ # offset, length, replacements
+ my $edit_object = [ $lemma_start->name,
+ scalar( @lemma_set ),
+ $variant_objects->{$rkey}->{reading} ];
+ foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
+ # Is this a p.c. entry?
+ my $base = _is_post_corr( $ms );
+ if( $base ) { # this is a post-corr witness
+ my $pc_key = $base . "_post";
+ add_hash_entry( $edits_required, $pc_key, $edit_object );
+ } else { # this is an ante-corr witness
+ my $pc_key = $ms . "_post";
+ add_hash_entry( $edits_required, $ms, $edit_object );
+ unless( $pc_seen{$ms} ) {
+ # If this witness carries no correction, add this
+ # same object to its post-corrected state.
+ add_hash_entry( $edits_required, $pc_key,
+ $edit_object );
+ }
+ }
+ }
+ }
} # Finished going through the apparatus entries
# Now make the witness objects, and create their text sequences
foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
- print STDERR "Creating witness $w\n";
- my $witness_obj = $collation->tradition->add_witness( sigil => $w );
- my $debug; # = $w eq 'Vb11';
- my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
- my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
- if exists( $edits_required->{$w."_post"} );
-
- my @repeated = _check_for_repeated( @ante_corr_seq );
- warn "Repeated elements @repeated in $w a.c."
- if @repeated;
- @repeated = _check_for_repeated( @post_corr_seq );
- warn "Repeated elements @repeated in $w p.c."
- if @repeated;
-
- # Now save these paths in my witness object
- if( @post_corr_seq ) {
- $witness_obj->path( \@post_corr_seq );
- $witness_obj->uncorrected_path( \@ante_corr_seq );
- } else {
- $witness_obj->path( \@ante_corr_seq );
- }
+ print STDERR "Creating witness $w\n";
+ my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+ my $debug; # = $w eq 'Vb11';
+ my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
+ my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
+ if exists( $edits_required->{$w."_post"} );
+
+ my @repeated = check_for_repeated( @ante_corr_seq );
+ warn "Repeated elements @repeated in $w a.c."
+ if @repeated;
+ @repeated = check_for_repeated( @post_corr_seq );
+ warn "Repeated elements @repeated in $w p.c."
+ if @repeated;
+
+ # Now save these paths in my witness object
+ if( @post_corr_seq ) {
+ $witness_obj->path( \@post_corr_seq );
+ $witness_obj->uncorrected_path( \@ante_corr_seq );
+ } else {
+ $witness_obj->path( \@ante_corr_seq );
+ }
}
# Now remove our 'base text' edges, which is to say, the only
# ones we have created so far. Also remove any unwitnessed
# lemma nodes (TODO unless we are treating base as witness)
foreach ( $collation->paths() ) {
- $collation->del_path( $_ );
+ $collation->del_path( $_ );
}
foreach( @unwitnessed_lemma_nodes ) {
- $collation->del_reading( $_ );
+ $collation->del_reading( $_ );
+ # TODO do we need to delete any relationship paths here?
}
### HACKY HACKY Do some one-off path corrections here.
if( $collation->linear ) {
- my $c = $collation;
- my $end = $SHORTEND ? $SHORTEND : 155;
- my $path = $c->tradition->witness('Vb11')->path;
- if( $end > 16 ) {
- $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
- splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
- }
- # What else?
+ my $c = $collation;
+ my $end = $SHORTEND ? $SHORTEND : 155;
+ # Vb11
+ my $path;
+ if( $end > 16 ) {
+ $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+ $path = $c->tradition->witness('Vb11')->path;
+ splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
+ $path = $c->tradition->witness('Vb11')->uncorrected_path;
+ splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
+ }
+ # What else?
+ # Vb26:
+ $path = $c->tradition->witness('Vb26')->path;
+ splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46;
+ # Vb13:
+ $path = $c->tradition->witness('Vb13')->path;
+ splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+ $path = $c->tradition->witness('Vb13')->uncorrected_path;
+ splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
+ # Vb20 a.c.:
+ $path = $c->tradition->witness('Vb20')->uncorrected_path;
+ splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
+ # Vb5:
+ $path = $c->tradition->witness('Vb5')->path;
+ splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106;
+ # extraneous:
+ $c->del_reading( 'rdg_2/147.6.13' );
+ $c->del_reading( 'rdg_2/147.6.14' );
+ $c->del_reading( 'rdg_2/147.6.15' );
+
} else {
- my $c = $collation;
- my $end = $SHORTEND ? $SHORTEND : 155;
- # Vb5:
- my $path = $c->tradition->witness('Vb5')->path;
- splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
- # Vb11:
- $path = $c->tradition->witness('Vb11')->path;
- if( $end > 16 ) {
- $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
- splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
- }
- # Vb12 a.c.:
- $path = $c->tradition->witness('Vb12')->uncorrected_path;
- splice( @$path, 1828, 1, $c->reading('rdg_2/137.5.0') ) if $end > 137;
- # Vb13:
- $path = $c->tradition->witness('Vb13')->path;
- splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
- # Vb20 a.c.:
- $path = $c->tradition->witness('Vb20')->uncorrected_path;
- splice( @$path, 1251, 1, $c->reading( '94,6' ) ) if $end > 94;
- # Vb26:
- $path = $c->tradition->witness('Vb26')->path;
- splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
+ my $c = $collation;
+ my $end = $SHORTEND ? $SHORTEND : 155;
+ # Vb5:
+ my $path = $c->tradition->witness('Vb5')->path;
+ splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
+ # Vb11:
+ $path = $c->tradition->witness('Vb11')->path;
+ if( $end > 16 ) {
+ $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
+ splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
+ }
+ # Vb13:
+ $path = $c->tradition->witness('Vb13')->path;
+ splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
+ $path = $c->tradition->witness('Vb13')->uncorrected_path;
+ splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
+ # Vb20 a.c.:
+ $path = $c->tradition->witness('Vb20')->uncorrected_path;
+ splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
+ # Vb26:
+ $path = $c->tradition->witness('Vb26')->path;
+ splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
}
- # Now walk paths and calculate positions.
- my @common_readings =
- $collation->make_witness_paths();
- $collation->calculate_positions( @common_readings );
-}
-
-sub _check_for_repeated {
- my @seq = @_;
- my %unique;
- my @repeated;
- foreach ( @seq ) {
- if( exists $unique{$_->name} ) {
- push( @repeated, $_->name );
- } else {
- $unique{$_->name} = 1;
- }
- }
- return @repeated;
+ # Now walk paths and calculate positional rank.
+ my @common_readings = $collation->make_witness_paths();
+ # Have to check relationship validity at this point, because before that
+ # we had no paths.
+# foreach my $rel ( $collation->relationships ) {
+# next unless $rel->equal_rank;
+# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
+# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
+# $rel->type, $rel->from->name, $rel->to->name );
+# }
+# }
+ $collation->calculate_ranks();
}
=item B<read_base>
open( BASE, $base_file ) or die "Could not open file $base_file: $!";
my $i = 1;
while(<BASE>) {
- # Make the readings, and connect them up for the base, but
- # also save the first reading of each line in an array for the
- # purpose.
- # TODO use configurable reading separator
- chomp;
- my @words = split;
- my $started = 0;
- my $wordref = 0;
- my $lineref = scalar @$lineref_array;
- last if $SHORTEND && $lineref > $SHORTEND;
- foreach my $w ( @words ) {
- my $readingref = join( ',', $lineref, ++$wordref );
- my $reading = $collation->add_reading( $readingref );
- $reading->text( $w );
- unless( $started ) {
- push( @$lineref_array, $reading );
- $started = 1;
- }
- # Add edge paths in the graph, for easier tracking when
- # we start applying corrections. These paths will be
- # removed when we're done.
- my $path = $collation->add_path( $last_reading, $reading,
- $collation->baselabel );
- $last_reading = $reading;
-
- # Note an array index for the reading, for later correction splices.
- $base_text_index{$readingref} = $i++;
- }
+ # Make the readings, and connect them up for the base, but
+ # also save the first reading of each line in an array for the
+ # purpose.
+ # TODO use configurable reading separator
+ chomp;
+ my @words = split;
+ my $started = 0;
+ my $wordref = 0;
+ my $lineref = scalar @$lineref_array;
+ last if $SHORTEND && $lineref > $SHORTEND;
+ foreach my $w ( @words ) {
+ my $readingref = join( ',', $lineref, ++$wordref );
+ my $reading = $collation->add_reading( $readingref );
+ $reading->text( $w );
+ unless( $started ) {
+ push( @$lineref_array, $reading );
+ $started = 1;
+ }
+ # Add edge paths in the graph, for easier tracking when
+ # we start applying corrections. These paths will be
+ # removed when we're done.
+ my $path = $collation->add_path( $last_reading, $reading,
+ $collation->baselabel );
+ $last_reading = $reading;
+
+ # Note an array index for the reading, for later correction splices.
+ $base_text_index{$readingref} = $i++;
+ }
}
close BASE;
# Ending point for all texts
- my $endpoint = $collation->add_reading( '#END#' );
- $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
- push( @$lineref_array, $endpoint );
- $base_text_index{$endpoint->name} = $i;
+ $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
+ push( @$lineref_array, $collation->end );
+ $base_text_index{$collation->end->name} = $i;
return( @$lineref_array );
}
-=item B<collate_variants>
-
-collate_variants( $collation, @reading_ranges )
-
-Given a set of readings in the form
-( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
-walks through each to identify those readings that are identical. The
-collation is a Text::Tradition::Collation object; the elements of
-@readings are Text::Tradition::Collation::Reading objects that appear
-on the collation graph.
-
-TODO: Handle collapsed and non-collapsed transpositions.
-
-=cut
-
-sub collate_variants {
- my( $collation, @reading_sets ) = @_;
-
- # Two different ways to do this, depending on whether we want
- # transposed reading nodes to be merged into one (producing a
- # nonlinear, bidirectional graph) or not (producing a relatively
- # linear, unidirectional graph.)
- return $collation->linear ? collate_linearly( @_ )
- : collate_nonlinearly( @_ );
-}
-
-sub collate_linearly {
- my( $collation, $lemma_set, @variant_sets ) = @_;
-
- my @unique;
- push( @unique, @$lemma_set );
- while( @variant_sets ) {
- my $variant_set = shift @variant_sets;
- # Use diff to do this job
- my $diff = Algorithm::Diff->new( \@unique, $variant_set,
- {'keyGen' => \&_collation_hash} );
- my @new_unique;
- my %merged;
- while( $diff->Next ) {
- if( $diff->Same ) {
- # merge the nodes
- my @l = $diff->Items( 1 );
- my @v = $diff->Items( 2 );
- foreach my $i ( 0 .. $#l ) {
- if( !$merged{$l[$i]->name} ) {
- $collation->merge_readings( $l[$i], $v[$i] );
- $merged{$l[$i]->name} = 1;
- } else {
- print STDERR "Would have double merged " . $l[$i]->name . "\n";
- }
- }
- # splice the lemma nodes into the variant set
- my( $offset ) = $diff->Get( 'min2' );
- splice( @$variant_set, $offset, scalar( @l ), @l );
- push( @new_unique, @l );
- } else {
- # Keep the old unique readings
- push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
- # Add the new readings to the 'unique' list
- push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
- }
- }
- @unique = @new_unique;
- }
-}
-
-sub collate_nonlinearly {
- my( $collation, $lemma_set, @variant_sets ) = @_;
-
- my @unique;
- push( @unique, @$lemma_set );
- while( @variant_sets ) {
- my $variant_set = shift @variant_sets;
- # Simply match the first reading that carries the same word, so
- # long as that reading has not yet been used to match another
- # word in this variant. That way lies loopy madness.
- my @distinct;
- my %merged;
- foreach my $idx ( 0 .. $#{$variant_set} ) {
- my $vw = $variant_set->[$idx];
- my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
- my $matched;
- if( @same ) {
- foreach my $i ( 0 .. $#same ) {
- unless( $merged{$same[$i]->name} ) {
- #print STDERR sprintf( "Merging %s into %s\n",
- # $vw->name,
- # $same[$i]->name );
- $collation->merge_readings( $same[$i], $vw );
- $merged{$same[$i]->name} = 1;
- $matched = $i;
- $variant_set->[$idx] = $same[$i];
- }
- }
- }
- unless( @same && defined($matched) ) {
- push( @distinct, $vw );
- }
- }
- push( @unique, @distinct );
- }
-}
-
-
-
-sub _collation_hash {
- my $node = shift;
- return cmp_str( $node );
-}
-
sub set_relationships {
my( $collation, $app, $lemma, $variants ) = @_;
foreach my $rkey ( keys %$variants ) {
- my $var = $variants->{$rkey}->{'reading'};
- my $type = $app->{sprintf( "_%s_type", $rkey )};
- my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
- my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
-
- my %rel_options = ();
- $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
- $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
-
- if( $type =~ /^(inv|tr|rep)$/i ) {
- # Transposition or repetition: look for nodes with the
- # same label but different IDs and mark them.
- $type = 'repetition' if $type =~ /^rep/i;
- $rel_options{'type'} = $type;
- my %labels;
- foreach my $r ( @$lemma ) {
- $labels{cmp_str( $r )} = $r;
- }
- foreach my $r( @$var ) {
- if( exists $labels{$r->label} &&
- $r->name ne $labels{$r->label}->name ) {
- if( $type eq 'repetition' ) {
- # Repetition
- $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
- } else {
- # Transposition
- $r->set_identical( $labels{$r->label} );
- }
- }
- }
- } elsif( $type =~ /^(gr|lex|sp(el)?)$/i ) {
-
- # Grammar/spelling/lexical: this can be a one-to-one or
- # one-to-many mapping. We should think about merging
- # readings if it is one-to-many.
-
- $type = 'grammatical' if $type =~ /gr/i;
- $type = 'spelling' if $type =~ /sp/i;
- $type = 'repetition' if $type =~ /rep/i;
- $type = 'lexical' if $type =~ /lex/i;
- $rel_options{'type'} = $type;
- if( @$lemma == @$var ) {
- foreach my $i ( 0 .. $#{$lemma} ) {
- $collation->add_relationship( $var->[$i], $lemma->[$i],
- \%rel_options );
- }
- } else {
- # An uneven many-to-many mapping. Make a segment out of
- # whatever we have.
- my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
- my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
- $collation->add_relationship( $varseg, $lemseg, \%rel_options );
- }
- } elsif( $type !~ /^(add|om)$/i ) {
- warn "Unrecognized type $type";
- }
+ my $var = $variants->{$rkey}->{'reading'};
+ my $type = $app->{sprintf( "_%s_type", $rkey )};
+ my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
+ my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
+
+ my %rel_options = ();
+ $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
+ $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
+
+ if( $type =~ /^(inv|tr|rep)$/i ) {
+ # Transposition or repetition: look for nodes with the
+ # same label but different IDs and mark them.
+ $type = 'repetition' if $type =~ /^rep/i;
+ $rel_options{'type'} = $type;
+ $rel_options{'equal_rank'} = undef;
+ my %labels;
+ foreach my $r ( @$lemma ) {
+ $labels{cmp_str( $r )} = $r;
+ }
+ foreach my $r( @$var ) {
+ if( exists $labels{$r->label} &&
+ $r->name ne $labels{$r->label}->name ) {
+ if( $type eq 'repetition' ) {
+ # Repetition
+ $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+ } else {
+ # Transposition
+ $r->set_identical( $labels{$r->label} );
+ }
+ }
+ }
+ } elsif( $type =~ /^(gr|sp(el)?)$/i ) {
+
+ # Grammar/spelling/lexical: this can be a one-to-one or
+ # one-to-many mapping. We should think about merging
+ # readings if it is one-to-many.
+
+ $type = 'grammatical' if $type =~ /gr/i;
+ $type = 'spelling' if $type =~ /sp/i;
+ $type = 'repetition' if $type =~ /rep/i;
+ # $type = 'lexical' if $type =~ /lex/i;
+ $rel_options{'type'} = $type;
+ $rel_options{'equal_rank'} = 1;
+ if( @$lemma == @$var ) {
+ foreach my $i ( 0 .. $#{$lemma} ) {
+ $collation->add_relationship( $var->[$i], $lemma->[$i],
+ \%rel_options );
+ }
+ } else {
+ # An uneven many-to-many mapping. Skip for now.
+ # We really want to make a segment out of whatever we have.
+ # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
+ # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
+ # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
+ if( @$lemma == 1 && @$var == 1 ) {
+ $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
+ }
+ }
+ } elsif( $type !~ /^(add|om|lex)$/i ) {
+ warn "Unrecognized type $type";
+ }
}
}
-
+
sub apply_edits {
my( $collation, $edit_sequence, $debug ) = @_;
my @lemma_text = $collation->reading_sequence( $collation->start,
- $collation->reading( '#END#' ) );
+ $collation->reading( '#END#' ) );
my $drift = 0;
foreach my $correction ( @$edit_sequence ) {
- my( $lemma_start, $length, $items ) = @$correction;
- my $offset = $base_text_index{$lemma_start};
- my $realoffset = $offset + $drift;
- if( $debug ||
- $lemma_text[$realoffset]->name ne $lemma_start ) {
- my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
- my @base_phrase;
- my $i = $realoffset;
- my $l = $collation->reading( $lemma_start );
- while( $i < $realoffset+$length ) {
- push( @base_phrase, $l );
- $l = $collation->next_reading( $l );
- $i++;
- }
-
- print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
- "with %s (%s) with drift %d\n",
- join( ' ', map {$_->label} @base_phrase ),
- join( ' ', map {$_->name} @base_phrase ),
- $realoffset,
- join( ' ', map {$_->label} @$items ),
- join( ' ', map {$_->name} @$items ),
- $drift,
- ) if $debug;
-
- if( $lemma_text[$realoffset]->name ne $lemma_start ) {
- warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
- "but %s (%s) is there instead",
- join( ' ', map {$_->label} @base_phrase ),
- join( ' ', map {$_->name} @base_phrase ),
- join( ' ', map {$_->label} @$items ),
- join( ' ', map {$_->name} @$items ),
- join( ' ', map {$_->label} @this_phrase ),
- join( ' ', map {$_->name} @this_phrase ),
- ) );
- # next;
- }
- }
- splice( @lemma_text, $realoffset, $length, @$items );
- $drift += @$items - $length;
+ my( $lemma_start, $length, $items ) = @$correction;
+ my $offset = $base_text_index{$lemma_start};
+ my $realoffset = $offset + $drift;
+ if( $debug ||
+ $lemma_text[$realoffset]->name ne $lemma_start ) {
+ my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
+ my @base_phrase;
+ my $i = $realoffset;
+ my $l = $collation->reading( $lemma_start );
+ while( $i < $realoffset+$length ) {
+ push( @base_phrase, $l );
+ $l = $collation->next_reading( $l );
+ $i++;
+ }
+
+ print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
+ "with %s (%s) with drift %d\n",
+ join( ' ', map {$_->label} @base_phrase ),
+ join( ' ', map {$_->name} @base_phrase ),
+ $realoffset,
+ join( ' ', map {$_->label} @$items ),
+ join( ' ', map {$_->name} @$items ),
+ $drift,
+ ) if $debug;
+
+ if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+ warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
+ "but %s (%s) is there instead",
+ join( ' ', map {$_->label} @base_phrase ),
+ join( ' ', map {$_->name} @base_phrase ),
+ join( ' ', map {$_->label} @$items ),
+ join( ' ', map {$_->name} @$items ),
+ join( ' ', map {$_->label} @this_phrase ),
+ join( ' ', map {$_->name} @this_phrase ),
+ ) );
+ # next;
+ }
+ }
+ splice( @lemma_text, $realoffset, $length, @$items );
+ $drift += @$items - $length;
}
return @lemma_text;
}
-
+
# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness. If not, return a false value.
sub _is_post_corr {
my( $sigil ) = @_;
if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
- return $1;
+ return $1;
}
return undef;
}
-sub _add_hash_entry {
- my( $hash, $key, $entry ) = @_;
- if( exists $hash->{$key} ) {
- push( @{$hash->{$key}}, $entry );
- } else {
- $hash->{$key} = [ $entry ];
- }
-}
-
-
-=item B<cmp_str>
-
-Pretend you never saw this method. Really it needs to not be hardcoded.
-
-=cut
-
-sub cmp_str {
- my( $reading ) = @_;
- my $word = $reading->label();
- $word = lc( $word );
- $word =~ s/\W//g;
- $word =~ s/v/u/g;
- $word =~ s/j/i/g;
- $word =~ s/cha/ca/g;
- $word =~ s/quatuor/quattuor/g;
- $word =~ s/ioannes/iohannes/g;
- return $word;
-}
=back
# Add the nodes to the graph. First delete the start node, because
# GraphML graphs will have their own start nodes.
$collation->del_reading( $collation->start() );
+ $collation->del_reading( $collation->end() );
my $extra_data = {}; # Keep track of info to be processed after all
# nodes have been created
foreach my $n ( @{$graph_data->{'nodes'}} ) {
- my %node_data = %$n;
- my $nodeid = delete $node_data{$IDKEY};
- my $token = delete $node_data{$CONTENTKEY};
- unless( defined $nodeid && defined $token ) {
- $DB::single = 1;
- warn "Did not find an ID or token for graph node, can't add it";
- next;
- }
- my $gnode = $collation->add_reading( $nodeid );
- $gnode->text( $token );
-
- # Whatever is left is extra info to be processed later.
- if( keys %node_data ) {
- $extra_data->{$nodeid} = \%node_data;
- }
+ my %node_data = %$n;
+ my $nodeid = delete $node_data{$IDKEY};
+ my $token = delete $node_data{$CONTENTKEY};
+ unless( defined $nodeid && defined $token ) {
+ warn "Did not find an ID or token for graph node, can't add it";
+ next;
+ }
+ my $gnode = $collation->add_reading( $nodeid );
+ $gnode->text( $token );
+
+ # Whatever is left is extra info to be processed later.
+ if( keys %node_data ) {
+ $extra_data->{$nodeid} = \%node_data;
+ }
}
-
+
# Now add the edges.
foreach my $e ( @{$graph_data->{'edges'}} ) {
- my %edge_data = %$e;
- my $from = delete $edge_data{'source'};
- my $to = delete $edge_data{'target'};
-
- # In CollateX, we have a distinct witness data ID per witness,
- # so that we can have multiple witnesses per edge. We want to
- # translate this to one witness per edge in our own
- # representation.
- foreach my $ekey ( keys %edge_data ) {
- my $wit = $edge_data{$ekey};
- # Create the witness object if it does not yet exist.
- unless( $witnesses{$wit} ) {
- $tradition->add_witness( 'sigil' => $wit );
- $witnesses{$wit} = 1;
- }
- $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
- }
+ my %edge_data = %$e;
+ my $from = delete $edge_data{'source'};
+ my $to = delete $edge_data{'target'};
+
+ # In CollateX, we have a distinct witness data ID per witness,
+ # so that we can have multiple witnesses per edge. We want to
+ # translate this to one witness per edge in our own
+ # representation.
+ foreach my $ekey ( keys %edge_data ) {
+ my $wit = $edge_data{$ekey};
+ # Create the witness object if it does not yet exist.
+ unless( $witnesses{$wit} ) {
+ $tradition->add_witness( 'sigil' => $wit );
+ $witnesses{$wit} = 1;
+ }
+ $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
+ }
}
# Process the extra node data if it exists.
foreach my $nodeid ( keys %$extra_data ) {
- my $ed = $extra_data->{$nodeid};
- if( exists $ed->{$TRANSKEY} ) {
-
- my $tn_reading = $collation->reading( $nodeid );
- my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
- if( $collation->linear ) {
- $tn_reading->set_identical( $main_reading );
- } else {
- $collation->merge_readings( $main_reading, $tn_reading );
- }
- } # else we don't have any other tags to process yet.
+ my $ed = $extra_data->{$nodeid};
+ if( exists $ed->{$TRANSKEY} ) {
+
+ my $tn_reading = $collation->reading( $nodeid );
+ my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
+ if( $collation->linear ) {
+ $tn_reading->set_identical( $main_reading );
+ } else {
+ $collation->merge_readings( $main_reading, $tn_reading );
+ }
+ } # else we don't have any other tags to process yet.
}
# Find the beginning and end nodes of the graph. The beginning node
# has no incoming edges; the end node has no outgoing edges.
my( $begin_node, $end_node );
foreach my $gnode ( $collation->readings() ) {
- # print STDERR "Checking node " . $gnode->name . "\n";
- my @outgoing = $gnode->outgoing();
- my @incoming = $gnode->incoming();
-
- unless( scalar @incoming ) {
- warn "Already have a beginning node" if $begin_node;
- $begin_node = $gnode;
- $collation->start( $gnode );
- }
- unless( scalar @outgoing ) {
- warn "Already have an ending node" if $end_node;
- $end_node = $gnode;
- }
+ # print STDERR "Checking node " . $gnode->name . "\n";
+ my @outgoing = $gnode->outgoing();
+ my @incoming = $gnode->incoming();
+
+ unless( scalar @incoming ) {
+ warn "Already have a beginning node" if $begin_node;
+ $begin_node = $gnode;
+ $collation->start( $gnode );
+ }
+ unless( scalar @outgoing ) {
+ warn "Already have an ending node" if $end_node;
+ $end_node = $gnode;
+ $collation->end( $gnode );
+ }
}
# Record for each witness its sequence of readings, and determine
my( $graphml_str ) = @_;
my $graph_hash = { 'nodes' => [],
- 'edges' => [] };
+ 'edges' => [] };
my $parser = XML::LibXML->new();
my $doc = $parser->parse_string( $graphml_str );
# First get the ID keys, for witnesses and for collation data
foreach my $k ( $xpc->findnodes( '//g:key' ) ) {
- # Each key has a 'for' attribute; the edge keys are witnesses, and
- # the node keys contain an ID and string for each node.
- my $keyid = $k->getAttribute( 'id' );
- my $keyname = $k->getAttribute( 'attr.name' );
-
- if( $k->getAttribute( 'for' ) eq 'node' ) {
- # Keep track of the XML identifiers for the data carried
- # in each node element.
- $nodedata->{$keyid} = $keyname
- } else {
- $witnesses->{$keyid} = $keyname;
- }
+ # Each key has a 'for' attribute; the edge keys are witnesses, and
+ # the node keys contain an ID and string for each node.
+ my $keyid = $k->getAttribute( 'id' );
+ my $keyname = $k->getAttribute( 'attr.name' );
+
+ if( $k->getAttribute( 'for' ) eq 'node' ) {
+ # Keep track of the XML identifiers for the data carried
+ # in each node element.
+ $nodedata->{$keyid} = $keyname
+ } else {
+ $witnesses->{$keyid} = $keyname;
+ }
}
my $graph_el = $xpc->find( '/g:graphml/g:graph' )->[0];
# Add the nodes to the graph hash.
my @nodes = $xpc->findnodes( '//g:node' );
foreach my $n ( @nodes ) {
- # Could use a better way of registering these
- my $node_hash = {};
- foreach my $dkey ( keys %$nodedata ) {
- my $keyname = $nodedata->{$dkey};
- my $keyvalue = _lookup_node_data( $n, $dkey );
- $node_hash->{$keyname} = $keyvalue if defined $keyvalue;
- }
- $node_reg->{$n->getAttribute( 'id' )} = $node_hash;
- push( @{$graph_hash->{'nodes'}}, $node_hash );
+ # Could use a better way of registering these
+ my $node_hash = {};
+ foreach my $dkey ( keys %$nodedata ) {
+ my $keyname = $nodedata->{$dkey};
+ my $keyvalue = _lookup_node_data( $n, $dkey );
+ $node_hash->{$keyname} = $keyvalue if defined $keyvalue;
+ }
+ $node_reg->{$n->getAttribute( 'id' )} = $node_hash;
+ push( @{$graph_hash->{'nodes'}}, $node_hash );
}
-
+
# Now add the edges, and cross-ref with the node objects.
my @edges = $xpc->findnodes( '//g:edge' );
foreach my $e ( @edges ) {
- my $from = $e->getAttribute('source');
- my $to = $e->getAttribute('target');
-
- # We don't know whether the edge data is one per witness
- # or one per witness type, or something else. So we just
- # save it and let our calling parser decide.
- my $edge_hash = {
- 'source' => $node_reg->{$from},
- 'target' => $node_reg->{$to},
- };
- foreach my $wkey( keys %$witnesses ) {
- my $wname = $witnesses->{$wkey};
- my $wlabel = _lookup_node_data( $e, $wkey );
- $edge_hash->{$wname} = $wlabel if $wlabel;
- }
- push( @{$graph_hash->{'edges'}}, $edge_hash );
+ my $from = $e->getAttribute('source');
+ my $to = $e->getAttribute('target');
+
+ # We don't know whether the edge data is one per witness
+ # or one per witness type, or something else. So we just
+ # save it and let our calling parser decide.
+ my $edge_hash = {
+ 'source' => $node_reg->{$from},
+ 'target' => $node_reg->{$to},
+ };
+ foreach my $wkey( keys %$witnesses ) {
+ my $wname = $witnesses->{$wkey};
+ my $wlabel = _lookup_node_data( $e, $wkey );
+ $edge_hash->{$wname} = $wlabel if $wlabel;
+ }
+ push( @{$graph_hash->{'edges'}}, $edge_hash );
}
return $graph_hash;
}
-package Text::Tradition::Parser::CSV;
+package Text::Tradition::Parser::KUL;
use strict;
use warnings;
sub parse {
my( $tradition, $graphml_str ) = @_;
- $DB::single = 1;
my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
my $collation = $tradition->collation;
my %witnesses;
# Add the nodes to the graph.
+ # TODO Are we adding extra start/end nodes?
my $extra_data = {}; # Keep track of data that needs to be processed
# after the nodes & edges are created.
print STDERR "Adding graph nodes\n";
foreach my $n ( @{$graph_data->{'nodes'}} ) {
- # Each node is either a segment or a reading, depending on
- # its class. Readings have text, segments don't.
- my %node_data = %$n;
- my $nodeid = delete $node_data{$IDKEY};
- my $reading = delete $node_data{$TOKENKEY};
- my $class = $node_data{$CLASS_KEY} || '';
- # TODO this is a hack, fix it?
- $class = 'reading' unless $class eq 'segment';
- my $method = $class eq 'segment' ? "add_$class" : "add_reading";
- my $gnode = $collation->$method( $nodeid );
- $gnode->label( $reading );
- $gnode->set_common if $class eq 'common';
-
- # Now save the rest of the data, i.e. not the ID or label,
- # if it exists.
- if ( keys %node_data ) {
- $extra_data->{$nodeid} = \%node_data;
- }
+ # Each node is either a segment or a reading, depending on
+ # its class. Readings have text, segments don't.
+ my %node_data = %$n;
+ my $nodeid = delete $node_data{$IDKEY};
+ my $reading = delete $node_data{$TOKENKEY};
+ my $class = $node_data{$CLASS_KEY} || '';
+ # TODO this is a hack, fix it?
+ $class = 'reading' unless $class eq 'segment';
+ my $method = $class eq 'segment' ? "add_$class" : "add_reading";
+ my $gnode = $collation->$method( $nodeid );
+ $gnode->label( $reading );
+ $gnode->set_common if $class eq 'common';
+
+ # Now save the rest of the data, i.e. not the ID or label,
+ # if it exists.
+ if ( keys %node_data ) {
+ $extra_data->{$nodeid} = \%node_data;
+ }
}
-
+
# Now add the edges.
print STDERR "Adding graph edges\n";
foreach my $e ( @{$graph_data->{'edges'}} ) {
- my %edge_data = %$e;
- my $from = delete $edge_data{'source'};
- my $to = delete $edge_data{'target'};
- my $class = delete $edge_data{'class'};
-
- # Whatever is left tells us what kind of edge it is.
- foreach my $wkey ( keys %edge_data ) {
- if( $wkey =~ /^witness/ ) {
- unless( $class eq 'path' ) {
- warn "Cannot add witness label to a $class edge";
- next;
- }
- my $wit = $edge_data{$wkey};
- unless( $witnesses{$wit} ) {
- $tradition->add_witness( sigil => $wit );
- $witnesses{$wit} = 1;
- }
- my $label = $wkey eq 'witness_ante_corr'
- ? $wit . $collation->ac_label : $wit;
- $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
- } elsif( $wkey eq 'relationship' ) {
- unless( $class eq 'relationship' ) {
- warn "Cannot add relationship label to a $class edge";
- next;
- }
- my $rel = $edge_data{$wkey};
- # TODO handle global relationships
- $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} );
- } else {
- my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} );
- $seg_edge->set_attribute( 'class', 'segment' );
- }
- }
+ my %edge_data = %$e;
+ my $from = delete $edge_data{'source'};
+ my $to = delete $edge_data{'target'};
+ my $class = delete $edge_data{'class'};
+
+ # Whatever is left tells us what kind of edge it is.
+ foreach my $wkey ( keys %edge_data ) {
+ if( $wkey =~ /^witness/ ) {
+ unless( $class eq 'path' ) {
+ warn "Cannot add witness label to a $class edge";
+ next;
+ }
+ my $wit = $edge_data{$wkey};
+ unless( $witnesses{$wit} ) {
+ $tradition->add_witness( sigil => $wit );
+ $witnesses{$wit} = 1;
+ }
+ my $label = $wkey eq 'witness_ante_corr'
+ ? $wit . $collation->ac_label : $wit;
+ $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
+ } elsif( $wkey eq 'relationship' ) {
+ unless( $class eq 'relationship' ) {
+ warn "Cannot add relationship label to a $class edge";
+ next;
+ }
+ my $rel = $edge_data{$wkey};
+ # TODO handle global relationships
+ $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} );
+ } else {
+ my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} );
+ $seg_edge->set_attribute( 'class', 'segment' );
+ }
+ }
}
## Deal with node information (transposition, relationships, etc.) that
print STDERR "Adding second-pass data\n";
my $linear = undef;
foreach my $nkey ( keys %$extra_data ) {
- foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
- my $this_reading = $collation->reading( $nkey );
- if( $edkey eq $TRANSPOS_KEY ) {
- my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
- # We evidently have a linear graph.
- $linear = 1;
- $this_reading->set_identical( $other_reading );
- } elsif ( $edkey eq $POSITION_KEY ) {
- $this_reading->position( $extra_data->{$nkey}->{$edkey} );
- } else {
- warn "Unfamiliar reading node data $edkey for $nkey";
- }
- }
+ foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
+ my $this_reading = $collation->reading( $nkey );
+ if( $edkey eq $TRANSPOS_KEY ) {
+ my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
+ # We evidently have a linear graph.
+ $linear = 1;
+ $this_reading->set_identical( $other_reading );
+ } elsif ( $edkey eq $POSITION_KEY ) {
+ $this_reading->position( $extra_data->{$nkey}->{$edkey} );
+ } else {
+ warn "Unfamiliar reading node data $edkey for $nkey";
+ }
+ }
}
$collation->linear( $linear );
use strict;
use warnings;
+use Text::Tradition::Parser::Util qw( collate_variants );
use XML::LibXML;
use XML::LibXML::XPathContext;
=cut
+my $text = {}; # Hash of arrays, one per eventual witness we find.
+my @common_readings;
+my $substitutions = {}; # Keep track of merged readings
+my $app_anchors = {}; # Track apparatus references
+my $app_ac = {}; # Save a.c. readings
+
+# Create the package variables for tag names.
+
+# Would really like to do this with varname variables, but apparently this
+# is considered a bad idea. The long way round then.
+my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM )
+ = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' );
+sub make_tagnames {
+ my( $ns ) = @_;
+ if( $ns ) {
+ $LISTWIT = "$ns:$LISTWIT";
+ $WITNESS = "$ns:$WITNESS";
+ $TEXT = "$ns:$TEXT";
+ $W = "$ns:$W";
+ $SEG = "$ns:$SEG";
+ $APP = "$ns:$APP";
+ $RDG = "$ns:$RDG";
+ $LEM = "$ns:$LEM";
+ }
+}
+
+# Parse the TEI file.
sub parse {
my( $tradition, $xml_str ) = @_;
my $doc = $parser->parse_string( $xml_str );
my $tei = $doc->documentElement();
my $xpc = XML::LibXML::XPathContext->new( $tei );
- $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' );
-
+ my $ns;
+ if( $tei->namespaceURI ) {
+ $ns = 'tei';
+ $xpc->registerNs( $ns, $tei->namespaceURI );
+ }
+ make_tagnames( $ns );
+
# Then get the witnesses and create the witness objects.
- foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) {
- my $sig = $wit_el->getAttribute( 'xml:id' );
- my $source = $wit_el->toString();
- $tradition->add_witness( sigil => $sig, source => $source );
+ foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) {
+ my $sig = $wit_el->getAttribute( 'xml:id' );
+ my $source = $wit_el->toString();
+ $tradition->add_witness( sigil => $sig, source => $source );
}
- # Now go through the text and make the tokens.
- # Assume for now that each word is tokenized in the XML.
- my $text = {};
- map { $text->{$_->sigil} = [] } @{$tradition->witnesses};
- my $word_ctr = 0;
- my %used_word_ids;
- foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) {
- # If it is contained within a lem or a rdg, look at those witnesses.
- # Otherwise it is common to all witnesses.
- # Also common if it is the only lem/rdg within its app.
- # Thus we are assuming non-nested apps.
-
- my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el );
- my @wits = get_sigla( $parent_rdg );
- @wits = map { $_->sigil } @{$tradition->witnesses} unless @wits;
-
- # Create the node
- my $reading = make_reading( $tradition->collation, $word_el );
-
- # Figure out if it is a common node, that is, if it is outside an apparatus
- # or the only rdg in an apparatus
- my $common = 1;
- if( $xpc->findnodes( 'ancestor::tei:app', $word_el ) ) {
- # If we are in an app we are not a common node...
- $common = 0;
- if( $xpc->findnodes( 'ancestor::tei:app/tei:rdg' )->size == 1 ) {
- # unless we are the only reading in the app.
- $common = 1;
- }
- }
- $reading->make_common if $common;
-
- foreach my $sig ( @wits ) {
- push( @{$text->{$sig}}, $reading );
- }
+ map { $text->{$_->sigil} = [] } $tradition->witnesses;
+ # Look for all word/seg node IDs and note their pre-existence.
+ my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" );
+ save_preexisting_nodeids( @attrs );
+
+ # Now go through the children of the text element and pull out the
+ # actual text.
+ foreach my $xml_el ( $xpc->findnodes( "//$TEXT" ) ) {
+ foreach my $xn ( $xml_el->childNodes ) {
+ _get_readings( $tradition, $xn );
+ }
+ }
+ # Our $text global now has lists of readings, one per witness.
+ # Join them up.
+ my $c = $tradition->collation;
+ foreach my $sig ( keys %$text ) {
+ next if $sig eq 'base'; # Skip base text readings with no witnesses.
+ # Determine the list of readings for
+ my $sequence = $text->{$sig};
+ my @real_sequence = ( $c->start );
+ push( @$sequence, $c->end );
+ my $source = $c->start;
+ foreach( _clean_sequence( $sig, $sequence ) ) {
+ my $rdg = _return_rdg( $_ );
+ push( @real_sequence, $rdg );
+ $c->add_path( $source, $rdg, $sig );
+ $source = $rdg;
+ }
+ $tradition->witness( $sig )->path( \@real_sequence );
+ # See if we need to make an a.c. version of the witness.
+ if( exists $app_ac->{$sig} ) {
+ my @uncorrected;
+ push( @uncorrected, @real_sequence );
+ foreach my $app ( keys %{$app_ac->{$sig}} ) {
+ my $start = _return_rdg( $app_anchors->{$app}->{$sig}->{'start'} );
+ my $end = _return_rdg( $app_anchors->{$app}->{$sig}->{'end'} );
+ my @new = map { _return_rdg( $_ ) } @{$app_ac->{$sig}->{$app}};
+ _replace_sequence( \@uncorrected, $start, $end, @new );
+ }
+ my $source = $c->start;
+ foreach my $rdg ( @uncorrected ) {
+ my $has_base = grep { $_->label eq $sig } $source->edges_to( $rdg );
+ if( $rdg ne $c->start && !$has_base ) {
+ print STDERR sprintf( "Adding path %s from %s -> %s\n",
+ $sig.$c->ac_label, $source->name, $rdg->name );
+ $c->add_path( $source, $rdg, $sig.$c->ac_label );
+ }
+ $source = $rdg;
+ }
+ $tradition->witness( $sig )->uncorrected_path( \@uncorrected );
+ }
+ }
+ # Delete readings that are no longer part of the graph.
+ # TODO think this is useless actually
+ foreach ( keys %$substitutions ) {
+ $tradition->collation->del_reading( $tradition->collation->reading( $_ ) );
+ }
+ $tradition->collation->calculate_positions( @common_readings );
+}
+
+sub _clean_sequence {
+ my( $wit, $sequence ) = @_;
+ my @clean_sequence;
+ foreach my $rdg ( @$sequence ) {
+ if( $rdg =~ /^PH-(.*)$/ ) {
+ # It is a placeholder. Keep it only if we need it.
+ my $app_id = $1;
+ if( exists $app_ac->{$wit}->{$app_id} ) {
+ print STDERR "Retaining empty placeholder for $app_id\n";
+ push( @clean_sequence, $rdg );
+ }
+ } else {
+ push( @clean_sequence, $rdg );
+ }
}
+ return @clean_sequence;
+}
- $DB::single = 1;
- # Now we have the text paths through the witnesses, so we can make
- # the edges.
- my $end = $tradition->collation->add_reading( '#END#' );
- foreach my $sigil ( keys %$text ) {
- my @nodes = @{$text->{$sigil}};
- my $source = $tradition->collation->start;
- foreach my $n ( @nodes ) {
- # print STDERR sprintf( "Joining %s -> %s for wit %s\n", $source->text, $n->text, $sigil );
- $tradition->collation->add_path( $source, $n, $sigil );
- $source = $n;
- }
- $tradition->collation->add_path( $source, $end, $sigil );
+sub _replace_sequence {
+ my( $arr, $start, $end, @new ) = @_;
+ my( $start_idx, $end_idx );
+ foreach my $i ( 0 .. $#{$arr} ) {
+ $start_idx = $i if( $arr->[$i]->name eq $start );
+ if( $arr->[$i]->name eq $end ) {
+ $end_idx = $i;
+ last;
+ }
+ }
+ unless( $start_idx && $end_idx ) {
+ warn "Could not find start and end";
+ return;
}
+ my $length = $end_idx - $start_idx + 1;
+ splice( @$arr, $start_idx, $length, @new );
+}
- # TODO think about relationships, transpositions, etc.
+sub _return_rdg {
+ my( $rdg ) = @_;
+ # If we were passed a reading name, return the name. If we were
+ # passed a reading object, return the object.
+ my $wantobj = ref( $rdg ) eq 'Text::Tradition::Collation::Reading';
+ my $real = $rdg;
+ if( exists $substitutions->{ $wantobj ? $rdg->name : $rdg } ) {
+ $real = $substitutions->{ $wantobj ? $rdg->name : $rdg };
+ $real = $real->name unless $wantobj;
+ }
+ return $real;
}
+## Recursive helper function to help us navigate through nested XML,
+## picking out the text. $tradition is the tradition, needed for
+## making readings; $xn is the XML node currently being looked at,
+## $in_var is a flag to say that we are inside a variant, $ac is a
+## flag to say that we are inside an ante-correctionem reading, and
+## @cur_wits is the list of witnesses to which this XML node applies.
+## Returns the list of readings, if any, created on the run.
+
+{
+ my @active_wits;
+ my $current_app;
+
+ sub _get_readings {
+ my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_;
+ @cur_wits = @active_wits unless $in_var;
+
+ my @new_readings;
+ if( $xn->nodeType == XML_TEXT_NODE ) {
+ # Some words, thus make some readings.
+ my $str = $xn->data;
+ return unless $str =~ /\S/; # skip whitespace-only text nodes
+ #print STDERR "Handling text node " . $str . "\n";
+ # Check that all the witnesses we have are active.
+ foreach my $c ( @cur_wits ) {
+ warn "Could not find $c in active wits"
+ unless grep { $c eq $_ } @active_wits;
+ }
+ $str =~ s/^\s+//;
+ my $final = $str =~ s/\s+$//;
+ foreach my $w ( split( /\s+/, $str ) ) {
+ # For now, skip punctuation.
+ next if $w !~ /[[:alnum:]]/;
+ my $rdg = make_reading( $tradition->collation, $w );
+ push( @new_readings, $rdg );
+ unless( $in_var ) {
+ push( @common_readings, $rdg );
+ $rdg->make_common;
+ }
+ foreach ( @cur_wits ) {
+ warn "Empty wit!" unless $_;
+ warn "Empty reading!" unless $rdg;
+ push( @{$text->{$_}}, $rdg ) unless $ac;
+ }
+ }
+ } elsif( $xn->nodeName eq 'w' ) {
+ # Everything in this tag is one word. Also save any original XML ID.
+ #print STDERR "Handling word " . $xn->toString . "\n";
+ # Check that all the witnesses we have are active.
+ foreach my $c ( @cur_wits ) {
+ warn "Could not find $c in active wits"
+ unless grep { $c eq $_ } @active_wits;
+ }
+ my $xml_id = $xn->getAttribute( 'xml:id' );
+ my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id );
+ push( @new_readings, $rdg );
+ unless( $in_var ) {
+ push( @common_readings, $rdg );
+ $rdg->make_common;
+ }
+ foreach( @cur_wits ) {
+ warn "Empty wit!" unless $_;
+ warn "Empty reading!" unless $rdg;
+ push( @{$text->{$_}}, $rdg ) unless $ac;
+ }
+ } elsif ( $xn->nodeName eq 'app' ) {
+ $current_app = $xn->getAttribute( 'xml:id' );
+ # print STDERR "Handling app $current_app\n";
+ # Keep the reading sets in this app.
+ my @sets;
+ # Recurse through all children (i.e. rdgs) for sets of words.
+ foreach ( $xn->childNodes ) {
+ my @rdg_set = _get_readings( $tradition, $_, $in_var, $ac, @cur_wits );
+ push( @sets, \@rdg_set ) if @rdg_set;
+ }
+ # Now collate these sets if we have more than one.
+ my $subs = collate_variants( $tradition->collation, @sets ) if @sets > 1;
+ map { $substitutions->{$_} = $subs->{$_} } keys %$subs;
+ # TODO Look through substitutions to see if we can make anything common now.
+ # Return the entire set of unique readings.
+ my %unique;
+ foreach my $s ( @sets ) {
+ map { $unique{$_->name} = $_ } @$s;
+ }
+ push( @new_readings, values( %unique ) );
+ # Exit the current app.
+ $current_app = '';
+ } elsif ( $xn->nodeName eq 'lem' || $xn->nodeName eq 'rdg' ) {
+ # Alter the current witnesses and recurse.
+ #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n";
+ $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.';
+ my @rdg_wits = get_sigla( $xn );
+ @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings
+ my @words;
+ foreach ( $xn->childNodes ) {
+ my @rdg_set = _get_readings( $tradition, $_, 1, $ac, @rdg_wits );
+ push( @words, @rdg_set ) if @rdg_set;
+ }
+ # If we have more than one word in a reading, it should become a segment.
+ # $tradition->collation->add_segment( @words ) if @words > 1;
+
+ if( $ac ) {
+ # Add the reading set to the a.c. readings.
+ foreach ( @rdg_wits ) {
+ $app_ac->{$_}->{$current_app} = \@words;
+ }
+ } else {
+ # Add the reading set to the app anchors for each witness
+ # or put in placeholders for empty p.c. readings
+ foreach ( @rdg_wits ) {
+ my $start = @words ? $words[0]->name : "PH-$current_app";
+ my $end = @words ? $words[-1]->name : "PH-$current_app";
+ $app_anchors->{$current_app}->{$_}->{'start'} = $start;
+ $app_anchors->{$current_app}->{$_}->{'end'} = $end;
+ push( @{$text->{$_}}, $start ) unless @words;
+ }
+ }
+ push( @new_readings, @words );
+ } elsif( $xn->nodeName eq 'witStart' ) {
+ # Add the relevant wit(s) to the active list.
+ #print STDERR "Handling witStart\n";
+ push( @active_wits, @cur_wits );
+ } elsif( $xn->nodeName eq 'witEnd' ) {
+ # Take the relevant wit(s) out of the list.
+ #print STDERR "Handling witEnd\n";
+ my $regexp = '^(' . join( '|', @cur_wits ) . ')$';
+ @active_wits = grep { $_ !~ /$regexp/ } @active_wits;
+ } elsif( $xn->nodeName eq 'witDetail' ) {
+ # Ignore these for now.
+ return;
+ } else {
+ # Recurse as if this tag weren't there.
+ #print STDERR "Recursing on tag " . $xn->nodeName . "\n";
+ foreach( $xn->childNodes ) {
+ push( @new_readings, _get_readings( $tradition, $_, $in_var, $ac, @cur_wits ) );
+ }
+ }
+ return @new_readings;
+ }
+
+}
+
+# Helper to extract a list of witness sigla from a reading element.
sub get_sigla {
my( $rdg ) = @_;
# Cope if we have been handed a NodeList. There is only
# one reading here.
if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) {
- $rdg = $rdg->shift;
+ $rdg = $rdg->shift;
}
my @wits;
if( ref( $rdg ) eq 'XML::LibXML::Element' ) {
- @wits = split( /\s+/, $rdg->getAttribute( 'wit' ) );
- map { $_ =~ s/^\#// } @wits;
+ my $witstr = $rdg->getAttribute( 'wit' );
+ $witstr =~ s/^\s+//;
+ $witstr =~ s/\s+$//;
+ @wits = split( /\s+/, $witstr );
+ map { $_ =~ s/^\#// } @wits;
}
return @wits;
}
+# Helper with its counters to actually make the readings.
{
my $word_ctr = 0;
my %used_nodeids;
+ sub save_preexisting_nodeids {
+ foreach( @_ ) {
+ $used_nodeids{$_->getValue()} = 1;
+ }
+ }
+
sub make_reading {
- my( $graph, $word_el) = @_;
- my $xml_id = $word_el->getAttribute( 'xml:id' );
- if( $xml_id && exists $used_nodeids{$xml_id} ) {
- warn "Already used assigned ID $xml_id";
- $xml_id = undef;
- }
- if( !$xml_id ) {
- until( $xml_id ) {
- my $try_id = 'w'.$word_ctr++;
- next if exists $used_nodeids{$try_id};
- $xml_id = $try_id;
- }
- }
- my $rdg = $graph->add_reading( $xml_id );
- $rdg->text( $word_el->textContent() );
- $used_nodeids{$xml_id} = $rdg;
- return $rdg;
+ my( $graph, $word, $xml_id ) = @_;
+ if( $xml_id ) {
+ if( exists $used_nodeids{$xml_id} ) {
+ if( $used_nodeids{$xml_id} != 1 ) {
+ warn "Already used assigned XML ID somewhere else!";
+ $xml_id = undef;
+ }
+ } else {
+ warn "Undetected pre-existing XML ID";
+ }
+ }
+ if( !$xml_id ) {
+ until( $xml_id ) {
+ my $try_id = 'w'.$word_ctr++;
+ next if exists $used_nodeids{$try_id};
+ $xml_id = $try_id;
+ }
+ }
+ my $rdg = $graph->add_reading( $xml_id );
+ $rdg->text( $word );
+ $used_nodeids{$xml_id} = $rdg;
+ return $rdg;
}
}
--- /dev/null
+package Text::Tradition::Parser::Util;
+
+use strict;
+use warnings;
+use Algorithm::Diff;
+use Exporter 'import';
+use vars qw/ @EXPORT_OK /;
+@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
+
+=item B<collate_variants>
+
+collate_variants( $collation, @reading_ranges )
+
+Given a set of readings in the form
+( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
+walks through each to identify those readings that are identical. The
+collation is a Text::Tradition::Collation object; the elements of
+@readings are Text::Tradition::Collation::Reading objects that appear
+on the collation graph.
+
+TODO: Handle collapsed and non-collapsed transpositions.
+
+=cut
+
+sub collate_variants {
+ my( $collation, @reading_sets ) = @_;
+
+ # Two different ways to do this, depending on whether we want
+ # transposed reading nodes to be merged into one (producing a
+ # nonlinear, bidirectional graph) or not (producing a relatively
+ # linear, unidirectional graph.)
+ return $collation->linear ? collate_linearly( @_ )
+ : collate_nonlinearly( @_ );
+}
+
+sub collate_linearly {
+ my( $collation, $lemma_set, @variant_sets ) = @_;
+
+ my @unique;
+ my $substitutions = {};
+ push( @unique, @$lemma_set );
+ while( @variant_sets ) {
+ my $variant_set = shift @variant_sets;
+ # Use diff to do this job
+ my $diff = Algorithm::Diff->new( \@unique, $variant_set,
+ {'keyGen' => \&_collation_hash} );
+ my @new_unique;
+ my %merged;
+ while( $diff->Next ) {
+ if( $diff->Same ) {
+ # merge the nodes
+ my @l = $diff->Items( 1 );
+ my @v = $diff->Items( 2 );
+ foreach my $i ( 0 .. $#l ) {
+ if( !$merged{$l[$i]->name} ) {
+ print STDERR sprintf( "Merging %s into %s\n",
+ $v[$i]->name,
+ $l[$i]->name );
+ $collation->merge_readings( $l[$i], $v[$i] );
+ $merged{$l[$i]->name} = 1;
+ $substitutions->{$v[$i]->name} = $l[$i];
+ } else {
+ print STDERR "Would have double merged " . $l[$i]->name . "\n";
+ }
+ }
+ # splice the lemma nodes into the variant set
+ my( $offset ) = $diff->Get( 'min2' );
+ splice( @$variant_set, $offset, scalar( @l ), @l );
+ push( @new_unique, @l );
+ } else {
+ # Keep the old unique readings
+ push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
+ # Add the new readings to the 'unique' list
+ push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
+ }
+ }
+ @unique = @new_unique;
+ }
+ return $substitutions;
+}
+
+sub collate_nonlinearly {
+ my( $collation, $lemma_set, @variant_sets ) = @_;
+
+ my @unique;
+ my $substitutions = {};
+ push( @unique, @$lemma_set );
+ while( @variant_sets ) {
+ my $variant_set = shift @variant_sets;
+ # Simply match the first reading that carries the same word, so
+ # long as that reading has not yet been used to match another
+ # word in this variant. That way lies loopy madness.
+ my @distinct;
+ my %merged;
+ foreach my $idx ( 0 .. $#{$variant_set} ) {
+ my $vw = $variant_set->[$idx];
+ my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
+ my $matched;
+ if( @same ) {
+ foreach my $i ( 0 .. $#same ) {
+ unless( $merged{$same[$i]->name} ) {
+ #print STDERR sprintf( "Merging %s into %s\n",
+ # $vw->name,
+ # $same[$i]->name );
+ $collation->merge_readings( $same[$i], $vw );
+ $merged{$same[$i]->name} = 1;
+ $matched = $i;
+ $variant_set->[$idx] = $same[$i];
+ $substitutions->{$vw->name} = $same[$i];
+ }
+ }
+ }
+ unless( @same && defined($matched) ) {
+ push( @distinct, $vw );
+ }
+ }
+ push( @unique, @distinct );
+ }
+ return $substitutions;
+}
+
+sub _collation_hash {
+ my $node = shift;
+ return cmp_str( $node );
+}
+
+=item B<cmp_str>
+
+Pretend you never saw this method. Really it needs to not be hardcoded.
+
+=cut
+
+sub cmp_str {
+ my( $reading ) = @_;
+ my $word = $reading->label();
+ $word = lc( $word );
+ $word =~ s/\W//g;
+ $word =~ s/v/u/g;
+ $word =~ s/j/i/g;
+ $word =~ s/cha/ca/g;
+ $word =~ s/quatuor/quattuor/g;
+ $word =~ s/ioannes/iohannes/g;
+ return $word;
+}
+
+=item B<collate_variants>
+
+my @rep = check_for_repeated( @readings )
+
+Given an array of items, returns any items that appear in the array more
+than once.
+
+=cut
+
+sub check_for_repeated {
+ my @seq = @_;
+ my %unique;
+ my @repeated;
+ foreach ( @seq ) {
+ if( exists $unique{$_->name} ) {
+ push( @repeated, $_->name );
+ } else {
+ $unique{$_->name} = 1;
+ }
+ }
+ return @repeated;
+}
+
+sub add_hash_entry {
+ my( $hash, $key, $entry ) = @_;
+ if( exists $hash->{$key} ) {
+ push( @{$hash->{$key}}, $entry );
+ } else {
+ $hash->{$key} = [ $entry ];
+ }
+}
+
+sub is_monotonic {
+ my( @readings ) = @_;
+ my( $common, $min, $max ) = ( -1, -1, -1 );
+ foreach my $rdg ( @readings ) {
+# print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - "
+# . $rdg->position->reference ."\n";
+ return 0 if $rdg->position->common < $common;
+ if( $rdg->position->common == $common ) {
+ return 0 if $rdg->position->min <= $min;
+ return 0 if $rdg->position->max <= $max;
+ }
+ $common = $rdg->position->common;
+ $min = $rdg->position->min;
+ $max = $rdg->position->max;
+ }
+ return 1;
+}
\ No newline at end of file
sub make_character_matrix {
my $self = shift;
unless( $self->collation->linear ) {
- warn "Need a linear graph in order to make an alignment table";
- return;
+ warn "Need a linear graph in order to make an alignment table";
+ return;
}
- my @all_pos = sort { Text::Tradition::Collation::Position::str_cmp( $a, $b ) }
- $self->collation->possible_positions;
- my $table = [];
- my $characters = {};
- map { $characters->{$_} = {} } @all_pos;
- foreach my $wit ( @{$self->collation->tradition->witnesses} ) {
- # First implementation: make dumb alignment table, caring about
- # nothing except which reading is in which position.
- my $sigilfield = sprintf( "%-10s", $wit->sigil );
- push( @$table, [ $sigilfield, make_witness_row( $characters, $wit->path,
- \@all_pos ) ] );
- if( $wit->has_ante_corr ) {
- $sigilfield = sprintf( "%-10s", $wit->sigil . "_ac" );
- push( @$table, [ $sigilfield,
- make_witness_row( $characters, $wit->uncorrected_path,
- \@all_pos ) ] );
- }
+ my $table = $self->collation->make_alignment_table;
+ # Push the names of the witnesses to initialize the rows of the matrix.
+ my @matrix = map { [ $self->_normalize_ac( $_ ) ] } @{$table->[0]};
+ $DB::single = 1;
+ foreach my $token_index ( 1 .. $#{$table} ) {
+ # First implementation: make dumb alignment table, caring about
+ # nothing except which reading is in which position.
+ my @chars = convert_characters( $table->[$token_index] );
+ foreach my $idx ( 0 .. $#matrix ) {
+ push( @{$matrix[$idx]}, $chars[$idx] );
+ }
}
- $self->_save_character_matrix( $table );
-}
+ $self->_save_character_matrix( \@matrix );
+}
-sub make_witness_row {
- my( $characters, $path, $positions ) = @_;
- my %char_hash;
- map { $char_hash{$_} = 'X' } @$positions;
- foreach my $rdg( @$path ) {
- $char_hash{$rdg->position->minref} = get_character( $rdg, $characters );
+sub _normalize_ac {
+ my( $self, $witname ) = @_;
+ my $ac = $self->collation->ac_label;
+ if( $witname =~ /(.*)\Q$ac\E$/ ) {
+ $witname = $1 . '_ac';
}
- my @row = map { $char_hash{$_} } @$positions;
- return @row;
+ return sprintf( "%-10s", $witname );
}
-
-sub get_character {
- my( $reading, $characters ) = @_;
- my $this_pos = $characters->{$reading->position->minref};
+sub convert_characters {
+ my $row = shift;
# This is a simple algorithm that treats every reading as different.
# Eventually we will want to be able to specify how relationships
# affect the character matrix.
- my $text = $reading->text;
- unless( exists $this_pos->{$text} ) {
- # We need to find what the next character is here, and record it.
- my @all_chr = sort { $a <=> $b } values( %$this_pos );
- if( @all_chr == 8 ) {
- warn "Already have eight variants at position "
- . $reading->position->minref . "; not adding " . $reading->text;
- return '?';
- }
- $this_pos->{$text} = scalar @all_chr;
+ my %unique = ( '__UNDEF__' => 'X' );
+ my $ctr = 0;
+ foreach my $word ( @$row ) {
+ if( $word && !exists $unique{$word} ) {
+ $unique{$word} = chr( 65 + $ctr );
+ $ctr++;
+ }
}
- return $this_pos->{$text};
+ if( scalar( keys %unique ) > 8 ) {
+ warn "Have more than 8 variants on this location; pars will break";
+ }
+ my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row;
+ return @chars;
}
sub pars_input {
my $columns = scalar @{$self->character_matrix->[0]} - 1;
$matrix .= "\t$rows\t$columns\n";
foreach my $row ( @{$self->character_matrix} ) {
- $matrix .= join( '', @$row ) . "\n";
+ $matrix .= join( '', @$row ) . "\n";
}
return $matrix;
}
# Set up a temporary directory for all the default Phylip files.
my $phylip_dir = File::Temp->newdir();
- $DB::single = 1;
# We need an infile, and we need a command input file.
open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
print MATRIX $self->pars_input();
my $PHYLIP_PATH = '/Users/tla/Projects/phylip-3.69/exe';
my $program = "pars";
if( $^O eq 'darwin' ) {
- $program = "$PHYLIP_PATH/$program.app/Contents/MacOS/$program";
+ $program = "$PHYLIP_PATH/$program.app/Contents/MacOS/$program";
} else {
- $program = "$PHYLIP_PATH/$program";
+ $program = "$PHYLIP_PATH/$program";
}
{
- # We need to run it in our temporary directory where we have created
- # all the expected files.
- local $CWD = $phylip_dir;
- my @cmd = ( $program );
- run \@cmd, '<', 'cmdfile', '>', '/dev/null';
+ # We need to run it in our temporary directory where we have created
+ # all the expected files.
+ local $CWD = $phylip_dir;
+ my @cmd = ( $program );
+ run \@cmd, '<', 'cmdfile', '>', '/dev/null';
}
# Now our output should be in 'outfile' and our tree in 'outtree',
# both in the temp directory.
my @outtree;
if( -f "$phylip_dir/outtree" ) {
- open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
- @outtree = <TREE>;
- close TREE;
+ open( TREE, "$phylip_dir/outtree" ) or die "Could not open outtree for read";
+ @outtree = <TREE>;
+ close TREE;
}
return( 1, join( '', @outtree ) ) if @outtree;
my @error;
if( -f "$phylip_dir/outfile" ) {
- open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
- @error = <OUTPUT>;
- close OUTPUT;
+ open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
+ @error = <OUTPUT>;
+ close OUTPUT;
} else {
- push( @error, "Neither outtree nor output file was produced!" );
+ push( @error, "Neither outtree nor output file was produced!" );
}
return( undef, join( '', @error ) );
}
--- /dev/null
+#!/usr/bin/env perl
+
+use lib 'lib';
+use strict;
+use warnings;
+use Getopt::Long;
+use Text::Tradition;
+use Text::Tradition::Stemma;
+
+binmode STDERR, ":utf8";
+binmode STDOUT, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my( $informat, $inbase, $outformat, $help, $linear, $HACK )
+ = ( '', '', '', '', 1, 0 );
+
+GetOptions( 'i|in=s' => \$informat,
+ 'b|base=s' => \$inbase,
+ 'o|out=s' => \$outformat,
+ 'l|linear!' => \$linear,
+ 'h|help' => \$help,
+ 'hack' => \$HACK,
+ );
+
+if( $help ) {
+ help();
+}
+
+unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX)$/i ) {
+ help( "Input format must be one of CollateX, CSV, CTE, Self, TEI" );
+}
+$informat = 'CollateX' if $informat =~ /^c(ollate)?x$/i;
+$informat = 'KUL' if $informat =~ /^kul$/i;
+$informat = 'CTE' if $informat =~ /^cte$/i;
+$informat = 'Self' if $informat =~ /^self$/i;
+$informat = 'TEI' if $informat =~ /^tei$/i;
+
+unless( $outformat =~ /^(graphml|svg|dot|stemma|csv)$/ ) {
+ help( "Output format must be one of graphml, svg, csv, stemma, or dot" );
+}
+
+# Do we have a base if we need it?
+if( $informat eq 'KUL' && !$inbase ) {
+ help( "$informat input needs a base text" );
+}
+
+# CSV parsing requires a filename; XML parsing requires a string.
+my $input = $ARGV[0];
+unless( $informat eq 'KUL' || $informat eq 'CSV' ) {
+ my @lines;
+ open( INFILE, "$input" ) or die "Could not read $input";
+ @lines = <INFILE>;
+ close INFILE;
+ $input = join( '', @lines );
+}
+
+# First: read the base. Make a graph, but also note which
+# nodes represent line beginnings.
+my %args = ( $informat => $input,
+ 'linear' => $linear );
+$args{'base'} = $inbase if $inbase;
+my $tradition = Text::Tradition->new( %args );
+
+### Custom hacking
+# Remove witnesses C, E, G in the Matthew text
+if( $HACK ) {
+ foreach( $tradition->collation->paths() ) {
+ $tradition->collation->del_path( $_ ) if $_->label =~ /^[ceg]$/i;
+ }
+ foreach( $tradition->collation->readings() ) {
+ if( !$_->outgoing() && !$_->incoming() ) {
+ print STDERR "Deleting reading " . $_->label . "\n";
+ $tradition->collation->del_reading( $_ );
+ }
+ }
+}
+
+# Now output what we have been asked to.
+if( $outformat eq 'stemma' ) {
+ my $stemma = Text::Tradition::Stemma->new(
+ 'collation' => $tradition->collation );
+ my( $result, $tree ) = $stemma->run_pars();
+ if( $result ) {
+ print $tree;
+ } else {
+ print STDERR "Bad result: $tree";
+ }
+} else {
+ my $output = "as_$outformat";
+ print $tradition->collation->$output();
+}
+
+sub help {
+ my( $msg ) = @_;
+ print STDERR << "EOF"
+Usage: $0 -i [format] -o [format] (--base [filename]) (--(no)linear) [inputfile]
+ i, input: Format of the input file. Must be one of CollateX, CSV, CTE, Self, TEI.
+ o, output: Format of the output. Must be one of svg, dot, graphml, csv, stemma.
+ b, base: Filename that contains a base text. Needed for CSV input.
+ l, linear: Treat transposed readings separately, producing a linear graph.
+ If nolinear, treat transposed readings as the same node.
+ h, help: Print this message.
+EOF
+ ;
+ if( $msg ) {
+ print STDERR "$msg\n";
+ }
+ exit ($msg ? 1 : 0 );
+}