);
-has 'tradition' => (
+has 'tradition' => ( # TODO should this not be ro?
is => 'rw',
isa => 'Text::Tradition',
);
my @joined = ( [ $source->name, $target->name ] ); # Keep track of the nodes we join.
$options->{'this_relation'} = [ $source, $target ];
- my $rel = Text::Tradition::Collation::Relationship->new( %$options );
+ my $rel;
+ eval { $rel = Text::Tradition::Collation::Relationship->new( %$options ) };
+ if( $@ ) {
+ return ( undef, $@ );
+ }
$self->graph->add_edge( $source, $target, $rel );
if( $options->{'global'} ) {
# Look for all readings with the source label, and if there are
my @edges = $view eq 'relationship' ? $self->relationships : $self->paths;
foreach my $edge ( @edges ) {
- $dot .= sprintf( "\t\"%s\" -> \"%s\" [ color=\"%s\", fontcolor=\"%s\", label=\"%s\" ]\n",
- $edge->from->name, $edge->to->name, '#000000', '#000000', $edge->label );
+ my %variables = ( 'color' => '#000000',
+ 'fontcolor' => '#000000',
+ 'label' => $edge->label,
+ );
+ my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
+ $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ]\n",
+ $edge->from->name, $edge->to->name, $varopts );
}
-
$dot .= "}\n";
return $dot;
}
$root->setNamespace( $xsi_ns, 'xsi', 0 );
$root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
+ # TODO Add some global graph data
+
# Add the data keys for nodes
my %node_data_keys;
my $ndi = 0;
$node_el->setAttribute( 'id', $node_xmlid );
_add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
_add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label );
- _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position );
+ _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference )
+ if $n->has_position;
_add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
_add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name )
if $n->has_primary;
}
# Transform the path values from unique strings to arrays.
+ my @all_paths;
foreach my $k ( keys %paths ) {
- my @v = split( /\s+/, $paths{$k} );
- $paths{$k} = \@v;
+ my @v = split( /\s+/, $k );
+ push( @all_paths, \@v );
}
+ @all_paths = sort { scalar @$b <=> scalar @$a } @all_paths;
# Now %paths has all the unique paths, and we know how long the
# longest of these is. Assign positions, starting with the
# longest. All non-common positions start at 2.
- foreach my $path ( sort { scalar @$b <=> scalar @$a } values %paths ) {
+ foreach my $path ( @all_paths ) {
+ # Initially each element has a minimum position of 2
+ # plus its position in the array (1 is the common
+ # node), and a max position of the longest array
+ # length minus its position in the array.
my $range = $longest - scalar @$path;
+ my $min = 2;
foreach my $i ( 0 .. $#{$path} ) {
- my $min = $i+2;
my $rdg = $self->reading( $path->[$i] );
- unless( $rdg->has_position ) {
+ if( $rdg->has_position ) {
+ # This reading has already had a more specific
+ # position set, so we need to take that into
+ # account when calculating the min and max for
+ # the next reading.
+ my $rangeminus = $rdg->position->min - $min;
+ $min = $rdg->position->min + 1;
+ $range = $range - $rangeminus;
+ if( $range < 0 ) {
+ print STDERR "Negative range for position! " . $rdg->name . "\n"; # May remove this warning
+ $range = 0;
+ }
+ } else {
$rdg->position( $first->position->common, $min, $min+$range );
+ $min++;
+ $longest = $min+$range-2 unless $longest+2 > $min+$range; # min starts at 2 but longest assumes 0 start
+ }
+ }
+ }
+ # Now go through again and make sure the positions are
+ # monotonic. Do this until they are.
+ my $monotonic = 0;
+ my $counter = 0;
+ until( $monotonic ) {
+ $monotonic = 1;
+ $counter++;
+ foreach my $path ( @all_paths ) {
+ foreach my $i ( 0 .. $#{$path} ) {
+ my $rdg = $self->reading( $path->[$i] );
+ my $prior = $self->reading( $path->[$i-1] ) if $i > 0;
+ my $next = $self->reading( $path->[$i+1] ) if $i < $#{$path};
+ if( $prior && $rdg->position->min <= $prior->position->min ) {
+ $monotonic = 0;
+ $rdg->position->min( $prior->position->min + 1 );
+ }
+ if( $next && $rdg->position->max >= $next->position->max ) {
+ $monotonic = 0;
+ if( $next->position->max - 1 >= $rdg->position->min ) {
+ # If moving rdg/max down would not send it below
+ # rdg/min, do that.
+ $rdg->position->max( $next->position->max - 1 );
+ } else {
+ # Otherwise increase next/max.
+ $next->position->max( $rdg->position->max + 1 );
+ # ...min will be fixed on the next pass.
+ }
+ }
}
}
+ if( $counter > $#all_paths + 1 && !$monotonic ) {
+ # We risk an infinite loop. Get out of here.
+ warn "Still not monotonic after $counter passes at common point "
+ . $first->position->common;
+ last;
+ }
}
+ print STDERR "Took $counter passes for monotonicity at " . $first->position->common. "\n"
+ if $counter > 1;
$first = $next;
}
+
} else {
# Non-linear positions are pretty much impossible to pin down.
# can do positions where there aren't transpositions...
}
-
$self->init_lemmata();
}
# Top and tail the array
shift @path;
pop @path;
- $track_hash->{$_[2]} = join( ' ', map { $_->name } @path )
+ $track_hash->{join( ' ', map { $_->name } @path )} = $_[2]
if @path;
return @path;
}
my %node_data = %$n;
my $nodeid = delete $node_data{$IDKEY};
my $token = delete $node_data{$CONTENTKEY};
- unless( $nodeid && $token ) {
+ unless( defined $nodeid && defined $token ) {
+ $DB::single = 1;
warn "Did not find an ID or token for graph node, can't add it";
next;
}
foreach my $dkey ( keys %$nodedata ) {
my $keyname = $nodedata->{$dkey};
my $keyvalue = _lookup_node_data( $n, $dkey );
- $node_hash->{$keyname} = $keyvalue if $keyvalue;
+ $node_hash->{$keyname} = $keyvalue if defined $keyvalue;
}
$node_reg->{$n->getAttribute( 'id' )} = $node_hash;
push( @{$graph_hash->{'nodes'}}, $node_hash );
=cut
-my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY )
- = qw/ name reading identical position /;
+my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY, $CLASS_KEY )
+ = qw/ name reading identical position class /;
sub parse {
my( $tradition, $graphml_str ) = @_;
+ $DB::single = 1;
my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
my $collation = $tradition->collation;
my $extra_data = {}; # Keep track of data that needs to be processed
# after the nodes & edges are created.
+ print STDERR "Adding graph nodes\n";
foreach my $n ( @{$graph_data->{'nodes'}} ) {
- # Could use a better way of registering these
+ # Each node is either a segment or a reading, depending on
+ # its class. Readings have text, segments don't.
my %node_data = %$n;
my $nodeid = delete $node_data{$IDKEY};
my $reading = delete $node_data{$TOKENKEY};
- my $gnode = $collation->add_reading( $nodeid );
- $gnode->text( $reading );
+ my $class = $node_data{$CLASS_KEY} || '';
+ # TODO this is a hack, fix it?
+ $class = 'reading' unless $class eq 'segment';
+ my $method = $class eq 'segment' ? "add_$class" : "add_reading";
+ my $gnode = $collation->$method( $nodeid );
+ $gnode->label( $reading );
+ $gnode->set_common if $class eq 'common';
# Now save the rest of the data, i.e. not the ID or label,
# if it exists.
}
# Now add the edges.
+ print STDERR "Adding graph edges\n";
foreach my $e ( @{$graph_data->{'edges'}} ) {
my %edge_data = %$e;
my $from = delete $edge_data{'source'};
my $to = delete $edge_data{'target'};
+ my $class = delete $edge_data{'class'};
# Whatever is left tells us what kind of edge it is.
foreach my $wkey ( keys %edge_data ) {
if( $wkey =~ /^witness/ ) {
+ unless( $class eq 'path' ) {
+ warn "Cannot add witness label to a $class edge";
+ next;
+ }
my $wit = $edge_data{$wkey};
unless( $witnesses{$wit} ) {
$tradition->add_witness( sigil => $wit );
my $label = $wkey eq 'witness_ante_corr'
? $wit . $collation->ac_label : $wit;
$collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
- } else {
+ } elsif( $wkey eq 'relationship' ) {
+ unless( $class eq 'relationship' ) {
+ warn "Cannot add relationship label to a $class edge";
+ next;
+ }
my $rel = $edge_data{$wkey};
# TODO handle global relationships
$collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} );
+ } else {
+ my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} );
+ $seg_edge->set_attribute( 'class', 'segment' );
}
}
}
## Deal with node information (transposition, relationships, etc.) that
## needs to be processed after all the nodes are created.
+ print STDERR "Adding second-pass data\n";
+ my $linear = undef;
foreach my $nkey ( keys %$extra_data ) {
foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
my $this_reading = $collation->reading( $nkey );
if( $edkey eq $TRANSPOS_KEY ) {
my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
- if( $collation->linear ) {
- $this_reading->set_identical( $other_reading );
- } else {
- $collation->merge_readings( $other_reading, $this_reading );
- }
+ # We evidently have a linear graph.
+ $linear = 1;
+ $this_reading->set_identical( $other_reading );
} elsif ( $edkey eq $POSITION_KEY ) {
$this_reading->position( $extra_data->{$nkey}->{$edkey} );
} else {
}
}
}
+ $collation->linear( $linear );
# We know what the beginning and ending nodes are, no need to
# search or reset.
my $end_node = $collation->reading( '#END#' );
- $DB::single = 1;
# Walk the paths and make reading sequences for our witnesses.
+ # No need to calculate positions as we have them already.
$collation->walk_witness_paths( $end_node );
}
+package Text::Tradition::Parser::TEI;
+
+use strict;
+use warnings;
+use XML::LibXML;
+use XML::LibXML::XPathContext;
+
+=head1 NAME
+
+Text::Tradition::Parser::TEI
+
+=head1 DESCRIPTION
+
+Parser module for Text::Tradition, given a TEI parallel-segmentation
+file that describes a text and its variants.
+
+=head1 METHODS
+
+=over
+
+=item B<parse>
+
+parse( $tei_string );
+
+Takes an initialized tradition and a string containing the TEI;
+creates the appropriate nodes and edges on the graph, as well as
+the appropriate witness objects.
+
+=cut
+
+sub parse {
+ my( $tradition, $xml_str ) = @_;
+
+ # First, parse the XML.
+ my $parser = XML::LibXML->new();
+ my $doc = $parser->parse_string( $xml_str );
+ my $tei = $doc->documentElement();
+ $xpc = XML::LibXML::XPathContext->new( $tei );
+ $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' );
+
+ # Then get the witnesses and create the witness objects.
+ foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) {
+ my $sig = $wit_el->getAttribute( 'xml:id' );
+ my $source = $wit_el->toString(); # Save all the XML info we have
+ $tradition->add_witness( sigil => $sig, source => $source );
+ }
+
+ # Now go through the text and make the tokens.
+ # Assume for now that each word is tokenized in the XML.
+ my $text = {};
+ map { $text->{$_->sigil} = [ $tradition->start ] } @{$tradition->witnesses};
+ foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) {
+ # If it is contained within a lem or a rdg, look at those witnesses.
+ # Otherwise it is common to all witnesses.
+ # Also common if it is the only lem/rdg within its app.
+ # Thus we are assuming non-nested apps.
+ my $node_id = $word_el->getAttribute( 'xml:id' );
+ my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el );
+ my @wits = get_sigla( $parent_rdg );
+ @wits = map { $_->sigil } @{$tradition->witnesses} unless $wits;
+
+ # TODO Create the node
+ my $reading = $word_el->textContent();
+
+ # TODO Figure out if it is a common node
+
+ foreach my $sig ( @wits ) {
+ push( @{$text->{$sig}}, $reading );
+ }
+ }
+
+ # Now we have the text paths through the witnesses, so we can make
+ # the edges.
+
+ # TODO think about relationships, transpositions, etc.
+}
+
+sub get_sigla {
+ my( $rdg ) = @_;
+ # Cope if we have been handed a NodeList. There is only
+ # one reading here.
+ if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) {
+ $rdg = $rdg->shift;
+ }
+
+ my @wits;
+ if( ref( $rdg ) eq 'XML::LibXML::Element' ) {
+ @wits = split( /\s+/, $rdg->get_attribute( 'wit' ) );
+ map { $_ =~ s/^\#// } @wits;
+ }
+ return @wits;
+}
+
+1;
foreach my $wit ( @{$self->collation->tradition->witnesses} ) {
# First implementation: make dumb alignment table, caring about
# nothing except which reading is in which position.
- push( @$table, [ $wit->sigil, make_witness_row( $characters, $wit->path,
+ my $sigilfield = sprintf( "%-10s", $wit->sigil );
+ push( @$table, [ $sigilfield, make_witness_row( $characters, $wit->path,
\@all_pos ) ] );
if( $wit->has_ante_corr ) {
- push( @$table, [ $wit->sigil . "_ac",
+ $sigilfield = sprintf( "%-10s", $wit->sigil . "_ac" );
+ push( @$table, [ $sigilfield,
make_witness_row( $characters, $wit->uncorrected_path,
\@all_pos ) ] );
}
sub make_witness_row {
my( $characters, $path, $positions ) = @_;
- my @row;
- my $pathdrift = 0;
- foreach my $i( 0 .. $#{$positions} ) {
- if( $path->[$i-$pathdrift]->position->minref eq $positions->[$i] ) {
- push( @row, get_character( $path->[$i-$pathdrift], $characters ) );
- } else {
- push( @row, 'X' );
- $pathdrift++;
- }
- $i++;
+ my %char_hash;
+ map { $char_hash{$_} = 'X' } @$positions;
+ foreach my $rdg( @$path ) {
+ $char_hash{$rdg->position->minref} = get_character( $rdg, $characters );
}
+ my @row = map { $char_hash{$_} } @$positions;
return @row;
}
return $this_pos->{$text};
}
-sub run_pars {
+sub pars_input {
my $self = shift;
$self->make_character_matrix unless $self->has_character_matrix;
+ my $matrix = '';
+ my $rows = scalar @{$self->character_matrix};
+ my $columns = scalar @{$self->character_matrix->[0]} - 1;
+ $matrix .= "\t$rows\t$columns\n";
+ foreach my $row ( @{$self->character_matrix} ) {
+ $matrix .= join( '', @$row ) . "\n";
+ }
+ return $matrix;
+}
+
+sub run_pars {
+ my $self = shift;
# Set up a temporary directory for all the default Phylip files.
my $phylip_dir = File::Temp->newdir();
-
- # We need an infile, and we need a command input file.
$DB::single = 1;
+ # We need an infile, and we need a command input file.
open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
- my $rows = scalar @{$self->character_matrix};
- my $columns = scalar @{$self->character_matrix->[0]} - 1;
- print MATRIX "\t$rows\t$columns\n";
- foreach my $row ( @{$self->character_matrix} ) {
- my $wit = shift @$row;
- my $chars = join( '', @$row );
- print MATRIX sprintf( "%-10s%s\n", $wit, $chars );
- }
+ print MATRIX $self->pars_input();
close MATRIX;
open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
return( 1, join( '', @outtree ) ) if @outtree;
my @error;
- if( -f "$phylip_dir/output" ) {
- open( OUTPUT, "$phylip_dir/output" ) or die "Could not open output for read";
+ if( -f "$phylip_dir/outfile" ) {
+ open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
@error = <OUTPUT>;
close OUTPUT;
} else {
isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
predicate => 'has_ante_corr',
);
+
+# Manuscript name or similar
+has 'identifier' => (
+ is => 'ro',
+ isa => 'Str',
+ );
+
+# Any other info we have
+has 'other_info' => (
+ is => 'ro',
+ isa => 'Str',
+ );
sub BUILD {
# First: read the base. Make a graph, but also note which
# nodes represent line beginnings.
+my $type = 'CollateX'; # either Self or CollateX
open( GRAPH, $ARGV[0] ) or die "Could not read file $ARGV[0]";
my @lines = <GRAPH>;
my $graphml_str = join( '', @lines );
my $tradition = Text::Tradition->new(
- 'CollateX' => $graphml_str,
+ $type => $graphml_str,
'linear' => 1,
);