From: Tara L Andrews <tla@mit.edu>
Date: Wed, 6 Jul 2011 15:57:57 +0000 (+0200)
Subject: checkpoint, not sure what is here
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=f6066bac61bc5609c60d48df17aad924c8944177

checkpoint, not sure what is here
---

diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index aa0680a..5f569df 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -29,7 +29,7 @@ has 'graph' => (
     );
 		
 
-has 'tradition' => (
+has 'tradition' => (  # TODO should this not be ro?
     is => 'rw',
     isa => 'Text::Tradition',
     );
@@ -225,7 +225,11 @@ sub add_relationship {
     my @joined = ( [ $source->name, $target->name ] );  # Keep track of the nodes we join.
     
     $options->{'this_relation'} = [ $source, $target ];
-    my $rel = Text::Tradition::Collation::Relationship->new( %$options );
+    my $rel;
+    eval { $rel = Text::Tradition::Collation::Relationship->new( %$options ) };
+    if( $@ ) {
+       return ( undef, $@ );
+    }
     $self->graph->add_edge( $source, $target, $rel );
     if( $options->{'global'} ) {
 	# Look for all readings with the source label, and if there are
@@ -313,10 +317,14 @@ sub as_dot {
 
     my @edges = $view eq 'relationship' ? $self->relationships : $self->paths;
     foreach my $edge ( @edges ) {
-	$dot .= sprintf( "\t\"%s\" -> \"%s\" [ color=\"%s\", fontcolor=\"%s\", label=\"%s\" ]\n",
-			 $edge->from->name, $edge->to->name, '#000000', '#000000', $edge->label );
+	my %variables = ( 'color' => '#000000',
+			  'fontcolor' => '#000000',
+			  'label' => $edge->label,
+	    );
+	my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables );
+	$dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ]\n",
+			 $edge->from->name, $edge->to->name, $varopts );
     }
-
     $dot .= "}\n";
     return $dot;
 }
@@ -349,6 +357,8 @@ sub as_graphml {
     $root->setNamespace( $xsi_ns, 'xsi', 0 );
     $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema );
 
+    # TODO Add some global graph data
+
     # Add the data keys for nodes
     my %node_data_keys;
     my $ndi = 0;
@@ -393,7 +403,8 @@ sub as_graphml {
 	$node_el->setAttribute( 'id', $node_xmlid );
 	_add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name );
 	_add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label );
-	_add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position );
+	_add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference )
+	    if $n->has_position;
 	_add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class );
 	_add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name )
 	    if $n->has_primary;
@@ -805,27 +816,87 @@ sub calculate_positions {
 	    }
 	    
 	    # Transform the path values from unique strings to arrays.
+	    my @all_paths;
 	    foreach my $k ( keys %paths ) {
-		my @v = split( /\s+/, $paths{$k} );
-		$paths{$k} = \@v;
+		my @v = split( /\s+/, $k );
+		push( @all_paths, \@v );
 	    }
+	    @all_paths = sort { scalar @$b <=> scalar @$a } @all_paths;
 	    
 	    # Now %paths has all the unique paths, and we know how long the
 	    # longest of these is.  Assign positions, starting with the
 	    # longest.  All non-common positions start at 2.
-	    foreach my $path ( sort { scalar @$b <=> scalar @$a } values %paths  ) {
+	    foreach my $path ( @all_paths  ) {
+		# Initially each element has a minimum position of 2
+		# plus its position in the array (1 is the common
+		# node), and a max position of the longest array
+		# length minus its position in the array.
 		my $range = $longest - scalar @$path;
+		my $min = 2;
 		foreach my $i ( 0 .. $#{$path} ) {
-		    my $min = $i+2;
 		    my $rdg = $self->reading( $path->[$i] );
-		    unless( $rdg->has_position ) {
+		    if( $rdg->has_position ) {
+			# This reading has already had a more specific
+			# position set, so we need to take that into
+			# account when calculating the min and max for
+			# the next reading.
+			my $rangeminus = $rdg->position->min - $min;
+			$min = $rdg->position->min + 1; 
+			$range = $range - $rangeminus;
+			if( $range < 0 ) {
+			    print STDERR "Negative range for position! " . $rdg->name . "\n"; # May remove this warning
+			    $range = 0;
+			}
+		    } else {
 			$rdg->position( $first->position->common, $min, $min+$range );
+			$min++;
+			$longest = $min+$range-2 unless $longest+2 > $min+$range;  # min starts at 2 but longest assumes 0 start
+		    }
+		}
+	    }
+	    # Now go through again and make sure the positions are
+	    # monotonic.  Do this until they are.
+	    my $monotonic = 0;
+	    my $counter = 0;
+	    until( $monotonic ) {
+		$monotonic = 1;
+		$counter++;
+		foreach my $path ( @all_paths ) {
+		    foreach my $i ( 0 .. $#{$path} ) { 
+			my $rdg = $self->reading( $path->[$i] );
+			my $prior = $self->reading( $path->[$i-1] ) if $i > 0;
+			my $next = $self->reading( $path->[$i+1] ) if $i < $#{$path};
+			if( $prior && $rdg->position->min <= $prior->position->min ) {
+			    $monotonic = 0; 
+			    $rdg->position->min( $prior->position->min + 1 );
+			}
+			if( $next && $rdg->position->max >= $next->position->max ) {
+			    $monotonic = 0; 
+			    if( $next->position->max - 1 >= $rdg->position->min ) {
+				# If moving rdg/max down would not send it below 
+				# rdg/min, do that.  
+				$rdg->position->max( $next->position->max - 1 );
+			    } else {
+				# Otherwise increase next/max.
+				$next->position->max( $rdg->position->max + 1 );
+				# ...min will be fixed on the next pass.
+			    }
+			}
 		    }
 		}
+		if( $counter > $#all_paths + 1 && !$monotonic ) {
+		    # We risk an infinite loop.  Get out of here.
+		    warn "Still not monotonic after $counter passes at common point "
+			. $first->position->common;
+		    last;
+		}
 	    }
+	    print STDERR "Took $counter passes for monotonicity at " . $first->position->common. "\n" 
+		if $counter > 1;
 	    
 	    $first = $next;
 	}
+
     } else {
 
 	# Non-linear positions are pretty much impossible to pin down.
@@ -833,7 +904,6 @@ sub calculate_positions {
 	# can do positions where there aren't transpositions...
 
     }
-		
     $self->init_lemmata();
 }
 
@@ -846,7 +916,7 @@ sub _track_paths {
     # Top and tail the array
     shift @path;
     pop @path;
-    $track_hash->{$_[2]} = join( ' ', map { $_->name } @path )
+    $track_hash->{join( ' ', map { $_->name } @path )} = $_[2]
 	if @path;
     return @path;
 }
diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm
index 2c13921..7775a75 100644
--- a/lib/Text/Tradition/Parser/CollateX.pm
+++ b/lib/Text/Tradition/Parser/CollateX.pm
@@ -50,7 +50,8 @@ sub parse {
 	my %node_data = %$n;
 	my $nodeid = delete $node_data{$IDKEY};
 	my $token = delete $node_data{$CONTENTKEY};
-	unless( $nodeid && $token ) {
+	unless( defined $nodeid && defined $token ) {
+	    $DB::single = 1;
 	    warn "Did not find an ID or token for graph node, can't add it";
 	    next;
 	}
diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm
index 54a2c32..4e191a4 100644
--- a/lib/Text/Tradition/Parser/GraphML.pm
+++ b/lib/Text/Tradition/Parser/GraphML.pm
@@ -75,7 +75,7 @@ sub parse {
 	foreach my $dkey ( keys %$nodedata ) {
 	    my $keyname = $nodedata->{$dkey};
 	    my $keyvalue = _lookup_node_data( $n, $dkey );
-	    $node_hash->{$keyname} = $keyvalue if $keyvalue;
+	    $node_hash->{$keyname} = $keyvalue if defined $keyvalue;
 	}
 	$node_reg->{$n->getAttribute( 'id' )} = $node_hash;
 	push( @{$graph_hash->{'nodes'}}, $node_hash );
diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm
index 781a739..6baca63 100644
--- a/lib/Text/Tradition/Parser/Self.pm
+++ b/lib/Text/Tradition/Parser/Self.pm
@@ -27,11 +27,12 @@ graph.
 
 =cut
 
-my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY ) 
-    = qw/ name reading identical position /;
+my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY, $CLASS_KEY ) 
+    = qw/ name reading identical position class /;
 
 sub parse {
     my( $tradition, $graphml_str ) = @_;
+    $DB::single = 1;
     my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
 
     my $collation = $tradition->collation;
@@ -41,13 +42,20 @@ sub parse {
 
     my $extra_data = {}; # Keep track of data that needs to be processed
                          # after the nodes & edges are created.
+    print STDERR "Adding graph nodes\n";
     foreach my $n ( @{$graph_data->{'nodes'}} ) {
-	# Could use a better way of registering these
+	# Each node is either a segment or a reading, depending on
+	# its class.  Readings have text, segments don't.
 	my %node_data = %$n;
 	my $nodeid = delete $node_data{$IDKEY};
 	my $reading = delete $node_data{$TOKENKEY};
-	my $gnode = $collation->add_reading( $nodeid );
-	$gnode->text( $reading );
+	my $class = $node_data{$CLASS_KEY} || '';
+	# TODO this is a hack, fix it?
+	$class = 'reading' unless $class eq 'segment';
+	my $method = $class eq 'segment' ? "add_$class" : "add_reading";
+	my $gnode = $collation->$method( $nodeid );
+	$gnode->label( $reading );
+	$gnode->set_common if $class eq 'common';
 
 	# Now save the rest of the data, i.e. not the ID or label,
 	# if it exists.
@@ -57,14 +65,20 @@ sub parse {
     }
 	
     # Now add the edges.
+    print STDERR "Adding graph edges\n";
     foreach my $e ( @{$graph_data->{'edges'}} ) {
 	my %edge_data = %$e;
 	my $from = delete $edge_data{'source'};
 	my $to = delete $edge_data{'target'};
+	my $class = delete $edge_data{'class'};
 
 	# Whatever is left tells us what kind of edge it is.
 	foreach my $wkey ( keys %edge_data ) {
 	    if( $wkey =~ /^witness/ ) {
+		unless( $class eq 'path' ) {
+		    warn "Cannot add witness label to a $class edge";
+		    next;
+		}
 		my $wit = $edge_data{$wkey};
 		unless( $witnesses{$wit} ) {
 		    $tradition->add_witness( sigil => $wit );
@@ -73,26 +87,33 @@ sub parse {
 		my $label = $wkey eq 'witness_ante_corr' 
 		    ? $wit . $collation->ac_label : $wit;
 		$collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
-	    } else {
+	    } elsif( $wkey eq 'relationship' ) {
+		unless( $class eq 'relationship' ) {
+		    warn "Cannot add relationship label to a $class edge";
+		    next;
+		}
 		my $rel = $edge_data{$wkey};
 		# TODO handle global relationships
 		$collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} );
+	    } else {
+		my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} );
+		$seg_edge->set_attribute( 'class', 'segment' );
 	    }
 	}
     }
 
     ## Deal with node information (transposition, relationships, etc.) that
     ## needs to be processed after all the nodes are created.
+    print STDERR "Adding second-pass data\n";
+    my $linear = undef;
     foreach my $nkey ( keys %$extra_data ) {
 	foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
 	    my $this_reading = $collation->reading( $nkey );
 	    if( $edkey eq $TRANSPOS_KEY ) {
 		my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
-		if( $collation->linear ) {
-		    $this_reading->set_identical( $other_reading );
-		} else {
-		    $collation->merge_readings( $other_reading, $this_reading );
-		}
+		# We evidently have a linear graph.
+		$linear = 1;
+		$this_reading->set_identical( $other_reading );
 	    } elsif ( $edkey eq $POSITION_KEY ) {
 		$this_reading->position( $extra_data->{$nkey}->{$edkey} );
 	    } else {
@@ -100,12 +121,13 @@ sub parse {
 	    }
 	}
     }
+    $collation->linear( $linear );
 
     # We know what the beginning and ending nodes are, no need to
     # search or reset.
     my $end_node = $collation->reading( '#END#' );
-    $DB::single = 1;
     # Walk the paths and make reading sequences for our witnesses.
+    # No need to calculate positions as we have them already.
     $collation->walk_witness_paths( $end_node );
 }
 
diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm
index e69de29..285b780 100644
--- a/lib/Text/Tradition/Parser/TEI.pm
+++ b/lib/Text/Tradition/Parser/TEI.pm
@@ -0,0 +1,94 @@
+package Text::Tradition::Parser::TEI;
+
+use strict;
+use warnings;
+use XML::LibXML;
+use XML::LibXML::XPathContext;
+
+=head1 NAME
+
+Text::Tradition::Parser::TEI
+
+=head1 DESCRIPTION
+
+Parser module for Text::Tradition, given a TEI parallel-segmentation
+file that describes a text and its variants.
+
+=head1 METHODS
+
+=over
+
+=item B<parse>
+
+parse( $tei_string );
+
+Takes an initialized tradition and a string containing the TEI;
+creates the appropriate nodes and edges on the graph, as well as
+the appropriate witness objects.
+
+=cut
+
+sub parse {
+    my( $tradition, $xml_str ) = @_;
+    
+    # First, parse the XML.
+    my $parser = XML::LibXML->new();
+    my $doc = $parser->parse_string( $xml_str );
+    my $tei = $doc->documentElement();
+    $xpc = XML::LibXML::XPathContext->new( $tei );
+    $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' );
+    
+    # Then get the witnesses and create the witness objects.
+    foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) {
+	my $sig = $wit_el->getAttribute( 'xml:id' );
+	my $source = $wit_el->toString();  # Save all the XML info we have
+	$tradition->add_witness( sigil => $sig, source => $source );
+    }
+
+    # Now go through the text and make the tokens.
+    # Assume for now that each word is tokenized in the XML.
+    my $text = {};
+    map { $text->{$_->sigil} = [ $tradition->start ] } @{$tradition->witnesses};
+    foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) {
+	# If it is contained within a lem or a rdg, look at those witnesses.
+	# Otherwise it is common to all witnesses.
+	# Also common if it is the only lem/rdg within its app.
+	# Thus we are assuming non-nested apps.
+	my $node_id = $word_el->getAttribute( 'xml:id' );
+	my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el );
+	my @wits = get_sigla( $parent_rdg );
+	@wits = map { $_->sigil } @{$tradition->witnesses} unless $wits;
+
+	# TODO Create the node
+	my $reading = $word_el->textContent();
+
+	# TODO Figure out if it is a common node
+
+	foreach my $sig ( @wits ) {
+	    push( @{$text->{$sig}}, $reading );
+	}
+    }
+
+    # Now we have the text paths through the witnesses, so we can make
+    # the edges.
+
+    # TODO think about relationships, transpositions, etc.
+}
+
+sub get_sigla {
+    my( $rdg ) = @_;
+    # Cope if we have been handed a NodeList.  There is only
+    # one reading here.
+    if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) {
+	$rdg = $rdg->shift;
+    }
+
+    my @wits;
+    if( ref( $rdg ) eq 'XML::LibXML::Element' ) {
+	@wits = split( /\s+/, $rdg->get_attribute( 'wit' ) );
+	map { $_ =~ s/^\#// } @wits;
+    }
+    return @wits;
+}
+
+1;
diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm
index 448b711..0f3a881 100644
--- a/lib/Text/Tradition/Stemma.pm
+++ b/lib/Text/Tradition/Stemma.pm
@@ -33,10 +33,12 @@ sub make_character_matrix {
     foreach my $wit ( @{$self->collation->tradition->witnesses} ) {
 	# First implementation: make dumb alignment table, caring about
 	# nothing except which reading is in which position.
-	push( @$table, [ $wit->sigil, make_witness_row( $characters, $wit->path, 
+	my $sigilfield = sprintf( "%-10s", $wit->sigil );
+	push( @$table, [ $sigilfield, make_witness_row( $characters, $wit->path, 
 							\@all_pos ) ] );
 	if( $wit->has_ante_corr ) {
-	    push( @$table, [ $wit->sigil . "_ac", 
+	    $sigilfield = sprintf( "%-10s", $wit->sigil . "_ac" );
+	    push( @$table, [ $sigilfield, 
 			     make_witness_row( $characters, $wit->uncorrected_path, 
 					       \@all_pos ) ] );
 	}	    
@@ -46,17 +48,12 @@ sub make_character_matrix {
 
 sub make_witness_row {
     my( $characters, $path, $positions ) = @_;
-    my @row;
-    my $pathdrift = 0;
-    foreach my $i( 0 .. $#{$positions} ) {
-	if( $path->[$i-$pathdrift]->position->minref eq $positions->[$i] ) {
-	    push( @row, get_character( $path->[$i-$pathdrift], $characters ) );
-	} else {
-	    push( @row, 'X' );
-	    $pathdrift++;
-	}
-	$i++;
+    my %char_hash;
+    map { $char_hash{$_} = 'X' } @$positions;
+    foreach my $rdg( @$path ) {
+	$char_hash{$rdg->position->minref} = get_character( $rdg, $characters );
     }
+    my @row = map { $char_hash{$_} } @$positions;
     return @row;
 }
     
@@ -81,24 +78,28 @@ sub get_character {
     return $this_pos->{$text};
 }
 
-sub run_pars {
+sub pars_input {
     my $self = shift;
     $self->make_character_matrix unless $self->has_character_matrix;
+    my $matrix = '';
+    my $rows = scalar @{$self->character_matrix};
+    my $columns = scalar @{$self->character_matrix->[0]} - 1;
+    $matrix .= "\t$rows\t$columns\n";
+    foreach my $row ( @{$self->character_matrix} ) {
+	$matrix .= join( '', @$row ) . "\n";
+    }
+    return $matrix;
+}
+
+sub run_pars {
+    my $self = shift;
 
     # Set up a temporary directory for all the default Phylip files.
     my $phylip_dir = File::Temp->newdir();
-
-    # We need an infile, and we need a command input file.
     $DB::single = 1;
+    # We need an infile, and we need a command input file.
     open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
-    my $rows = scalar @{$self->character_matrix};
-    my $columns = scalar @{$self->character_matrix->[0]} - 1;
-    print MATRIX "\t$rows\t$columns\n";
-    foreach my $row ( @{$self->character_matrix} ) {
-	my $wit = shift @$row;
-	my $chars = join( '', @$row );
-	print MATRIX sprintf( "%-10s%s\n", $wit, $chars );
-    }
+    print MATRIX $self->pars_input();
     close MATRIX;
 
     open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile";
@@ -151,8 +152,8 @@ sub run_pars {
     return( 1, join( '', @outtree ) ) if @outtree;
 
     my @error;
-    if( -f "$phylip_dir/output" ) {
-	open( OUTPUT, "$phylip_dir/output" ) or die "Could not open output for read";
+    if( -f "$phylip_dir/outfile" ) {
+	open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read";
 	@error = <OUTPUT>;
 	close OUTPUT;
     } else {
diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm
index 656c185..eb63bb3 100644
--- a/lib/Text/Tradition/Witness.pm
+++ b/lib/Text/Tradition/Witness.pm
@@ -39,6 +39,18 @@ has 'uncorrected_path' => (
     isa => 'ArrayRef[Text::Tradition::Collation::Reading]',
     predicate => 'has_ante_corr',
     );
+
+# Manuscript name or similar
+has 'identifier' => (
+    is => 'ro',
+    isa => 'Str',
+    );
+
+# Any other info we have
+has 'other_info' => (
+    is => 'ro',
+    isa => 'Str',
+    );
     
 
 sub BUILD {
diff --git a/script/svg_from_graphml.pl b/script/svg_from_graphml.pl
index 894d0d9..2555fa7 100644
--- a/script/svg_from_graphml.pl
+++ b/script/svg_from_graphml.pl
@@ -7,6 +7,7 @@ use Text::Tradition;
 
 # First: read the base. Make a graph, but also note which
 # nodes represent line beginnings.
+my $type = 'CollateX'; # either Self or CollateX
 
 open( GRAPH, $ARGV[0] ) or die "Could not read file $ARGV[0]";
 my @lines = <GRAPH>;
@@ -14,7 +15,7 @@ close GRAPH;
 my $graphml_str = join( '', @lines );
 
 my $tradition = Text::Tradition->new(
-    'CollateX' => $graphml_str,
+    $type => $graphml_str,
     'linear' => 1,
     );