From: Tara L Andrews Date: Wed, 6 Jul 2011 15:57:57 +0000 (+0200) Subject: checkpoint, not sure what is here X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=f6066bac61bc5609c60d48df17aad924c8944177 checkpoint, not sure what is here --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index aa0680a..5f569df 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -29,7 +29,7 @@ has 'graph' => ( ); -has 'tradition' => ( +has 'tradition' => ( # TODO should this not be ro? is => 'rw', isa => 'Text::Tradition', ); @@ -225,7 +225,11 @@ sub add_relationship { my @joined = ( [ $source->name, $target->name ] ); # Keep track of the nodes we join. $options->{'this_relation'} = [ $source, $target ]; - my $rel = Text::Tradition::Collation::Relationship->new( %$options ); + my $rel; + eval { $rel = Text::Tradition::Collation::Relationship->new( %$options ) }; + if( $@ ) { + return ( undef, $@ ); + } $self->graph->add_edge( $source, $target, $rel ); if( $options->{'global'} ) { # Look for all readings with the source label, and if there are @@ -313,10 +317,14 @@ sub as_dot { my @edges = $view eq 'relationship' ? $self->relationships : $self->paths; foreach my $edge ( @edges ) { - $dot .= sprintf( "\t\"%s\" -> \"%s\" [ color=\"%s\", fontcolor=\"%s\", label=\"%s\" ]\n", - $edge->from->name, $edge->to->name, '#000000', '#000000', $edge->label ); + my %variables = ( 'color' => '#000000', + 'fontcolor' => '#000000', + 'label' => $edge->label, + ); + my $varopts = join( ', ', map { $_.'="'.$variables{$_}.'"' } sort keys %variables ); + $dot .= sprintf( "\t\"%s\" -> \"%s\" [ %s ]\n", + $edge->from->name, $edge->to->name, $varopts ); } - $dot .= "}\n"; return $dot; } @@ -349,6 +357,8 @@ sub as_graphml { $root->setNamespace( $xsi_ns, 'xsi', 0 ); $root->setAttributeNS( $xsi_ns, 'schemaLocation', $graphml_schema ); + # TODO Add some global graph data + # Add the data keys for nodes my %node_data_keys; my $ndi = 0; @@ -393,7 +403,8 @@ sub as_graphml { $node_el->setAttribute( 'id', $node_xmlid ); _add_graphml_data( $node_el, $node_data_keys{'name'}, $n->name ); _add_graphml_data( $node_el, $node_data_keys{'reading'}, $n->label ); - _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position ); + _add_graphml_data( $node_el, $node_data_keys{'position'}, $n->position->reference ) + if $n->has_position; _add_graphml_data( $node_el, $node_data_keys{'class'}, $n->sub_class ); _add_graphml_data( $node_el, $node_data_keys{'identical'}, $n->primary->name ) if $n->has_primary; @@ -805,27 +816,87 @@ sub calculate_positions { } # Transform the path values from unique strings to arrays. + my @all_paths; foreach my $k ( keys %paths ) { - my @v = split( /\s+/, $paths{$k} ); - $paths{$k} = \@v; + my @v = split( /\s+/, $k ); + push( @all_paths, \@v ); } + @all_paths = sort { scalar @$b <=> scalar @$a } @all_paths; # Now %paths has all the unique paths, and we know how long the # longest of these is. Assign positions, starting with the # longest. All non-common positions start at 2. - foreach my $path ( sort { scalar @$b <=> scalar @$a } values %paths ) { + foreach my $path ( @all_paths ) { + # Initially each element has a minimum position of 2 + # plus its position in the array (1 is the common + # node), and a max position of the longest array + # length minus its position in the array. my $range = $longest - scalar @$path; + my $min = 2; foreach my $i ( 0 .. $#{$path} ) { - my $min = $i+2; my $rdg = $self->reading( $path->[$i] ); - unless( $rdg->has_position ) { + if( $rdg->has_position ) { + # This reading has already had a more specific + # position set, so we need to take that into + # account when calculating the min and max for + # the next reading. + my $rangeminus = $rdg->position->min - $min; + $min = $rdg->position->min + 1; + $range = $range - $rangeminus; + if( $range < 0 ) { + print STDERR "Negative range for position! " . $rdg->name . "\n"; # May remove this warning + $range = 0; + } + } else { $rdg->position( $first->position->common, $min, $min+$range ); + $min++; + $longest = $min+$range-2 unless $longest+2 > $min+$range; # min starts at 2 but longest assumes 0 start + } + } + } + # Now go through again and make sure the positions are + # monotonic. Do this until they are. + my $monotonic = 0; + my $counter = 0; + until( $monotonic ) { + $monotonic = 1; + $counter++; + foreach my $path ( @all_paths ) { + foreach my $i ( 0 .. $#{$path} ) { + my $rdg = $self->reading( $path->[$i] ); + my $prior = $self->reading( $path->[$i-1] ) if $i > 0; + my $next = $self->reading( $path->[$i+1] ) if $i < $#{$path}; + if( $prior && $rdg->position->min <= $prior->position->min ) { + $monotonic = 0; + $rdg->position->min( $prior->position->min + 1 ); + } + if( $next && $rdg->position->max >= $next->position->max ) { + $monotonic = 0; + if( $next->position->max - 1 >= $rdg->position->min ) { + # If moving rdg/max down would not send it below + # rdg/min, do that. + $rdg->position->max( $next->position->max - 1 ); + } else { + # Otherwise increase next/max. + $next->position->max( $rdg->position->max + 1 ); + # ...min will be fixed on the next pass. + } + } } } + if( $counter > $#all_paths + 1 && !$monotonic ) { + # We risk an infinite loop. Get out of here. + warn "Still not monotonic after $counter passes at common point " + . $first->position->common; + last; + } } + print STDERR "Took $counter passes for monotonicity at " . $first->position->common. "\n" + if $counter > 1; $first = $next; } + } else { # Non-linear positions are pretty much impossible to pin down. @@ -833,7 +904,6 @@ sub calculate_positions { # can do positions where there aren't transpositions... } - $self->init_lemmata(); } @@ -846,7 +916,7 @@ sub _track_paths { # Top and tail the array shift @path; pop @path; - $track_hash->{$_[2]} = join( ' ', map { $_->name } @path ) + $track_hash->{join( ' ', map { $_->name } @path )} = $_[2] if @path; return @path; } diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm index 2c13921..7775a75 100644 --- a/lib/Text/Tradition/Parser/CollateX.pm +++ b/lib/Text/Tradition/Parser/CollateX.pm @@ -50,7 +50,8 @@ sub parse { my %node_data = %$n; my $nodeid = delete $node_data{$IDKEY}; my $token = delete $node_data{$CONTENTKEY}; - unless( $nodeid && $token ) { + unless( defined $nodeid && defined $token ) { + $DB::single = 1; warn "Did not find an ID or token for graph node, can't add it"; next; } diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index 54a2c32..4e191a4 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -75,7 +75,7 @@ sub parse { foreach my $dkey ( keys %$nodedata ) { my $keyname = $nodedata->{$dkey}; my $keyvalue = _lookup_node_data( $n, $dkey ); - $node_hash->{$keyname} = $keyvalue if $keyvalue; + $node_hash->{$keyname} = $keyvalue if defined $keyvalue; } $node_reg->{$n->getAttribute( 'id' )} = $node_hash; push( @{$graph_hash->{'nodes'}}, $node_hash ); diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm index 781a739..6baca63 100644 --- a/lib/Text/Tradition/Parser/Self.pm +++ b/lib/Text/Tradition/Parser/Self.pm @@ -27,11 +27,12 @@ graph. =cut -my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY ) - = qw/ name reading identical position /; +my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY, $CLASS_KEY ) + = qw/ name reading identical position class /; sub parse { my( $tradition, $graphml_str ) = @_; + $DB::single = 1; my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str ); my $collation = $tradition->collation; @@ -41,13 +42,20 @@ sub parse { my $extra_data = {}; # Keep track of data that needs to be processed # after the nodes & edges are created. + print STDERR "Adding graph nodes\n"; foreach my $n ( @{$graph_data->{'nodes'}} ) { - # Could use a better way of registering these + # Each node is either a segment or a reading, depending on + # its class. Readings have text, segments don't. my %node_data = %$n; my $nodeid = delete $node_data{$IDKEY}; my $reading = delete $node_data{$TOKENKEY}; - my $gnode = $collation->add_reading( $nodeid ); - $gnode->text( $reading ); + my $class = $node_data{$CLASS_KEY} || ''; + # TODO this is a hack, fix it? + $class = 'reading' unless $class eq 'segment'; + my $method = $class eq 'segment' ? "add_$class" : "add_reading"; + my $gnode = $collation->$method( $nodeid ); + $gnode->label( $reading ); + $gnode->set_common if $class eq 'common'; # Now save the rest of the data, i.e. not the ID or label, # if it exists. @@ -57,14 +65,20 @@ sub parse { } # Now add the edges. + print STDERR "Adding graph edges\n"; foreach my $e ( @{$graph_data->{'edges'}} ) { my %edge_data = %$e; my $from = delete $edge_data{'source'}; my $to = delete $edge_data{'target'}; + my $class = delete $edge_data{'class'}; # Whatever is left tells us what kind of edge it is. foreach my $wkey ( keys %edge_data ) { if( $wkey =~ /^witness/ ) { + unless( $class eq 'path' ) { + warn "Cannot add witness label to a $class edge"; + next; + } my $wit = $edge_data{$wkey}; unless( $witnesses{$wit} ) { $tradition->add_witness( sigil => $wit ); @@ -73,26 +87,33 @@ sub parse { my $label = $wkey eq 'witness_ante_corr' ? $wit . $collation->ac_label : $wit; $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label ); - } else { + } elsif( $wkey eq 'relationship' ) { + unless( $class eq 'relationship' ) { + warn "Cannot add relationship label to a $class edge"; + next; + } my $rel = $edge_data{$wkey}; # TODO handle global relationships $collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} ); + } else { + my $seg_edge = $collation->graph->add_edge( $from->{$IDKEY}, $to->{$IDKEY} ); + $seg_edge->set_attribute( 'class', 'segment' ); } } } ## Deal with node information (transposition, relationships, etc.) that ## needs to be processed after all the nodes are created. + print STDERR "Adding second-pass data\n"; + my $linear = undef; foreach my $nkey ( keys %$extra_data ) { foreach my $edkey ( keys %{$extra_data->{$nkey}} ) { my $this_reading = $collation->reading( $nkey ); if( $edkey eq $TRANSPOS_KEY ) { my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} ); - if( $collation->linear ) { - $this_reading->set_identical( $other_reading ); - } else { - $collation->merge_readings( $other_reading, $this_reading ); - } + # We evidently have a linear graph. + $linear = 1; + $this_reading->set_identical( $other_reading ); } elsif ( $edkey eq $POSITION_KEY ) { $this_reading->position( $extra_data->{$nkey}->{$edkey} ); } else { @@ -100,12 +121,13 @@ sub parse { } } } + $collation->linear( $linear ); # We know what the beginning and ending nodes are, no need to # search or reset. my $end_node = $collation->reading( '#END#' ); - $DB::single = 1; # Walk the paths and make reading sequences for our witnesses. + # No need to calculate positions as we have them already. $collation->walk_witness_paths( $end_node ); } diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index e69de29..285b780 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -0,0 +1,94 @@ +package Text::Tradition::Parser::TEI; + +use strict; +use warnings; +use XML::LibXML; +use XML::LibXML::XPathContext; + +=head1 NAME + +Text::Tradition::Parser::TEI + +=head1 DESCRIPTION + +Parser module for Text::Tradition, given a TEI parallel-segmentation +file that describes a text and its variants. + +=head1 METHODS + +=over + +=item B + +parse( $tei_string ); + +Takes an initialized tradition and a string containing the TEI; +creates the appropriate nodes and edges on the graph, as well as +the appropriate witness objects. + +=cut + +sub parse { + my( $tradition, $xml_str ) = @_; + + # First, parse the XML. + my $parser = XML::LibXML->new(); + my $doc = $parser->parse_string( $xml_str ); + my $tei = $doc->documentElement(); + $xpc = XML::LibXML::XPathContext->new( $tei ); + $xpc->registerNs( 'tei', 'http://www.tei-c.org/ns/1.0' ); + + # Then get the witnesses and create the witness objects. + foreach my $wit_el ( $xpc->findnodes( '//tei:listWit/tei:witness' ) ) { + my $sig = $wit_el->getAttribute( 'xml:id' ); + my $source = $wit_el->toString(); # Save all the XML info we have + $tradition->add_witness( sigil => $sig, source => $source ); + } + + # Now go through the text and make the tokens. + # Assume for now that each word is tokenized in the XML. + my $text = {}; + map { $text->{$_->sigil} = [ $tradition->start ] } @{$tradition->witnesses}; + foreach my $word_el ( $xpc->findnodes( '//tei:w|tei:seg' ) ) { + # If it is contained within a lem or a rdg, look at those witnesses. + # Otherwise it is common to all witnesses. + # Also common if it is the only lem/rdg within its app. + # Thus we are assuming non-nested apps. + my $node_id = $word_el->getAttribute( 'xml:id' ); + my $parent_rdg = $xpc->find( 'parent::tei:lem|parent::tei:rdg', $word_el ); + my @wits = get_sigla( $parent_rdg ); + @wits = map { $_->sigil } @{$tradition->witnesses} unless $wits; + + # TODO Create the node + my $reading = $word_el->textContent(); + + # TODO Figure out if it is a common node + + foreach my $sig ( @wits ) { + push( @{$text->{$sig}}, $reading ); + } + } + + # Now we have the text paths through the witnesses, so we can make + # the edges. + + # TODO think about relationships, transpositions, etc. +} + +sub get_sigla { + my( $rdg ) = @_; + # Cope if we have been handed a NodeList. There is only + # one reading here. + if( ref( $rdg ) eq 'XML::LibXML::NodeList' ) { + $rdg = $rdg->shift; + } + + my @wits; + if( ref( $rdg ) eq 'XML::LibXML::Element' ) { + @wits = split( /\s+/, $rdg->get_attribute( 'wit' ) ); + map { $_ =~ s/^\#// } @wits; + } + return @wits; +} + +1; diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index 448b711..0f3a881 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -33,10 +33,12 @@ sub make_character_matrix { foreach my $wit ( @{$self->collation->tradition->witnesses} ) { # First implementation: make dumb alignment table, caring about # nothing except which reading is in which position. - push( @$table, [ $wit->sigil, make_witness_row( $characters, $wit->path, + my $sigilfield = sprintf( "%-10s", $wit->sigil ); + push( @$table, [ $sigilfield, make_witness_row( $characters, $wit->path, \@all_pos ) ] ); if( $wit->has_ante_corr ) { - push( @$table, [ $wit->sigil . "_ac", + $sigilfield = sprintf( "%-10s", $wit->sigil . "_ac" ); + push( @$table, [ $sigilfield, make_witness_row( $characters, $wit->uncorrected_path, \@all_pos ) ] ); } @@ -46,17 +48,12 @@ sub make_character_matrix { sub make_witness_row { my( $characters, $path, $positions ) = @_; - my @row; - my $pathdrift = 0; - foreach my $i( 0 .. $#{$positions} ) { - if( $path->[$i-$pathdrift]->position->minref eq $positions->[$i] ) { - push( @row, get_character( $path->[$i-$pathdrift], $characters ) ); - } else { - push( @row, 'X' ); - $pathdrift++; - } - $i++; + my %char_hash; + map { $char_hash{$_} = 'X' } @$positions; + foreach my $rdg( @$path ) { + $char_hash{$rdg->position->minref} = get_character( $rdg, $characters ); } + my @row = map { $char_hash{$_} } @$positions; return @row; } @@ -81,24 +78,28 @@ sub get_character { return $this_pos->{$text}; } -sub run_pars { +sub pars_input { my $self = shift; $self->make_character_matrix unless $self->has_character_matrix; + my $matrix = ''; + my $rows = scalar @{$self->character_matrix}; + my $columns = scalar @{$self->character_matrix->[0]} - 1; + $matrix .= "\t$rows\t$columns\n"; + foreach my $row ( @{$self->character_matrix} ) { + $matrix .= join( '', @$row ) . "\n"; + } + return $matrix; +} + +sub run_pars { + my $self = shift; # Set up a temporary directory for all the default Phylip files. my $phylip_dir = File::Temp->newdir(); - - # We need an infile, and we need a command input file. $DB::single = 1; + # We need an infile, and we need a command input file. open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile"; - my $rows = scalar @{$self->character_matrix}; - my $columns = scalar @{$self->character_matrix->[0]} - 1; - print MATRIX "\t$rows\t$columns\n"; - foreach my $row ( @{$self->character_matrix} ) { - my $wit = shift @$row; - my $chars = join( '', @$row ); - print MATRIX sprintf( "%-10s%s\n", $wit, $chars ); - } + print MATRIX $self->pars_input(); close MATRIX; open( CMD, ">$phylip_dir/cmdfile" ) or die "Could not write $phylip_dir/cmdfile"; @@ -151,8 +152,8 @@ sub run_pars { return( 1, join( '', @outtree ) ) if @outtree; my @error; - if( -f "$phylip_dir/output" ) { - open( OUTPUT, "$phylip_dir/output" ) or die "Could not open output for read"; + if( -f "$phylip_dir/outfile" ) { + open( OUTPUT, "$phylip_dir/outfile" ) or die "Could not open output for read"; @error = ; close OUTPUT; } else { diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index 656c185..eb63bb3 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -39,6 +39,18 @@ has 'uncorrected_path' => ( isa => 'ArrayRef[Text::Tradition::Collation::Reading]', predicate => 'has_ante_corr', ); + +# Manuscript name or similar +has 'identifier' => ( + is => 'ro', + isa => 'Str', + ); + +# Any other info we have +has 'other_info' => ( + is => 'ro', + isa => 'Str', + ); sub BUILD { diff --git a/script/svg_from_graphml.pl b/script/svg_from_graphml.pl index 894d0d9..2555fa7 100644 --- a/script/svg_from_graphml.pl +++ b/script/svg_from_graphml.pl @@ -7,6 +7,7 @@ use Text::Tradition; # First: read the base. Make a graph, but also note which # nodes represent line beginnings. +my $type = 'CollateX'; # either Self or CollateX open( GRAPH, $ARGV[0] ) or die "Could not read file $ARGV[0]"; my @lines = ; @@ -14,7 +15,7 @@ close GRAPH; my $graphml_str = join( '', @lines ); my $tradition = Text::Tradition->new( - 'CollateX' => $graphml_str, + $type => $graphml_str, 'linear' => 1, );