X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FCollateX.pm;h=62123fe4707106332225f9c2cf28ab5e2f731481;hb=94a077d641e8c906d7131a059b009f335781337a;hp=7775a75b9d83ee106890b426be7e70c5237fc43b;hpb=f6066bac61bc5609c60d48df17aad924c8944177;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Parser/CollateX.pm b/lib/Text/Tradition/Parser/CollateX.pm
index 7775a75..62123fe 100644
--- a/lib/Text/Tradition/Parser/CollateX.pm
+++ b/lib/Text/Tradition/Parser/CollateX.pm
@@ -2,12 +2,28 @@ package Text::Tradition::Parser::CollateX;
 
 use strict;
 use warnings;
-use Text::Tradition::Parser::GraphML;
+use Text::Tradition::Parser::GraphML qw/ graphml_parse populate_witness_path /;
 
 =head1 NAME
 
 Text::Tradition::Parser::CollateX
 
+=head1 SYNOPSIS
+
+  use Text::Tradition;
+  
+  my $t_from_file = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'CollateX',
+    'file' => '/path/to/collation.xml'
+    );
+    
+  my $t_from_string = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'CollateX',
+    'string' => $collation_xml,
+    );
+
 =head1 DESCRIPTION
 
 Parser module for Text::Tradition, given a GraphML file from the
@@ -17,15 +33,42 @@ http://gregor.middell.net/collatex/
 
 =head1 METHODS
 
-=over
+=head2 B<parse>
+
+parse( $tradition, $init_options );
 
-=item B<parse>
+Takes an initialized Text::Tradition object and a set of options; creates
+the appropriate nodes and edges on the graph.  The options hash should
+include either a 'file' argument or a 'string' argument, depending on the
+source of the XML to be parsed.
 
-parse( $graph, $graphml_string );
+=begin testing
 
-Takes an initialized Text::Tradition::Graph object and a string
-containing the GraphML; creates the appropriate nodes and edges on the
-graph.
+use Text::Tradition;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my $cxfile = 't/data/Collatex-16.xml';
+my $t = Text::Tradition->new( 
+    'name'  => 'inline', 
+    'input' => 'CollateX',
+    'file'  => $cxfile,
+    );
+
+is( ref( $t ), 'Text::Tradition', "Parsed our own GraphML" );
+if( $t ) {
+    is( scalar $t->collation->readings, 26, "Collation has all readings" );
+    is( scalar $t->collation->paths, 49, "Collation has all paths" );
+    is( scalar $t->witnesses, 3, "Collation has all witnesses" );
+    
+    # Check an 'identical' node
+    my $transposed = $t->collation->reading( 'n15' );
+    ok( $transposed->has_primary, "Reading links to transposed primary" );
+    is( $transposed->primary->name, 'n17', "Correct transposition link" );
+}
+
+=end testing
 
 =cut
 
@@ -34,8 +77,8 @@ my $CONTENTKEY = 'token';
 my $TRANSKEY = 'identical';
 
 sub parse {
-    my( $tradition, $graphml_str ) = @_;
-    my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
+    my( $tradition, $opts ) = @_;
+    my $graph_data = graphml_parse( $opts );
     my $collation = $tradition->collation;
     my %witnesses; # Keep track of the witnesses we encounter as we
                    # run through the graph data.
@@ -43,92 +86,98 @@ sub parse {
     # Add the nodes to the graph.  First delete the start node, because
     # GraphML graphs will have their own start nodes.
     $collation->del_reading( $collation->start() );
+    $collation->del_reading( $collation->end() );
 
     my $extra_data = {}; # Keep track of info to be processed after all
                          # nodes have been created
     foreach my $n ( @{$graph_data->{'nodes'}} ) {
-	my %node_data = %$n;
-	my $nodeid = delete $node_data{$IDKEY};
-	my $token = delete $node_data{$CONTENTKEY};
-	unless( defined $nodeid && defined $token ) {
-	    $DB::single = 1;
-	    warn "Did not find an ID or token for graph node, can't add it";
-	    next;
-	}
-	my $gnode = $collation->add_reading( $nodeid );
-	$gnode->text( $token );
-
-	# Whatever is left is extra info to be processed later.
-	if( keys %node_data ) {
-	    $extra_data->{$nodeid} = \%node_data;
-	}
+        my %node_data = %$n;
+        my $nodeid = delete $node_data{$IDKEY};
+        my $token = delete $node_data{$CONTENTKEY};
+        unless( defined $nodeid && defined $token ) {
+            warn "Did not find an ID or token for graph node, can't add it";
+            next;
+        }
+        my $gnode = $collation->add_reading( $nodeid );
+        $gnode->text( $token );
+
+        # Whatever is left is extra info to be processed later.
+        if( keys %node_data ) {
+            $extra_data->{$nodeid} = \%node_data;
+        }
     }
-	
+        
     # Now add the edges.
     foreach my $e ( @{$graph_data->{'edges'}} ) {
-	my %edge_data = %$e;
-	my $from = delete $edge_data{'source'};
-	my $to = delete $edge_data{'target'};
-
-	# In CollateX, we have a distinct witness data ID per witness,
-	# so that we can have multiple witnesses per edge.  We want to
-	# translate this to one witness per edge in our own
-	# representation.
-	foreach my $ekey ( keys %edge_data ) {
-	    my $wit = $edge_data{$ekey};
-	    # Create the witness object if it does not yet exist.
-	    unless( $witnesses{$wit} ) {
-		$tradition->add_witness( 'sigil' => $wit );
-		$witnesses{$wit} = 1;
-	    }
-	    $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
-	}
+        my %edge_data = %$e;
+        my $from = delete $edge_data{'source'};
+        my $to = delete $edge_data{'target'};
+
+        # In CollateX, we have a distinct witness data ID per witness,
+        # so that we can have multiple witnesses per edge.  We want to
+        # translate this to one witness per edge in our own
+        # representation.
+        foreach my $ekey ( keys %edge_data ) {
+            my $wit = $edge_data{$ekey};
+            # Create the witness object if it does not yet exist.
+            unless( $witnesses{$wit} ) {
+                $tradition->add_witness( 'sigil' => $wit );
+                $witnesses{$wit} = 1;
+            }
+            $collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $wit );
+        }
     }
 
     # Process the extra node data if it exists.
     foreach my $nodeid ( keys %$extra_data ) {
-	my $ed = $extra_data->{$nodeid};
-	if( exists $ed->{$TRANSKEY} ) {
-	    
-	    my $tn_reading = $collation->reading( $nodeid );
-	    my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
-	    if( $collation->linear ) {
-		$tn_reading->set_identical( $main_reading );
-	    } else {
-		$collation->merge_readings( $main_reading, $tn_reading );
-	    }
-	} # else we don't have any other tags to process yet.
+        my $ed = $extra_data->{$nodeid};
+        if( exists $ed->{$TRANSKEY} ) {
+            
+            my $tn_reading = $collation->reading( $nodeid );
+            my $main_reading = $collation->reading( $ed->{$TRANSKEY} );
+            if( $collation->linear ) {
+                $tn_reading->set_identical( $main_reading );
+            } else {
+                $collation->merge_readings( $main_reading, $tn_reading );
+            }
+        } # else we don't have any other tags to process yet.
     }
 
     # Find the beginning and end nodes of the graph.  The beginning node
     # has no incoming edges; the end node has no outgoing edges.
     my( $begin_node, $end_node );
     foreach my $gnode ( $collation->readings() ) {
-	# print STDERR "Checking node " . $gnode->name . "\n";
-	my @outgoing = $gnode->outgoing();
-	my @incoming = $gnode->incoming();
-
-	unless( scalar @incoming ) {
-	    warn "Already have a beginning node" if $begin_node;
-	    $begin_node = $gnode;
-	    $collation->start( $gnode );
-	}
-	unless( scalar @outgoing ) {
-	    warn "Already have an ending node" if $end_node;
-	    $end_node = $gnode;
-	}
+        # print STDERR "Checking node " . $gnode->name . "\n";
+        my @outgoing = $gnode->outgoing();
+        my @incoming = $gnode->incoming();
+
+        unless( scalar @incoming ) {
+            warn "Already have a beginning node" if $begin_node;
+            $begin_node = $gnode;
+            $collation->start( $gnode );
+        }
+        unless( scalar @outgoing ) {
+            warn "Already have an ending node" if $end_node;
+            $end_node = $gnode;
+            $collation->end( $gnode );
+        }
     }
+    
+    # Set the $witness->path arrays for each wit.
+    populate_witness_path( $tradition );
 
-    # Record for each witness its sequence of readings, and determine
-    # the common nodes.
-    my @common_nodes = $collation->walk_witness_paths( $end_node );
-
-    # Now we have added the witnesses and their paths, so have also
-    # implicitly marked the common nodes. Now we can calculate their
-    # explicit positions.
-    $collation->calculate_positions( @common_nodes );
+    # Rank the readings.
+    $collation->calculate_ranks() if $collation->linear;
 }
     
+=head1 BUGS / TODO
+
+=over
+
+=item * Make this into a stream parser with GraphML
+
+=item * Use CollateX-calculated ranks instead of recalculating our own
+
 =back
 
 =head1 LICENSE