X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FSelf.pm;h=8483891b9964c645437ea3d2e5e14b1855205f2f;hb=2a8127263ef278f3f14b480a12b84f9aa4f92fdc;hp=781a73913fe0aadd60e29f9fdc5a60eced4255b3;hpb=32014ec936b48809e9f2dae8f20a01c887253427;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Parser/Self.pm b/lib/Text/Tradition/Parser/Self.pm
index 781a739..8483891 100644
--- a/lib/Text/Tradition/Parser/Self.pm
+++ b/lib/Text/Tradition/Parser/Self.pm
@@ -2,113 +2,289 @@ package Text::Tradition::Parser::Self;
 
 use strict;
 use warnings;
-use Text::Tradition::Parser::GraphML;
+use Text::Tradition::Parser::GraphML qw/ graphml_parse /;
+use TryCatch;
 
 =head1 NAME
 
 Text::Tradition::Parser::GraphML
 
+=head1 SYNOPSIS
+
+  use Text::Tradition;
+  
+  my $t_from_file = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'Self',
+    'file' => '/path/to/tradition.xml'
+    );
+    
+  my $t_from_string = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'Self',
+    'string' => $tradition_xml,
+    );
+
 =head1 DESCRIPTION
 
 Parser module for Text::Tradition to read in its own GraphML output format.
-TODO document what this format is.
+GraphML is a relatively simple graph description language; a 'graph' element
+can have 'node' and 'edge' elements, and each of these can have simple 'data'
+elements for attributes to be saved.
 
-=head1 METHODS
+The graph itself has attributes as in the Collation object:
 
 =over
 
-=item B<parse>
+=item * linear 
 
-parse( $graph, $graphml_string );
+=item * ac_label
 
-Takes an initialized Text::Tradition::Graph object and a string
-containing the GraphML; creates the appropriate nodes and edges on the
-graph.
+=item * baselabel
 
-=cut
+=item * wit_list_separator
+
+=back
 
-my( $IDKEY, $TOKENKEY, $TRANSPOS_KEY, $POSITION_KEY ) 
-    = qw/ name reading identical position /;
+The node objects have the following attributes:
 
-sub parse {
-    my( $tradition, $graphml_str ) = @_;
-    my $graph_data = Text::Tradition::Parser::GraphML::parse( $graphml_str );
+=over
+
+=item * name
+
+=item * reading
+
+=item * identical
+
+=item * rank
+
+=item * class
+
+=back
+
+The edge objects have the following attributes:
+
+=over
+
+=item * class
+
+=item * witness (for 'path' class edges)
+
+=item * extra   (for 'path' class edges)
+
+=item * relationship    (for 'relationship' class edges)
+
+=item * equal_rank      (for 'relationship' class edges)
+
+=item * non_correctable (for 'relationship' class edges)
+
+=item * non_independent (for 'relationship' class edges)
+
+=back
+
+=head1 METHODS
+
+=head2 B<parse>
+
+parse( $tradition, $opts );
+
+Takes an initialized Text::Tradition object and a set of options; creates
+the appropriate nodes and edges on the graph.  The options hash should
+include either a 'file' argument or a 'string' argument, depending on the
+source of the XML to be parsed.
+
+=begin testing
+
+use Text::Tradition;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my $tradition = 't/data/florilegium_graphml.xml';
+my $t = Text::Tradition->new( 
+    'name'  => 'inline', 
+    'input' => 'Self',
+    'file'  => $tradition,
+    );
+
+is( ref( $t ), 'Text::Tradition', "Parsed GraphML version 2" );
+if( $t ) {
+    is( scalar $t->collation->readings, 319, "Collation has all readings" );
+    is( scalar $t->collation->paths, 376, "Collation has all paths" );
+    is( scalar $t->witnesses, 13, "Collation has all witnesses" );
+}
 
+# Add a relationship and a stemma, write GraphML, reparse it, and check that
+# the new data is there
+$t->language('Greek');
+$t->add_stemma( 'dotfile' => 't/data/florilegium.dot' );
+$t->collation->add_relationship( 'w12', 'w13', 
+	{ 'type' => 'grammatical', 'scope' => 'global', 
+	  'annotation' => 'This is some note' } );
+ok( $t->collation->get_relationship( 'w12', 'w13' ), "Relationship set" );
+my $graphml_str = $t->collation->as_graphml;
+
+my $newt = Text::Tradition->new( 'input' => 'Self', 'string' => $graphml_str );
+is( ref( $newt ), 'Text::Tradition', "Parsed current GraphML version" );
+if( $newt ) {
+    is( scalar $newt->collation->readings, 319, "Collation has all readings" );
+    is( scalar $newt->collation->paths, 376, "Collation has all paths" );
+    is( scalar $newt->witnesses, 13, "Collation has all witnesses" );
+    is( scalar $newt->collation->relationships, 1, "Collation has added relationship" );
+    is( $newt->language, 'Greek', "Tradition has correct language setting" );
+    my $rel = $newt->collation->get_relationship( 'w12', 'w13' );
+    ok( $rel, "Found set relationship" );
+    is( $rel->annotation, 'This is some note', "Relationship has its properties" );
+    is( scalar $newt->stemmata, 1, "Tradition has its stemma" );
+    is( $newt->stemma(0)->witnesses, $t->stemma(0)->witnesses, "Stemma has correct length witness list" );
+}
+
+
+=end testing
+
+=cut
+
+sub parse {
+    my( $tradition, $opts ) = @_;
+    
+    # Collation data is in the first graph; relationship-specific stuff 
+    # is in the second.
+    my( $graph_data, $rel_data ) = graphml_parse( $opts );
+    
     my $collation = $tradition->collation;
     my %witnesses;
-
-    # Add the nodes to the graph. 
-
-    my $extra_data = {}; # Keep track of data that needs to be processed
-                         # after the nodes & edges are created.
-    foreach my $n ( @{$graph_data->{'nodes'}} ) {
-	# Could use a better way of registering these
-	my %node_data = %$n;
-	my $nodeid = delete $node_data{$IDKEY};
-	my $reading = delete $node_data{$TOKENKEY};
-	my $gnode = $collation->add_reading( $nodeid );
-	$gnode->text( $reading );
-
-	# Now save the rest of the data, i.e. not the ID or label,
-	# if it exists.
-	if ( keys %node_data ) {
-	    $extra_data->{$nodeid} = \%node_data;
+    
+    # print STDERR "Setting graph globals\n";
+    $tradition->name( $graph_data->{'name'} );
+    my $use_version;
+    my $tmeta = $tradition->meta;
+    my $cmeta = $collation->meta;
+    foreach my $gkey ( keys %{$graph_data->{'global'}} ) {
+		my $val = $graph_data->{'global'}->{$gkey};
+		if( $gkey eq 'version' ) {
+			$use_version = $val;
+		} elsif( $gkey eq 'stemmata' ) { # Special case, yuck
+			foreach my $dotstr ( split( /\n/, $val ) ) {
+				$tradition->add_stemma( 'dot' => $dotstr );
+			}
+		} elsif( $tmeta->has_attribute( $gkey ) ) {
+			$tradition->$gkey( $val );
+		} else {
+			$collation->$gkey( $val );
+		}
 	}
+		
+    # Add the nodes to the graph.
+    # Note any reading IDs that were changed in order to comply with XML 
+    # name restrictions; we have to hardcode start & end.
+    my %namechange = ( '#START#' => '__START__', '#END#' => '__END__' );
+
+    # print STDERR "Adding collation readings\n";
+    foreach my $n ( @{$graph_data->{'nodes'}} ) {    	
+    	# If it is the start or end node, we already have one, so
+    	# grab the rank and go.
+    	next if( defined $n->{'is_start'} );
+    	if( defined $n->{'is_end'} ) {
+    		$collation->end->rank( $n->{'rank'} );
+    		next;
+    	}
+		my $gnode = $collation->add_reading( $n );
+		if( $gnode->id ne $n->{'id'} ) {
+			$namechange{$n->{'id'}} = $gnode->id;
+		}
     }
-	
+        
     # Now add the edges.
+    # print STDERR "Adding collation path edges\n";
     foreach my $e ( @{$graph_data->{'edges'}} ) {
-	my %edge_data = %$e;
-	my $from = delete $edge_data{'source'};
-	my $to = delete $edge_data{'target'};
-
-	# Whatever is left tells us what kind of edge it is.
-	foreach my $wkey ( keys %edge_data ) {
-	    if( $wkey =~ /^witness/ ) {
-		my $wit = $edge_data{$wkey};
-		unless( $witnesses{$wit} ) {
-		    $tradition->add_witness( sigil => $wit );
-		    $witnesses{$wit} = 1;
+    	my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
+    		? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
+    	my $targetid = exists $namechange{$e->{'target'}->{'id'}}
+    		? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
+        my $from = $collation->reading( $sourceid );
+        my $to = $collation->reading( $targetid );
+
+		warn "No witness label on path edge!" unless $e->{'witness'};
+		my $label = $e->{'witness'} . ( $e->{'extra'} ? $collation->ac_label : '' );
+		$collation->add_path( $from, $to, $label );
+		
+		# Add the witness if we don't have it already.
+		unless( $witnesses{$e->{'witness'}} ) {
+			$tradition->add_witness( 
+				sigil => $e->{'witness'}, 'sourcetype' => 'collation' );
+			$witnesses{$e->{'witness'}} = 1;
 		}
-		my $label = $wkey eq 'witness_ante_corr' 
-		    ? $wit . $collation->ac_label : $wit;
-		$collation->add_path( $from->{$IDKEY}, $to->{$IDKEY}, $label );
-	    } else {
-		my $rel = $edge_data{$wkey};
-		# TODO handle global relationships
-		$collation->add_relationship( $rel, $from->{$IDKEY}, $to->{$IDKEY} );
-	    }
-	}
+		$tradition->witness( $e->{'witness'} )->is_layered( 1 ) if $e->{'extra'};
     }
-
-    ## Deal with node information (transposition, relationships, etc.) that
-    ## needs to be processed after all the nodes are created.
-    foreach my $nkey ( keys %$extra_data ) {
-	foreach my $edkey ( keys %{$extra_data->{$nkey}} ) {
-	    my $this_reading = $collation->reading( $nkey );
-	    if( $edkey eq $TRANSPOS_KEY ) {
-		my $other_reading = $collation->reading( $extra_data->{$nkey}->{$edkey} );
-		if( $collation->linear ) {
-		    $this_reading->set_identical( $other_reading );
-		} else {
-		    $collation->merge_readings( $other_reading, $this_reading );
+    
+    ## Done with the main graph, now look at the relationships.
+	# Nodes are added via the call to add_reading above.  We only need
+	# to add the relationships themselves.
+	# TODO check that scoping does the right thing
+	$rel_data->{'edges'} ||= []; # so that the next line doesn't break on no rels
+	foreach my $e ( sort { _layersort_rel( $a, $b ) } @{$rel_data->{'edges'}} ) {
+    	my $sourceid = exists $namechange{$e->{'source'}->{'id'}}
+    		? $namechange{$e->{'source'}->{'id'}} : $e->{'source'}->{'id'};
+    	my $targetid = exists $namechange{$e->{'target'}->{'id'}}
+    		? $namechange{$e->{'target'}->{'id'}} : $e->{'target'}->{'id'};
+        my $from = $collation->reading( $sourceid );
+        my $to = $collation->reading( $targetid );
+		delete $e->{'source'};
+		delete $e->{'target'};
+		# The remaining keys are relationship attributes.
+		# Backward compatibility...
+		if( $use_version eq '2.0' || $use_version eq '3.0' ) {
+			delete $e->{'class'};
+			$e->{'type'} = delete $e->{'relationship'} if exists $e->{'relationship'};
+		}
+		# Add the specified relationship unless we already have done.
+		my $rel_exists;
+		if( $e->{'scope'} ne 'local' ) {
+			my $relobj = $collation->get_relationship( $from, $to );
+			if( $relobj && $relobj->scope eq $e->{'scope'}
+				&& $relobj->type eq $e->{'type'} ) {
+				$rel_exists = 1;
+			}
+		}
+		try {
+			$collation->add_relationship( $from, $to, $e ) unless $rel_exists;
+		} catch( Text::Tradition::Error $e ) {
+			warn "DROPPING $from -> $to: " . $e->message;
 		}
-	    } elsif ( $edkey eq $POSITION_KEY ) {
-		$this_reading->position( $extra_data->{$nkey}->{$edkey} );
-	    } else {
-		warn "Unfamiliar reading node data $edkey for $nkey";
-	    }
 	}
-    }
+	
+    # Save the text for each witness so that we can ensure consistency
+    # later on
+	$collation->text_from_paths();	
+}
 
-    # We know what the beginning and ending nodes are, no need to
-    # search or reset.
-    my $end_node = $collation->reading( '#END#' );
-    $DB::single = 1;
-    # Walk the paths and make reading sequences for our witnesses.
-    $collation->walk_witness_paths( $end_node );
+## Return the relationship that comes first in priority.
+my %LAYERS = (
+	'collated' => 1,
+	'orthographic' => 2,
+	'spelling' => 3,
+	);
+
+sub _layersort_rel {
+	my( $a, $b ) = @_;
+	my $key = exists $a->{'type'} ? 'type' : 'relationship';
+	my $at = $LAYERS{$a->{$key}} || 99;
+	my $bt = $LAYERS{$b->{$key}} || 99;
+	return $at <=> $bt;
 }
 
+1;
+
+=head1 BUGS / TODO
+
+=over
+
+=item * Make this into a stream parser with GraphML
+
+=item * Simplify field -> attribute correspondence for nodes and edges
+
+=item * Share key name constants with Collation.pm
+
 =back
 
 =head1 LICENSE
@@ -119,8 +295,4 @@ the same terms as Perl itself.
 
 =head1 AUTHOR
 
-Tara L Andrews, aurum@cpan.org
-
-=cut
-
-1;
+Tara L Andrews E<lt>aurum@cpan.orgE<gt>