From: Tara L Andrews <tla@mit.edu>
Date: Fri, 6 May 2011 13:23:10 +0000 (+0200)
Subject: made CSV parser standalone, lots of changes to Base, etc.
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=52ce987f46da555e925cf6d1e159fa9ec413f5b3;p=scpubgit%2Fstemmatology.git

made CSV parser standalone, lots of changes to Base, etc.
---

diff --git a/lib/Text/Tradition/Graph.pm b/lib/Text/Tradition/Graph.pm
index be16099..ee90761 100644
--- a/lib/Text/Tradition/Graph.pm
+++ b/lib/Text/Tradition/Graph.pm
@@ -81,8 +81,7 @@ sub new {
     my @formats = grep { /^(GraphML|CSV|CTE|TEI)$/ } keys( %opts );
     my $format = shift( @formats );
     unless( $format ) {
-	warn "No data given to create a graph: need GraphML, CSV, or TEI";
-	return;
+	warn "No data given to create a graph; will initialize an empty one";
     }
     if( $format =~ /^(CSV|CTE)$/ && !exists $opts{'base'} ) {
 	warn "Cannot make a graph from $format without a base text";
@@ -99,14 +98,20 @@ sub new {
     bless( $self, $class );
 
     # Now do the parsing.
-    my $mod = "Text::Tradition::Parser::$format";
-    load( $mod );
-    my @args = ( $opts{ $format } );
-    if( $format =~ /^(CSV|CTE)$/ ) {
-	push( @args, $opts{'base'} );
+    if( $format ) {
+	my @args;
+	if( $format =~ /^(CSV|CTE)$/ ) {
+	    @args = ( 'base' => $opts{'base'},
+		      'data' => $opts{$format},
+		      'format' => $format );
+	    $format = 'BaseText';
+	} else {
+	    @args = ( $opts{ $format } ); 
+	}
+	my $mod = "Text::Tradition::Parser::$format";
+	load( $mod );
+	$mod->can('parse')->( $self, @args );
     }
-    $mod->can('parse')->( $self, @args );
-
     return $self;
 }
 
@@ -341,14 +346,13 @@ with $last, along the given witness path.
 =cut
 
 sub node_sequence {
-    my( $self, $start, $end, $label ) = @_;
-    # TODO make label able to follow a single MS
+    my( $self, $start, $end, $witness, $backup ) = @_;
     unless( ref( $start ) eq 'Graph::Easy::Node'
 	&& ref( $end ) eq 'Graph::Easy::Node' ) {
 	warn "Called node_sequence without two nodes!";
 	return ();
     }
-    $label = 'base text' unless $label;
+    $witness = 'base text' unless $witness;
     my @nodes = ( $start );
     my %seen;
     my $n = $start;
@@ -360,11 +364,19 @@ sub node_sequence {
 	$seen{$n->name()} = 1;
 
 	my @edges = $n->outgoing();
-	my @relevant_edges = grep { $_->label =~ /^$label$/ } @edges;
-	warn "Did not find an edge $label from node " . $n->label
+	my @relevant_edges = grep { my @w = split( /, /, $_->label ); 
+				    grep { /^\Q$witness\E$/ } @w } @edges;
+	unless( @relevant_edges ) {
+	    @relevant_edges = grep { my @w = split( /, /, $_->label ); 
+				     grep { /^\Q$backup\E$/ } @w } @edges
+					 if $backup;
+	}
+	unless( @relevant_edges ) {
+	    @relevant_edges = grep { $_->label() eq 'base text' } @edges;
+	}
+
+	warn "Did not find an edge for $witness from node " . $n->label
 	    unless scalar @relevant_edges;
-	warn "Found more than one edge $label from node " . $n->label
-	    unless scalar @relevant_edges == 1;
 	my $next = $relevant_edges[0]->to();
 	push( @nodes, $next );
 	$n = $next;
@@ -372,7 +384,7 @@ sub node_sequence {
     # Check that the last node is our end node.
     my $last = $nodes[$#nodes];
     warn "Last node found from " . $start->label() . 
-	" via path $label is not the end!"
+	" for witness $witness is not the end!"
 	unless $last eq $end;
 
     return @nodes;
@@ -514,7 +526,7 @@ sub toggle_node {
     # In case this is being called for the first time.
     $self->init_lemmatizer();
 
-    if( $self->is_common( $node ) ) {
+    if( !$node || $self->is_common( $node ) ) {
 	# Do nothing, it's a common node.
 	return;
     } 
@@ -635,8 +647,7 @@ sub active_nodes {
     return @answer;
 }
 
-# A couple of helpers. TODO These should be gathered in the same place
-# eventually
+# A couple of helpers. 
 
 sub is_common {
     my( $self, $node ) = @_;
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm
index 59f5a59..07e9d57 100644
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -2,9 +2,7 @@ package Text::Tradition::Parser::BaseText;
 
 use strict;
 use warnings;
-use Exporter 'import';
-use vars qw( @EXPORT_OK );
-@EXPORT_OK = qw( merge_base );
+use Module::Load;
 
 =head1 NAME
 
@@ -30,6 +28,26 @@ will join those listed variants onto the reference text.
 
 =over
 
+=item B<parse>
+
+parse( $graph, %opts );
+
+Takes an initialized graph and a set of options, which must include:
+- 'base' - the base text referenced by the variants
+- 'format' - the format of the variant list
+- 'data' - the variants, in the given format.
+
+=cut
+
+sub parse {
+    my( $graph, %opts ) = @_;
+
+    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
+    load( $format_mod );
+    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
+    merge_base( $graph, $opts{'base'}, @apparatus_entries );
+}
+
 =item B<merge_base>
 
 merge_base( $graph, 'reference.txt', @apparatus_entries )
@@ -61,6 +79,7 @@ sub merge_base {
     my( $graph, $base_file, @app_entries ) = @_;
     my @base_line_starts = read_base( $base_file, $graph );
 
+    my %all_witnesses;
     foreach my $app ( @app_entries ) {
 	my( $line, $num ) = split( /\./, $app->{_id} );
 	# DEBUG with a short graph
@@ -158,6 +177,7 @@ sub merge_base {
 	    
 	    # Determine the label name for the edges here.
 	    my $edge_name = join(', ', @mss );
+	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 	    
 	    # Make the variant into a set of nodes.
 	    my $ctr = 0;
@@ -189,19 +209,39 @@ sub merge_base {
     ## read.  So I collapse nodes that have only one edge in and one edge
     ## out, and I do this by looking at the edges.
     
-    foreach my $edge ( $graph->edges() ) {
-	my @out_edges = $edge->from()->outgoing();
-	my @in_edges = $edge->to()->incoming();
+#     foreach my $edge ( $graph->edges() ) {
+# 	my @out_edges = $edge->from()->outgoing();
+# 	my @in_edges = $edge->to()->incoming();
 	
-	next unless scalar( @out_edges ) == 1;
-	next unless scalar( @in_edges ) == 1;
-	next unless $out_edges[0] eq $in_edges[0];
-	# In theory if we've got this far, we're safe, but just to
-	# double-check...
-	next unless $out_edges[0] eq $edge;
+# 	next if $edge->from() eq $graph->start();
+# 	next if $edge->to()->name() eq '#END#';
+# 	next unless scalar( @out_edges ) == 1;
+# 	next unless scalar( @in_edges ) == 1;
+# 	next unless $out_edges[0] eq $in_edges[0];
+# 	# In theory if we've got this far, we're safe, but just to
+# 	# double-check...
+# 	next unless $out_edges[0] eq $edge;
 	
-	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
+# 	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
+#     }
+
+    # Now walk the path for each witness, so that we can do the
+    # position calculations.
+    my $paths = {};
+    foreach my $w ( keys %all_witnesses ) {
+	my $back = undef;
+	if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
+	    $back = $1;
+	}
+	my @wit_nodes = $graph->node_sequence( $graph->start, 
+					       $graph->node( '#END#' ), 
+					       $w, $back );
+	my @wn_names = map { $_->name() } @wit_nodes;
+	$paths->{$w} = \@wn_names;
     }
+    $DB::single = 1;
+    my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
+    $graph->make_positions( \@common_nodes, $paths );
 }
 
 =item B<read_base>
@@ -276,7 +316,7 @@ sub collate_variants {
     my( $graph, @readings ) = @_;
     my $lemma_start = shift @readings;
     my $lemma_end = shift @readings;
-    my $detranspose = 1;
+    my $detranspose = 0;
 
     # Start the list of distinct nodes with those nodes in the lemma.
     my @distinct_nodes;
@@ -333,6 +373,7 @@ sub collate_variants {
  				    $graph->prior_word( $l, $edgelabel ), $l );
  		    remove_duplicate_edges( $graph, $l, 
  				    $graph->next_word( $l, $edgelabel ) );
+		    last if $matched;
 		}
 	    }
 	    push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
diff --git a/lib/Text/Tradition/Parser/CSV.pm b/lib/Text/Tradition/Parser/CSV.pm
index a7e3ea4..a4836aa 100644
--- a/lib/Text/Tradition/Parser/CSV.pm
+++ b/lib/Text/Tradition/Parser/CSV.pm
@@ -3,7 +3,6 @@ package Text::Tradition::Parser::CSV;
 use strict;
 use warnings;
 use Text::CSV::Simple;
-use Text::Tradition::Parser::BaseText qw( merge_base );
 
 =head1 NAME
 
@@ -19,29 +18,16 @@ breaks.
 
 =over
 
-=item B<parse>
+=item B<read>
 
-parse( $graph, 'variants.csv', 'reference.txt' );
+my @apparatus = read( $csv_file );
 
-Takes an initialized Text::Tradition::Graph object and the relevant
-data files; puts the text and its variants onto the graph.
+Takes a CSV file; returns a data structure of apparatus entries to be
+merged with a base text.
 
 =cut
 
-sub parse {
-    my( $graph, $csv_file, $base_text ) = @_;
-
-    # Parse the CSV file into a list of apparatus entries.
-    my @app_list = _read_csv( $csv_file );
-    # Now put the base text onto the graph, and merge in the 
-    # apparatus entries.
-    merge_base( $graph, $base_text, @app_list );
-}
-
-# Takes a CSV file; returns a data structure of apparatus entries to
-# be merged with a base text.
-
-sub _read_csv {
+sub read {
     my( $csv_file ) = @_;
     my $parser = Text::CSV::Simple->new();
     my @fields = qw/ reference text variant type context non_corr non_indep