enable output of CSV without witness layers. Fixes #13

[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Collation.pm
diff --git a/base/lib/Text/Tradition/Collation.pm b/base/lib/Text/Tradition/Collation.pm

index a47e3d1..15f516b 100644 (file)
--- a/base/lib/Text/Tradition/Collation.pm
+++ b/base/lib/Text/Tradition/Collation.pm
@@ -476,7 +476,7 @@ is( scalar( $sc->readings ), $numr, "There are $numr readings in the graph" );
 is( $sc->end->rank, 14, "There are fourteen ranks in the graph" );
 
 # Detach the erroneously collated reading
-my $newr = $sc->duplicate_reading( 'n131', 'Ba96' );
+my( $newr, @del_rdgs ) = $sc->duplicate_reading( 'n131', 'Ba96' );
 ok( $newr, "New reading was created" );
 ok( $sc->reading('n131_0'), "Detached the bad collation with a new reading" );
 is( scalar( $sc->readings ), $numr + 1, "A reading was added to the graph" );
@@ -485,13 +485,17 @@ my $csucc = $sc->common_successor( 'n131', 'n131_0' );
 is( $csucc->id, 'n136', "Found correct common successor to duped reading" ); 
 
 # Check that the bad transposition is gone
+is( scalar @del_rdgs, 1, "Deleted reading was returned by API call" );
 is( $sc->get_relationship( 'n130', 'n135' ), undef, "Bad transposition relationship is gone" );
 
+# The collation should not be fixed yet
+my @pairs = $sc->identical_readings();
+is( scalar @pairs, 0, "Not re-collated yet" );
 # Fix the collation
 ok( $sc->merge_readings( 'n124', 'n131_0' ), "Collated the readings correctly" );
-my @pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
-is( $sc->end->rank, 11, "The ranks shifted appropriately" );
+@pairs = $sc->identical_readings( start => 'n124', end => $csucc->id );
 is( scalar @pairs, 3, "Found three more identical readings" );
+is( $sc->end->rank, 11, "The ranks shifted appropriately" );
 $sc->flatten_ranks();
 is( scalar( $sc->readings ), $numr - 3, "Now we are collated correctly" );
 
@@ -553,6 +557,7 @@ sub duplicate_reading {
        # remove them. If not, we can skip it.
        my $succ;
        my %rrk;
+       my @deleted_relations;
        if( $self->end->has_rank ) {
                # Find the point where we can stop checking
                $succ = $self->common_successor( $r, $newr );
@@ -573,11 +578,13 @@ sub duplicate_reading {
                        my @noncolo = $rdg->related_readings( sub { !$_[0]->colocated } );
                        next unless @noncolo;
                        foreach my $nc ( @noncolo ) {
-                               $self->relations->verify_or_delete( $rdg, $nc );
+                               unless( $self->relations->verify_or_delete( $rdg, $nc ) ) {
+                                       push( @deleted_relations, [ $rdg->id, $nc->id ] );
+                               }
                        }
                }
        }
-       return $newr;
+       return ( $newr, @deleted_relations );
 }
 
 sub _generate_dup_id {
@@ -696,8 +703,8 @@ around qw/ get_relationship del_relationship / => sub {
        if( @args == 1 && ref( $args[0] ) eq 'ARRAY' ) {
                @args = @{$_[0]};
        }
-       my( $source, $target ) = $self->_stringify_args( @args );
-       $self->$orig( $source, $target );
+       my @stringargs = $self->_stringify_args( @args );
+       $self->$orig( @stringargs );
 };
 
 =head2 reading_witnesses( $reading )
@@ -860,7 +867,8 @@ sub as_dot {
     foreach my $edge ( @edges ) {
        # Do we need to output this edge?
        if( $used{$edge->[0]} && $used{$edge->[1]} ) {
-               my $label = $self->_path_display_label( $self->path_witnesses( $edge ) );
+               my $label = $self->_path_display_label( $opts,
+                       $self->path_witnesses( $edge ) );
                        my $variables = { %edge_attrs, 'label' => $label };
                        
                        # Account for the rank gap if necessary
@@ -920,7 +928,8 @@ sub as_dot {
     
     # Add substitute start and end edges if necessary
     foreach my $node ( keys %substart ) {
-       my $witstr = $self->_path_display_label ( $self->path_witnesses( $substart{$node}, $node ) );
+       my $witstr = $self->_path_display_label( $opts, 
+               $self->path_witnesses( $substart{$node}, $node ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
        my $nrdg = $self->reading( $node );
        if( $nrdg->has_rank && $nrdg->rank > $startrank ) {
@@ -931,7 +940,8 @@ sub as_dot {
         $dot .= "\t\"__SUBSTART__\" -> \"$node\" $varopts;\n";
        }
     foreach my $node ( keys %subend ) {
-       my $witstr = $self->_path_display_label ( $self->path_witnesses( $node, $subend{$node} ) );
+       my $witstr = $self->_path_display_label( $opts,
+               $self->path_witnesses( $node, $subend{$node} ) );
        my $variables = { %edge_attrs, 'label' => $witstr };
         my $varopts = _dot_attr_string( $variables );
         $dot .= "\t\"$node\" -> \"__SUBEND__\" $varopts;\n";
@@ -1004,6 +1014,7 @@ sub path_witnesses {
 # witnesses only where the main witness is not also in the list.
 sub _path_display_label {
        my $self = shift;
+       my $opts = shift;
        my %wits;
        map { $wits{$_} = 1 } @_;
 
@@ -1021,14 +1032,18 @@ sub _path_display_label {
                }
        }
        
-       # See if we are in a majority situation.
-       my $maj = scalar( $self->tradition->witnesses ) * 0.6;
-       $maj = $maj > 5 ? $maj : 5;
-       if( scalar keys %wits > $maj ) {
-               unshift( @disp_ac, 'majority' );
-               return join( ', ', @disp_ac );
-       } else {
+       if( $opts->{'explicit_wits'} ) {
                return join( ', ', sort keys %wits );
+       } else {
+               # See if we are in a majority situation.
+               my $maj = scalar( $self->tradition->witnesses ) * 0.6;
+               $maj = $maj > 5 ? $maj : 5;
+               if( scalar keys %wits > $maj ) {
+                       unshift( @disp_ac, 'majority' );
+                       return join( ', ', @disp_ac );
+               } else {
+                       return join( ', ', sort keys %wits );
+               }
        }
 }
 
@@ -1374,26 +1389,100 @@ sub _add_graphml_data {
 Returns a CSV alignment table representation of the collation graph, one
 row per witness (or witness uncorrected.) 
 
+=head2 as_tsv
+
+Returns a tab-separated alignment table representation of the collation graph,
+one column per witness (or uncorrected witness layer).
+
+=begin testing
+
+use Text::Tradition;
+use Text::CSV;
+
+my $READINGS = 311;
+my $PATHS = 361;
+my $WITS = 13;
+my $WITAC = 4;
+
+my $datafile = 't/data/florilegium_tei_ps.xml';
+my $tradition = Text::Tradition->new( 'input' => 'TEI',
+                                      'name' => 'test0',
+                                      'file' => $datafile,
+                                      'linear' => 1 );
+
+my $c = $tradition->collation;
+# Export the thing to CSV
+my $csvstr = $c->as_csv();
+# Count the columns
+my $csv = Text::CSV->new({ sep_char => ',', binary => 1 });
+my @lines = split(/\n/, $csvstr );
+ok( $csv->parse( $lines[0] ), "Successfully parsed first line of CSV" );
+is( scalar( $csv->fields ), $WITS + $WITAC, "CSV has correct number of witness columns" );
+my $t2 = Text::Tradition->new( input => 'Tabular',
+                                                          name => 'test2',
+                                                          string => $csvstr,
+                                                          sep_char => ',' );
+is( scalar $t2->collation->readings, $READINGS, "Reparsed CSV collation has all readings" );
+is( scalar $t2->collation->paths, $PATHS, "Reparsed CSV collation has all paths" );
+
+# Now do it with TSV
+my $tsvstr = $c->as_tsv();
+my $t3 = Text::Tradition->new( input => 'Tabular',
+                                                          name => 'test3',
+                                                          string => $tsvstr,
+                                                          sep_char => "\t" );
+is( scalar $t3->collation->readings, $READINGS, "Reparsed TSV collation has all readings" );
+is( scalar $t3->collation->paths, $PATHS, "Reparsed TSV collation has all paths" );
+
+my $noaccsv = $c->as_csv({ noac => 1 });
+my @noaclines = split(/\n/, $noaccsv );
+ok( $csv->parse( $noaclines[0] ), "Successfully parsed first line of no-ac CSV" );
+is( scalar( $csv->fields ), $WITS, "CSV has correct number of witness columns" );
+
+
+=end testing
+
 =cut
 
-sub as_csv {
-    my( $self ) = @_;
-    my $table = $self->alignment_table;
-    my $csv = Text::CSV->new( { binary => 1, quote_null => 0 } );    
+sub _tabular {
+    my( $self, $opts ) = @_;
+    my $table = $self->alignment_table( $opts );
+       my $csv_options = { binary => 1, quote_null => 0 };
+       $csv_options->{'sep_char'} = $opts->{fieldsep};
+       if( $opts->{fieldsep} eq "\t" ) {
+               # If it is really tab separated, nothing is an escape char.
+               $csv_options->{'quote_char'} = undef;
+               $csv_options->{'escape_char'} = '';
+       }
+    my $csv = Text::CSV->new( $csv_options );    
     my @result;
     # Make the header row
     $csv->combine( map { $_->{'witness'} } @{$table->{'alignment'}} );
-       push( @result, decode_utf8( $csv->string ) );
+       push( @result, $csv->string );
     # Make the rest of the rows
     foreach my $idx ( 0 .. $table->{'length'} - 1 ) {
        my @rowobjs = map { $_->{'tokens'}->[$idx] } @{$table->{'alignment'}};
        my @row = map { $_ ? $_->{'t'}->text : $_ } @rowobjs;
         $csv->combine( @row );
-        push( @result, decode_utf8( $csv->string ) );
+        push( @result, $csv->string );
     }
     return join( "\n", @result );
 }
 
+sub as_csv {
+       my $self = shift;
+       my $opts = shift || {};
+       $opts->{fieldsep} = ',';
+       return $self->_tabular( $opts );
+}
+
+sub as_tsv {
+       my $self = shift;
+       my $opts = shift || {};
+       $opts->{fieldsep} = "\t";
+       return $self->_tabular( $opts );
+}
+
 =head2 alignment_table
 
 Return a reference to an alignment table, in a slightly enhanced CollateX
@@ -1409,8 +1498,9 @@ format which looks like this:
 =cut
 
 sub alignment_table {
-    my( $self ) = @_;
-    return $self->cached_table if $self->has_cached_table;
+    my( $self, $opts ) = @_;
+    return $self->cached_table 
+       if $self->has_cached_table && !$opts->{noac};
     
     # Make sure we can do this
        throw( "Need a linear graph in order to make an alignment table" )
@@ -1427,7 +1517,7 @@ sub alignment_table {
         my $witobj = { 'witness' => $wit->sigil, 'tokens' => \@row };
         $witobj->{'identifier'} = $wit->identifier if $wit->identifier;
         push( @{$table->{'alignment'}}, $witobj );
-        if( $wit->is_layered ) {
+        if( $wit->is_layered && !$opts->{noac} ) {
                my @wit_ac_path = $self->reading_sequence( $self->start, $self->end, 
                        $wit->sigil.$self->ac_label );
             my @ac_row = _make_witness_row( \@wit_ac_path, \@all_pos );
@@ -1816,7 +1906,6 @@ graph, specified either by node or by rank.
 sub identical_readings {
        my ( $self, %args ) = @_;
     # Find where we should start and end.
-    $DB::single = 1;
     my $startrank = $args{startrank} || 0;
     if( $args{start} ) {
        throw( "Starting reading has no rank" ) unless $self->reading( $args{start} )