X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FTabular.pm;h=a9ce519ac9f003e90bbf8ec1ffec0638b9d36094;hb=9bdf9d67ee6007981f4cc59e844f02de4df7a289;hp=4c1e5114238612b79917d0fb555d7ce00e41a3f7;hpb=1d3104950074a3d7470f01ef0ec8e9046d95b124;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm
index 4c1e511..a9ce519 100644
--- a/lib/Text/Tradition/Parser/Tabular.pm
+++ b/lib/Text/Tradition/Parser/Tabular.pm
@@ -2,7 +2,7 @@ package Text::Tradition::Parser::Tabular;
 
 use strict;
 use warnings;
-use Text::CSV_XS;
+use Text::CSV;
 
 =head1 NAME
 
@@ -68,11 +68,49 @@ is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
 
 ### TODO Check these figures
 if( $t ) {
-    is( scalar $t->collation->readings, 312, "Collation has all readings" );
-    is( scalar $t->collation->paths, 363, "Collation has all paths" );
+    is( scalar $t->collation->readings, 311, "Collation has all readings" );
+    is( scalar $t->collation->paths, 361, "Collation has all paths" );
     is( scalar $t->witnesses, 13, "Collation has all witnesses" );
 }
 
+# Check that we have the right witnesses
+my %seen_wits;
+map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /;
+foreach my $wit ( $t->witnesses ) {
+	$seen_wits{$wit->sigil} = 1;
+}
+is( scalar keys %seen_wits, 13, "No extra witnesses were made" );
+foreach my $k ( keys %seen_wits ) {
+	ok( $seen_wits{$k}, "Witness $k still exists" );
+}
+
+# Check that the witnesses have the right texts
+foreach my $wit ( $t->witnesses ) {
+	my $origtext = join( ' ', @{$wit->text} );
+	my $graphtext = $t->collation->path_text( $wit->sigil );
+	is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil );
+}
+
+# Check that the a.c. witnesses have the right text
+map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /;
+foreach my $k ( keys %seen_wits ) {
+	my $wit = $t->witness( $k );
+	if( $seen_wits{$k} ) {
+		ok( $wit->is_layered, "Witness $k got marked as layered" );
+		ok( $wit->has_layertext, "Witness $k has an a.c. version" );
+		my $origtext = join( ' ', @{$wit->layertext} );
+		my $acsig = $wit->sigil . $t->collation->ac_label;
+		my $graphtext = $t->collation->path_text( $acsig );
+		is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" );
+	} else {
+		ok( !$wit->is_layered, "Witness $k not marked as layered" );
+		ok( !$wit->has_layertext, "Witness $k has no a.c. version" );
+	}
+}	
+
+# Check that we only have collation relationships where we need them
+is( scalar $t->collation->relationships, 3, "Redundant collations were removed" );
+
 =end testing
 
 =cut
@@ -80,10 +118,14 @@ if( $t ) {
 sub parse {
     my( $tradition, $opts ) = @_;
     my $c = $tradition->collation; # shorthand
-    my $csv = Text::CSV_XS->new( { 
-        binary => 1, # binary for UTF-8
-        sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" } 
-        );
+    my $csv_options = { 'binary' => 1 };
+    $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t";
+    if( $csv_options->{'sep_char'} eq "\t" ) {
+    	# If it is really tab separated, nothing is an escape char.
+    	$csv_options->{'quote_char'} = undef;
+    	$csv_options->{'escape_char'} = undef;
+    }
+    my $csv = Text::CSV->new( $csv_options );
     
     my $alignment_table;
     if( exists $opts->{'string' } ) {
@@ -111,22 +153,41 @@ sub parse {
 
     # Set up the witnesses we find in the first line
     my @witnesses;
-    my %ac_wits;  # Track these for later removal
+    my %ac_wits;  # Track layered witness -> main witness mapping
+    my $aclabel = $c->ac_label;
     foreach my $sigil ( @{$alignment_table->[0]} ) {
-        my $wit = $tradition->add_witness( 'sigil' => $sigil );
+        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
+        	# Sanitize the sigil name to an XML name
+        	$sigil = $1 . '_layered';
+            $ac_wits{$sigil} = $1;
+        }
+        my $wit = $tradition->add_witness( 
+        	'sigil' => $sigil, 'sourcetype' => 'collation' );
         $wit->path( [ $c->start ] );
         push( @witnesses, $wit );
         my $aclabel = $c->ac_label;
-        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
-            $ac_wits{$1} = $wit;
-        }
     }
     
+    # Save the original witness text sequences. Have to loop back through
+    # the witness columns after we have identified all the a.c. witnesses.
+    foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) {
+    	my @sequence = map { $_->[$idx] } @{$alignment_table};
+    	my $sigil = shift @sequence;
+    	my $is_layer = exists( $ac_wits{$sigil} );
+    	my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil );	
+    	# Now get rid of gaps and meta-readings like #LACUNA#
+    	my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence;
+    	$is_layer ? $wit->layertext( \@words ) : $wit->text( \@words );
+    }    
+    
+    my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000;
+    print STDERR "Tradition too big for row collation\n" if $nocollate;
+    
     # Now for the next rows, make nodes as necessary, assign their ranks, and 
     # add them to the witness paths.
     foreach my $idx ( 1 .. $#{$alignment_table} ) {
         my $row = $alignment_table->[$idx];
-        my $nodes = make_nodes( $c, $row, $idx );
+        my $nodes = _make_nodes( $c, $row, $idx, $nocollate );
         foreach my $w ( 0 .. $#{$row} ) {
             # push the appropriate node onto the appropriate witness path
             my $word = $row->[$w];
@@ -146,30 +207,11 @@ sub parse {
         my $last_rdg = shift @$p;
         my $new_p = [ $last_rdg ];
         foreach my $rdg ( @$p ) {
-        	$DB::single = 1 if $rdg->id eq '228,1';
-            if( $rdg->text eq '#LACUNA#' ) {
-                # If we are in a lacuna already, drop this node.
-                # Otherwise make a lacuna node and drop this node.
-                unless( $last_rdg->is_lacuna ) {
-                	my $l_id = 'l' . $rdg->id;
-                	my $l;
-                	if( $c->has_reading( $l_id ) ) {
-                		$l = $c->reading( $l_id );
-                	} else {
-                    	$l = $c->add_reading( {
-							'collation' => $c,
-							'id' => $l_id,
-							'is_lacuna' => 1,
-							} );
-					}
-                    push( @$new_p, $l );
-                    $last_rdg = $l;
-                }
-                $c->del_reading( $rdg );
-            } else {
-                # No lacuna, save the reading.
-                push( @$new_p, $rdg );
-            }
+        	# Omit the reading if we are in a lacuna already.
+        	next if $rdg->is_lacuna && $last_rdg->is_lacuna;
+			# Save the reading otherwise.
+			push( @$new_p, $rdg );
+			$last_rdg = $rdg;
         }
         push( @$new_p, $c->end );
         $wit->path( $new_p );
@@ -178,35 +220,88 @@ sub parse {
     # Fold any a.c. witnesses into their main witness objects, and
     # delete the independent a.c. versions.
     foreach my $a ( keys %ac_wits ) {
-        my $main_wit = $tradition->witness( $a );
+    	my $ac_wit = $tradition->witness( $a );
+        my $main_wit = $tradition->witness( $ac_wits{$a} );
         next unless $main_wit;
-        my $ac_wit = $ac_wits{$a};
+        $main_wit->is_layered(1);
         $main_wit->uncorrected_path( $ac_wit->path );
         $tradition->del_witness( $ac_wit );
     }
-
+    
     # Join up the paths.
     $c->make_witness_paths;
+    # Delete our unused lacuna nodes.
+	foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) {
+		$c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg );
+	}
+	
+	# Do a consistency check.
+	foreach my $wit ( $tradition->witnesses ) {
+		my $pathtext = $c->path_text( $wit->sigil );
+		my $origtext = join( ' ', @{$wit->text} );
+		warn "Text differs for witness " . $wit->sigil 
+			unless $pathtext eq $origtext;
+		if( $wit->is_layered ) {
+			$pathtext = $c->path_text( $wit->sigil.$c->ac_label );
+			$origtext = join( ' ', @{$wit->layertext} );
+			warn "Ante-corr text differs for witness " . $wit->sigil
+				unless $pathtext eq $origtext;
+		} else {
+			warn "Text " . $wit->sigil . " has a layered text but is not marked as layered"
+				if $wit->has_layertext;
+		}
+	}
+	
+	# Note that our ranks and common readings are set.
+	$c->_graphcalc_done(1);
+	# Remove redundant collation relationships.
+	$c->relations->filter_collations() unless $nocollate;
 }
 
-sub make_nodes {
-    my( $collation, $row, $index ) = @_;
+sub _make_nodes {
+    my( $collation, $row, $index, $nocollate ) = @_;
     my %unique;
+    my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae.
     foreach my $w ( @$row ) {
         $unique{$w} = 1 if $w;
+        $commonctr +=1 unless ( $w && $w eq '#LACUNA#' );
     }
     my $ctr = 1;
     foreach my $w ( keys %unique ) {
     	my $rargs = {
-    		'collation' => $collation,
-    		'id' => "$index,$ctr",
+    		'id' => "r$index.$ctr",
     		'rank' => $index,
     		'text' => $w,
     		};
+    	if( $w eq '#LACUNA#' ) {
+    		$rargs->{'is_lacuna'} = 1;
+    	} elsif( $commonctr == 1 ) {
+    		$rargs->{'is_common'} = 1;
+    	}
         my $r = $collation->add_reading( $rargs );
         $unique{$w} = $r;
         $ctr++;
     }
+    # Collate this sequence of readings via a single 'collation' relationship.
+    unless( $nocollate ) {
+		my @rankrdgs = values %unique;
+		my $collation_rel;
+		while( @rankrdgs ) {
+			my $r = shift @rankrdgs;
+			next if $r->is_meta;
+			foreach my $nr ( @rankrdgs ) {
+				next if $nr->is_meta;
+				if( $collation_rel ) {
+					$collation->add_relationship( $r, $nr, $collation_rel );
+				} else {
+					$collation->add_relationship( $r, $nr, 
+						{ 'type' => 'collated', 
+						  'annotation' => "Parsed together for rank $index" } );
+					$collation_rel = $collation->get_relationship( $r, $nr );
+				}
+			}
+		}
+	}    
     return \%unique;
 }