From: Tara L Andrews <tla@mit.edu>
Date: Wed, 21 Sep 2011 13:35:55 +0000 (+0200)
Subject: add support for lacunas within the witnesses
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=eca160573f4f28f53faa7dd97ee5cb6740a4b517;p=scpubgit%2Fstemmatology.git

add support for lacunas within the witnesses
---

diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm
index 7d48cfc..ea55378 100644
--- a/lib/Text/Tradition/Collation.pm
+++ b/lib/Text/Tradition/Collation.pm
@@ -17,6 +17,7 @@ has 'graph' => (
     isa => 'Graph::Easy',
     handles => {
         add_reading => 'add_node',
+        add_lacuna => 'add_node',
         del_reading => 'del_node',
         del_segment => 'del_node',
         add_path => 'add_edge',
@@ -121,6 +122,15 @@ sub BUILD {
     $self->graph->set_attribute( 'node', 'shape', $shape );
 }
 
+around add_lacuna => sub {
+    my $orig = shift;
+    my $self = shift;
+    my $id = shift @_;
+    my $l = $self->$orig( '#LACUNA_' . $id . '#' );
+    $l->is_lacuna( 1 );
+    return $l;
+};
+
 # Wrapper around add_path 
 
 around add_path => sub {
@@ -296,7 +306,7 @@ sub as_svg {
     my( $svg, $err );
     my $dotfile = File::Temp->new();
     ## TODO REMOVE
-    $dotfile->unlink_on_destroy(0);
+    # $dotfile->unlink_on_destroy(0);
     binmode $dotfile, ':utf8';
     print $dotfile $self->as_dot();
     push( @cmd, $dotfile->filename );
@@ -518,6 +528,7 @@ sub make_alignment_table {
     my $table;
     my @all_pos = sort { $a <=> $b } $self->possible_positions;
     foreach my $wit ( $self->tradition->witnesses ) {
+        # print STDERR "Making witness row(s) for " . $wit->sigil . "\n";
         my @row = _make_witness_row( $wit->path, \@all_pos );
         unshift( @row, $wit->sigil );
         push( @$table, \@row );
@@ -537,10 +548,22 @@ sub _make_witness_row {
     my %char_hash;
     map { $char_hash{$_} = undef } @$positions;
     foreach my $rdg ( @$path ) {
-        $char_hash{$rdg->rank} = $rdg->text;
+        my $rtext = $rdg->text;
+        $rtext = '#LACUNA#' if $rdg->is_lacuna;
+        $char_hash{$rdg->rank} = $rtext;
     }
     my @row = map { $char_hash{$_} } @$positions;
-    return @row;
+    # Fill in lacuna markers for undef spots in the row
+    my $last_el = shift @row;
+    my @filled_row = ( $last_el );
+    foreach my $el ( @row ) {
+        if( $last_el && $last_el eq '#LACUNA#' && !defined $el ) {
+            $el = '#LACUNA#';
+        }
+        push( @filled_row, $el );
+        $last_el = $el;
+    }
+    return @filled_row;
 }
 
 # Helper to turn the witnesses along columns rather than rows.  Assumes
diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm
index 5c1d866..f598418 100644
--- a/lib/Text/Tradition/Collation/Reading.pm
+++ b/lib/Text/Tradition/Collation/Reading.pm
@@ -17,6 +17,11 @@ has 'rank' => (
     isa => 'Int',
     predicate => 'has_rank',
     );
+    
+has 'is_lacuna' => (
+    is => 'rw',
+    isa => 'Bool',
+    );
 
 # This contains an array of reading objects; the array is a pool,
 # shared by the reading objects inside the pool.  When a reading is
@@ -63,7 +68,11 @@ sub text {
     # Wrapper function around 'label' attribute.
     my $self = shift;
     if( @_ ) {
-	$self->set_attribute( 'label', $_[0] );
+        if( defined $_[0] ) {
+        	$self->set_attribute( 'label', $_[0] );
+        } else {
+            $self->del_attribute( 'label' );
+        }
     }
     return $self->label;
 }
diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm
index 68172f7..0af2e64 100644
--- a/lib/Text/Tradition/Parser/TEI.pm
+++ b/lib/Text/Tradition/Parser/TEI.pm
@@ -33,6 +33,7 @@ my $text = {}; # Hash of arrays, one per eventual witness we find.
 my $substitutions = {}; # Keep track of merged readings
 my $app_anchors = {};   # Track apparatus references
 my $app_ac = {};        # Save a.c. readings
+my $app_count;          # Keep track of how many apps we have
 
 # Create the package variables for tag names.
 
@@ -76,12 +77,16 @@ sub parse {
         my $source = $wit_el->toString();
         $tradition->add_witness( sigil => $sig, source => $source );
     }
-
     map { $text->{$_->sigil} = [] } $tradition->witnesses;
+
     # Look for all word/seg node IDs and note their pre-existence.
     my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" );
     save_preexisting_nodeids( @attrs );
 
+    # Count up how many apps we have.
+    my @apps = $xpc->findnodes( "//$APP" );
+    $app_count = scalar( @apps );
+
     # Now go through the children of the text element and pull out the
     # actual text.
     foreach my $xml_el ( $xpc->findnodes( "//$TEXT" ) ) {
@@ -197,6 +202,7 @@ sub _return_rdg {
 {
     my @active_wits;
     my $current_app;
+    my $seen_apps;
 
     sub _get_readings {
         my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_;
@@ -249,6 +255,7 @@ sub _return_rdg {
                 push( @{$text->{$_}}, $rdg ) unless $ac;
             }
         } elsif ( $xn->nodeName eq 'app' ) {
+            $seen_apps++;
             $current_app = $xn->getAttribute( 'xml:id' );
             # print STDERR "Handling app $current_app\n";
             # Keep the reading sets in this app.
@@ -310,6 +317,14 @@ sub _return_rdg {
             #print STDERR "Handling witEnd\n";
             my $regexp = '^(' . join( '|', @cur_wits ) . ')$';
             @active_wits = grep { $_ !~ /$regexp/ } @active_wits;
+            # Record a lacuna, unless this is the last app.
+            unless( $seen_apps == $app_count ) {
+                foreach my $i ( 0 .. $#cur_wits ) {
+                    my $w = $cur_wits[$i];
+                    my $l = $tradition->collation->add_lacuna( $current_app . "_$i" );
+                    push( @{$text->{$w}}, $l );
+                }
+            }
         } elsif( $xn->nodeName eq 'witDetail' ) {
             # Ignore these for now.
             return;
diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm
index 10d0730..16e2863 100644
--- a/lib/Text/Tradition/Parser/Tabular.pm
+++ b/lib/Text/Tradition/Parser/Tabular.pm
@@ -30,7 +30,8 @@ sub parse {
     my( $tradition, $tab_str ) = @_;
     # TODO Allow setting of sep_char
     my $c = $tradition->collation; # shorthand
-    my $csv = Text::CSV_XS->new( { binary => 1 } ); # binary for UTF-8
+    my $csv = Text::CSV_XS->new( { binary => 1, # binary for UTF-8
+        sep_char => "\t" } );
     my @lines = split( "\n", $tab_str );
     # Conveniently, we are basically receiving exactly the sort of alignment table
     # we might want to produce later.  May as well save it.
@@ -69,10 +70,32 @@ sub parse {
         }
     }
     
-    # Push the end node onto all paths.
+    
+    # Collapse our lacunae into a single node and
+    # push the end node onto all paths.
     $c->end->rank( scalar @$alignment_table );
     foreach my $wit ( @witnesses ) {
-        push( @{$wit->path}, $c->end );
+        my $p = $wit->path;
+        my $last_rdg = shift @$p;
+        my $new_p = [ $last_rdg ];
+        foreach my $rdg ( @$p ) {
+            if( $rdg->text eq '#LACUNA#' ) {
+                # If we are in a lacuna already, drop this node.
+                # Otherwise make a lacuna node and drop this node.
+                unless( $last_rdg->is_lacuna ) {
+                    my $l = $c->add_lacuna( $rdg->name );
+                    $l->rank( $rdg->rank );
+                    push( @$new_p, $l );
+                    $last_rdg = $l;
+                }
+                $c->del_reading( $rdg );
+            } else {
+                # No lacuna, save the reading.
+                push( @$new_p, $rdg );
+            }
+        }
+        push( @$new_p, $c->end );
+        $wit->path( $new_p );
     }
     
     # Join up the paths.
diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm
index d4466aa..5928016 100644
--- a/lib/Text/Tradition/Stemma.pm
+++ b/lib/Text/Tradition/Stemma.pm
@@ -54,7 +54,9 @@ sub convert_characters {
     # This is a simple algorithm that treats every reading as different.
     # Eventually we will want to be able to specify how relationships
     # affect the character matrix.
-    my %unique = ( '__UNDEF__' => 'X' );
+    my %unique = ( '__UNDEF__' => 'X',
+                   '#LACUNA#'  => '?',
+                 );
     my $ctr = 0;
     foreach my $word ( @$row ) {
         if( $word && !exists $unique{$word} ) {
@@ -87,6 +89,8 @@ sub run_pars {
 
     # Set up a temporary directory for all the default Phylip files.
     my $phylip_dir = File::Temp->newdir();
+    print STDERR $phylip_dir . "\n";
+    # $phylip_dir->unlink_on_destroy(0);
     # We need an infile, and we need a command input file.
     open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile";
     print MATRIX $self->pars_input();