From: Tara L Andrews <tla@mit.edu>
Date: Mon, 3 Oct 2011 18:53:36 +0000 (+0200)
Subject: fix Tabular parser to account for a.c. wits; more doc and tests
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3b853983204d888a90c029c1e66d77b9fa9642b5;p=scpubgit%2Fstemmatology.git

fix Tabular parser to account for a.c. wits; more doc and tests
---

diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm
index a978e9e..14b2253 100644
--- a/lib/Text/Tradition.pm
+++ b/lib/Text/Tradition.pm
@@ -14,14 +14,17 @@ has 'collation' => (
     writer => '_save_collation',
     );
 
-has 'witnesses' => (
-    traits => ['Array'],
-    isa => 'ArrayRef[Text::Tradition::Witness]',
+has 'witness_hash' => (
+    traits => ['Hash'],
+    isa => 'HashRef[Text::Tradition::Witness]',
     handles => {
-        witnesses    => 'elements',
-        add_witness  => 'push',
+        witness     => 'get',
+        add_witness => 'set',
+        del_witness => 'delete',
+        has_witness => 'exists',
+        witnesses   => 'values',
     },
-    default => sub { [] },
+    default => sub { {} },
     );
 
 has 'name' => (
@@ -29,16 +32,36 @@ has 'name' => (
     isa => 'Str',
     default => 'Tradition',
     );
-    
+  
+# Create the witness before trying to add it
 around 'add_witness' => sub {
     my $orig = shift;
     my $self = shift;
     # TODO allow add of a Witness object?
     my $new_wit = Text::Tradition::Witness->new( @_ );
-    $self->$orig( $new_wit );
+    $self->$orig( $new_wit->sigil => $new_wit );
     return $new_wit;
 };
 
+# Allow deletion of witness by object as well as by sigil
+around 'del_witness' => sub {
+    my $orig = shift;
+    my $self = shift;
+    my @key_args;
+    foreach my $arg ( @_ ) {
+        push( @key_args, 
+              ref( $arg ) eq 'Text::Tradition::Witness' ? $arg->sigil : $arg );
+    }
+    return $self->$orig( @key_args );
+};
+
+# Don't allow an empty hash value
+around 'witness' => sub {
+    my( $orig, $self, $arg ) = @_;
+    return unless $self->has_witness( $arg );
+    return $self->$orig( $arg );
+};
+
 =head1 NAME
 
 Text::Tradition - a software model for a set of collated texts
@@ -163,6 +186,10 @@ is( ref( $w ), 'Text::Tradition::Witness', "new witness created" );
 is( $w->sigil, 'D', "witness has correct sigil" );
 is( scalar $s->witnesses, 4, "object now has four witnesses" );
 
+my $del = $s->del_witness( 'D' );
+is( $del, $w, "Deleted correct witness" );
+is( scalar $s->witnesses, 3, "object has three witnesses again" );
+
 # TODO test initialization by witness list when we have it
 
 =end testing
@@ -257,20 +284,6 @@ is( $s->witness('X'), undef, "There is no witness X" );
 
 =cut
 
-sub witness {
-    my( $self, $sigil ) = @_;
-    my $requested_wit;
-    foreach my $wit ( $self->witnesses ) {
-        if( $wit->sigil eq $sigil ) {
-            $requested_wit = $wit;
-            last;
-        }
-    }
-    # We depend on an undef return value for no such witness.
-    # warn "No such witness $sigil" unless $requested_wit;
-    return $requested_wit;
-}
-
 no Moose;
 __PACKAGE__->meta->make_immutable;
 
diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm
index 7b68ba2..c7e607d 100644
--- a/lib/Text/Tradition/Parser/TEI.pm
+++ b/lib/Text/Tradition/Parser/TEI.pm
@@ -10,22 +10,65 @@ use XML::LibXML::XPathContext;
 
 Text::Tradition::Parser::TEI
 
+=head1 SYNOPSIS
+
+  use Text::Tradition;
+  
+  my $t_from_file = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'TEI',
+    'file' => '/path/to/parallel_seg_file.xml'
+    );
+    
+  my $t_from_string = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'TEI',
+    'string' => $parallel_seg_xml,
+    );
+
+
 =head1 DESCRIPTION
 
-Parser module for Text::Tradition, given a TEI parallel-segmentation
-file that describes a text and its variants.
+Parser module for Text::Tradition, given a TEI parallel-segmentation file
+that describes a text and its variants.  Normally called upon
+initialization of Text::Tradition.
+
+The witnesses for the tradition are taken from the <listWit/> element
+within the TEI header; the readings are taken from any <p/> element that
+appears in the text body (including <head/> elements therein.)
 
 =head1 METHODS
 
 =over
 
-=item B<parse>
+=item B<parse>( $tradition, $option_hash )
+
+Takes an initialized tradition and a set of options; creates the
+appropriate nodes and edges on the graph, as well as the appropriate
+witness objects.  The $option_hash must contain either a 'file' or a
+'string' argument with the XML to be parsed.
+
+=begin testing
+
+use Text::Tradition;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my $par_seg = 't/data/florilegium_tei_ps.xml';
+my $t = Text::Tradition->new( 
+    'name'  => 'inline', 
+    'input' => 'TEI',
+    'file'  => $par_seg,
+    );
 
-parse( $tei_string );
+is( ref( $t ), 'Text::Tradition', "Parsed parallel-segmentation TEI" );
+if( $t ) {
+    is( scalar $t->collation->readings, 319, "Collation has all readings" );
+    is( scalar $t->collation->paths, 2854, "Collation has all paths" );
+}
 
-Takes an initialized tradition and a string containing the TEI;
-creates the appropriate nodes and edges on the graph, as well as
-the appropriate witness objects.
+=end testing
 
 =cut
 
@@ -41,7 +84,7 @@ my $app_count;          # Keep track of how many apps we have
 # is considered a bad idea.  The long way round then.
 my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM ) 
     = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' );
-sub make_tagnames {
+sub _make_tagnames {
     my( $ns ) = @_;
     if( $ns ) {
         $LISTWIT = "$ns:$LISTWIT";
@@ -77,7 +120,7 @@ sub parse {
         $ns = 'tei';
         $xpc->registerNs( $ns, $tei->namespaceURI );
     }
-    make_tagnames( $ns );
+    _make_tagnames( $ns );
 
     # Then get the witnesses and create the witness objects.
     foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) {
@@ -89,7 +132,7 @@ sub parse {
 
     # Look for all word/seg node IDs and note their pre-existence.
     my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" );
-    save_preexisting_nodeids( @attrs );
+    _save_preexisting_nodeids( @attrs );
 
     # Count up how many apps we have.
     my @apps = $xpc->findnodes( "//$APP" );
@@ -143,11 +186,8 @@ sub parse {
             $tradition->witness( $sig )->uncorrected_path( \@uncorrected );
         }
     }
-    # Delete readings that are no longer part of the graph.
-    # TODO think this is useless actually
-    foreach ( keys %$substitutions ) {
-        $tradition->collation->del_reading( $tradition->collation->reading( $_ ) );
-    }
+    
+    # Calculate the ranks for the nodes.
     $tradition->collation->calculate_ranks();
     
     # Now that we have ranks, see if we have distinct nodes with identical
@@ -205,6 +245,12 @@ sub _return_rdg {
     return $real;
 }
 
+=begin testing
+
+## TODO test specific sorts of nodes of the parallel-seg XML.
+
+=end testing
+
 ## Recursive helper function to help us navigate through nested XML,
 ## picking out the text.  $tradition is the tradition, needed for
 ## making readings; $xn is the XML node currently being looked at,
@@ -237,7 +283,7 @@ sub _return_rdg {
             foreach my $w ( split( /\s+/, $str ) ) {
                 # For now, skip punctuation.
                 next if $w !~ /[[:alnum:]]/;
-                my $rdg = make_reading( $tradition->collation, $w );
+                my $rdg = _make_reading( $tradition->collation, $w );
                 push( @new_readings, $rdg );
                 unless( $in_var ) {
                     $rdg->make_common;
@@ -256,7 +302,7 @@ sub _return_rdg {
                 warn "$c is not among active wits" unless $active_wits{$c};
             }
             my $xml_id = $xn->getAttribute( 'xml:id' );
-            my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id );
+            my $rdg = _make_reading( $tradition->collation, $xn->textContent, $xml_id );
             push( @new_readings, $rdg );
             unless( $in_var ) {
                 $rdg->make_common;
@@ -294,7 +340,7 @@ sub _return_rdg {
             #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n";
             # TODO handle p.c. and s.l. designations too
             $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.';
-            my @rdg_wits = get_sigla( $xn );
+            my @rdg_wits = _get_sigla( $xn );
             @rdg_wits = ( 'base' ) unless @rdg_wits;  # Allow for editorially-supplied readings
             my @words;
             foreach ( $xn->childNodes ) {
@@ -363,8 +409,26 @@ sub _return_rdg {
 
 }
 
+=begin testing
+
+use XML::LibXML;
+use XML::LibXML::XPathContext;
+use Text::Tradition::Parser::TEI;
+
+my $xml_str = '<tei><rdg wit="#A #B #C #D">some text</rdg></tei>';
+my $el = XML::LibXML->new()->parse_string( $xml_str )->documentElement;
+my $xpc = XML::LibXML::XPathContext->new( $el );
+my $obj = $xpc->find( '//rdg' );
+
+my @wits = Text::Tradition::Parser::TEI::_get_sigla( $obj );
+is( join( ' ', @wits) , "A B C D", "correctly parsed reading wit string" );
+
+=end testing
+
+=cut
+
 # Helper to extract a list of witness sigla from a reading element.
-sub get_sigla {
+sub _get_sigla {
     my( $rdg ) = @_;
     # Cope if we have been handed a NodeList.  There is only
     # one reading here.
@@ -388,13 +452,13 @@ sub get_sigla {
     my $word_ctr = 0;
     my %used_nodeids;
 
-    sub save_preexisting_nodeids {
+    sub _save_preexisting_nodeids {
         foreach( @_ ) {
             $used_nodeids{$_->getValue()} = 1;
         }
     }
 
-    sub make_reading {
+    sub _make_reading {
         my( $graph, $word, $xml_id ) = @_;
         if( $xml_id ) {
             if( exists $used_nodeids{$xml_id} ) {
@@ -421,3 +485,21 @@ sub get_sigla {
 }
 
 1;
+
+=head1 BUGS / TODO
+
+=over
+
+=item * More unit testing
+
+=back
+
+=head1 LICENSE
+
+This package is free software and is provided "as is" without express
+or implied warranty.  You can redistribute it and/or modify it under
+the same terms as Perl itself.
+
+=head1 AUTHOR
+
+Tara L Andrews E<lt>aurum@cpan.orgE<gt>
diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm
index 2cf17d0..9daed14 100644
--- a/lib/Text/Tradition/Parser/Tabular.pm
+++ b/lib/Text/Tradition/Parser/Tabular.pm
@@ -8,6 +8,24 @@ use Text::CSV_XS;
 
 Text::Tradition::Parser::Tabular
 
+=head1 SYNOPSIS
+
+  use Text::Tradition;
+  
+  my $t_from_file = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'Tabular',
+    'file' => '/path/to/collation.csv',
+    'sep_char' => ','
+    );
+    
+  my $t_from_string = Text::Tradition->new( 
+    'name' => 'my text',
+    'input' => 'Tabular',
+    'string' => $tab_separated_collation,
+    'sep_char' => "\t",
+    );
+
 =head1 DESCRIPTION
 
 Parser module for Text::Tradition to read an alignment table format, such as CSV.
@@ -16,13 +34,48 @@ Parser module for Text::Tradition to read an alignment table format, such as CSV
 
 =over
 
-=item B<parse>
+=item B<parse>( $tradition, $option_hash )
+
+Takes an initialized tradition and a set of options; creates the
+appropriate nodes and edges on the graph, as well as the appropriate
+witness objects.  The $option_hash must contain either a 'file' or a
+'string' argument with the table to be parsed; it may also contain a 
+'sep_char' argument to specify how the fields are separated.
+
+The table should have witnesses arranged in columns, with the witness sigla
+in the first row.  Empty cells are interpreted as omissions (and thus
+stemmatologically relevant.) Longer lacunae in the text, to be disregarded
+in cladistic analysis, may be represented by filling the appropriate cells
+with the tag '#LACUNA#'.
+
+If a witness name ends in the collation's ac_label, it will be treated as
+an 'ante-correction' version of the 'main' witness whose sigil it shares.
+
+=begin testing
+
+use Text::Tradition;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my $csv = 't/data/florilegium.csv';
+my $t = Text::Tradition->new( 
+    'name'  => 'inline', 
+    'input' => 'Tabular',
+    'file'  => $csv,
+    'sep_char' => ',',
+    );
 
-parse( $graph, $graphml_string );
+is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
 
-Takes an initialized Text::Tradition::Graph object and a string
-containing the GraphML; creates the appropriate nodes and edges on the
-graph.
+### TODO Check these figures
+if( $t ) {
+    is( scalar $t->collation->readings, 313, "Collation has all readings" );
+    is( scalar $t->collation->paths, 2877, "Collation has all paths" );
+    is( scalar $t->witnesses, 13, "Collation has all witnesses" );
+}
+
+=end testing
 
 =cut
 
@@ -33,7 +86,6 @@ sub parse {
         binary => 1, # binary for UTF-8
         sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" } 
         );
-    # TODO Handle being given a file
     
     my $alignment_table;
     if( exists $opts->{'string' } ) {
@@ -59,10 +111,15 @@ sub parse {
 
     # Set up the witnesses we find in the first line
     my @witnesses;
+    my %ac_wits;  # Track these for later removal
     foreach my $sigil ( @{$alignment_table->[0]} ) {
         my $wit = $tradition->add_witness( 'sigil' => $sigil );
         $wit->path( [ $c->start ] );
         push( @witnesses, $wit );
+        my $aclabel = $c->ac_label;
+        if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) {
+            $ac_wits{$1} = $wit;
+        }
     }
     
     # Now for the next rows, make nodes as necessary, assign their ranks, and 
@@ -109,6 +166,16 @@ sub parse {
         $wit->path( $new_p );
     }
     
+    # Fold any a.c. witnesses into their main witness objects, and
+    # delete the independent a.c. versions.
+    foreach my $a ( keys %ac_wits ) {
+        my $main_wit = $tradition->witness( $a );
+        next unless $main_wit;
+        my $ac_wit = $ac_wits{$a};
+        $main_wit->uncorrected_path( $ac_wit->path );
+        $tradition->del_witness( $ac_wit );
+    }
+
     # Join up the paths.
     $c->make_witness_paths;
 }
@@ -130,4 +197,14 @@ sub make_nodes {
     return \%unique;
 }
 
-1;
\ No newline at end of file
+1;
+
+=head1 LICENSE
+
+This package is free software and is provided "as is" without express
+or implied warranty.  You can redistribute it and/or modify it under
+the same terms as Perl itself.
+
+=head1 AUTHOR
+
+Tara L Andrews E<lt>aurum@cpan.orgE<gt>
diff --git a/t/data/florilegium_graphml.csv b/t/data/florilegium.csv
similarity index 100%
rename from t/data/florilegium_graphml.csv
rename to t/data/florilegium.csv
diff --git a/t/text_tradition.t b/t/text_tradition.t
index 7ddc7e1..4c763b9 100644
--- a/t/text_tradition.t
+++ b/t/text_tradition.t
@@ -30,6 +30,10 @@ is( ref( $w ), 'Text::Tradition::Witness', "new witness created" );
 is( $w->sigil, 'D', "witness has correct sigil" );
 is( scalar $s->witnesses, 4, "object now has four witnesses" );
 
+my $del = $s->del_witness( 'D' );
+is( $del, $w, "Deleted correct witness" );
+is( scalar $s->witnesses, 3, "object has three witnesses again" );
+
 # TODO test initialization by witness list when we have it
 }
 
diff --git a/t/text_tradition_parser_tabular.t b/t/text_tradition_parser_tabular.t
new file mode 100644
index 0000000..2696b52
--- /dev/null
+++ b/t/text_tradition_parser_tabular.t
@@ -0,0 +1,37 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Text::Tradition;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my $csv = 't/data/florilegium.csv';
+my $t = Text::Tradition->new( 
+    'name'  => 'inline', 
+    'input' => 'Tabular',
+    'file'  => $csv,
+    'sep_char' => ',',
+    );
+
+is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" );
+
+### TODO Check these figures
+if( $t ) {
+    is( scalar $t->collation->readings, 313, "Collation has all readings" );
+    is( scalar $t->collation->paths, 2877, "Collation has all paths" );
+    is( scalar $t->witnesses, 13, "Collation has all witnesses" );
+}
+}
+
+
+
+
+1;
diff --git a/t/text_tradition_parser_tei.t b/t/text_tradition_parser_tei.t
new file mode 100644
index 0000000..09e9f0b
--- /dev/null
+++ b/t/text_tradition_parser_tei.t
@@ -0,0 +1,57 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Test::More 'no_plan';
+$| = 1;
+
+
+
+# =begin testing
+{
+use Text::Tradition;
+binmode STDOUT, ":utf8";
+binmode STDERR, ":utf8";
+eval { no warnings; binmode $DB::OUT, ":utf8"; };
+
+my $par_seg = 't/data/florilegium_tei_ps.xml';
+my $t = Text::Tradition->new( 
+    'name'  => 'inline', 
+    'input' => 'TEI',
+    'file'  => $par_seg,
+    );
+
+is( ref( $t ), 'Text::Tradition', "Parsed parallel-segmentation TEI" );
+if( $t ) {
+    is( scalar $t->collation->readings, 319, "Collation has all readings" );
+    is( scalar $t->collation->paths, 2854, "Collation has all paths" );
+}
+}
+
+
+
+# =begin testing
+{
+## TODO test specific sorts of nodes of the parallel-seg XML.
+}
+
+
+
+# =begin testing
+{
+use XML::LibXML;
+use XML::LibXML::XPathContext;
+use Text::Tradition::Parser::TEI;
+
+my $xml_str = '<tei><rdg wit="#A #B #C #D">some text</rdg></tei>';
+my $el = XML::LibXML->new()->parse_string( $xml_str )->documentElement;
+my $xpc = XML::LibXML::XPathContext->new( $el );
+my $obj = $xpc->find( '//rdg' );
+
+my @wits = Text::Tradition::Parser::TEI::_get_sigla( $obj );
+is( join( ' ', @wits) , "A B C D", "correctly parsed reading wit string" );
+}
+
+
+
+
+1;