From: Tara L Andrews Date: Mon, 3 Oct 2011 18:53:36 +0000 (+0200) Subject: fix Tabular parser to account for a.c. wits; more doc and tests X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=3b853983204d888a90c029c1e66d77b9fa9642b5;p=scpubgit%2Fstemmatology.git fix Tabular parser to account for a.c. wits; more doc and tests --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index a978e9e..14b2253 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -14,14 +14,17 @@ has 'collation' => ( writer => '_save_collation', ); -has 'witnesses' => ( - traits => ['Array'], - isa => 'ArrayRef[Text::Tradition::Witness]', +has 'witness_hash' => ( + traits => ['Hash'], + isa => 'HashRef[Text::Tradition::Witness]', handles => { - witnesses => 'elements', - add_witness => 'push', + witness => 'get', + add_witness => 'set', + del_witness => 'delete', + has_witness => 'exists', + witnesses => 'values', }, - default => sub { [] }, + default => sub { {} }, ); has 'name' => ( @@ -29,16 +32,36 @@ has 'name' => ( isa => 'Str', default => 'Tradition', ); - + +# Create the witness before trying to add it around 'add_witness' => sub { my $orig = shift; my $self = shift; # TODO allow add of a Witness object? my $new_wit = Text::Tradition::Witness->new( @_ ); - $self->$orig( $new_wit ); + $self->$orig( $new_wit->sigil => $new_wit ); return $new_wit; }; +# Allow deletion of witness by object as well as by sigil +around 'del_witness' => sub { + my $orig = shift; + my $self = shift; + my @key_args; + foreach my $arg ( @_ ) { + push( @key_args, + ref( $arg ) eq 'Text::Tradition::Witness' ? $arg->sigil : $arg ); + } + return $self->$orig( @key_args ); +}; + +# Don't allow an empty hash value +around 'witness' => sub { + my( $orig, $self, $arg ) = @_; + return unless $self->has_witness( $arg ); + return $self->$orig( $arg ); +}; + =head1 NAME Text::Tradition - a software model for a set of collated texts @@ -163,6 +186,10 @@ is( ref( $w ), 'Text::Tradition::Witness', "new witness created" ); is( $w->sigil, 'D', "witness has correct sigil" ); is( scalar $s->witnesses, 4, "object now has four witnesses" ); +my $del = $s->del_witness( 'D' ); +is( $del, $w, "Deleted correct witness" ); +is( scalar $s->witnesses, 3, "object has three witnesses again" ); + # TODO test initialization by witness list when we have it =end testing @@ -257,20 +284,6 @@ is( $s->witness('X'), undef, "There is no witness X" ); =cut -sub witness { - my( $self, $sigil ) = @_; - my $requested_wit; - foreach my $wit ( $self->witnesses ) { - if( $wit->sigil eq $sigil ) { - $requested_wit = $wit; - last; - } - } - # We depend on an undef return value for no such witness. - # warn "No such witness $sigil" unless $requested_wit; - return $requested_wit; -} - no Moose; __PACKAGE__->meta->make_immutable; diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 7b68ba2..c7e607d 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -10,22 +10,65 @@ use XML::LibXML::XPathContext; Text::Tradition::Parser::TEI +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'TEI', + 'file' => '/path/to/parallel_seg_file.xml' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'TEI', + 'string' => $parallel_seg_xml, + ); + + =head1 DESCRIPTION -Parser module for Text::Tradition, given a TEI parallel-segmentation -file that describes a text and its variants. +Parser module for Text::Tradition, given a TEI parallel-segmentation file +that describes a text and its variants. Normally called upon +initialization of Text::Tradition. + +The witnesses for the tradition are taken from the element +within the TEI header; the readings are taken from any

element that +appears in the text body (including elements therein.) =head1 METHODS =over -=item B +=item B( $tradition, $option_hash ) + +Takes an initialized tradition and a set of options; creates the +appropriate nodes and edges on the graph, as well as the appropriate +witness objects. The $option_hash must contain either a 'file' or a +'string' argument with the XML to be parsed. + +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $par_seg = 't/data/florilegium_tei_ps.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'TEI', + 'file' => $par_seg, + ); -parse( $tei_string ); +is( ref( $t ), 'Text::Tradition', "Parsed parallel-segmentation TEI" ); +if( $t ) { + is( scalar $t->collation->readings, 319, "Collation has all readings" ); + is( scalar $t->collation->paths, 2854, "Collation has all paths" ); +} -Takes an initialized tradition and a string containing the TEI; -creates the appropriate nodes and edges on the graph, as well as -the appropriate witness objects. +=end testing =cut @@ -41,7 +84,7 @@ my $app_count; # Keep track of how many apps we have # is considered a bad idea. The long way round then. my( $LISTWIT, $WITNESS, $TEXT, $W, $SEG, $APP, $RDG, $LEM ) = ( 'listWit', 'witness', 'text', 'w', 'seg', 'app', 'rdg', 'lem' ); -sub make_tagnames { +sub _make_tagnames { my( $ns ) = @_; if( $ns ) { $LISTWIT = "$ns:$LISTWIT"; @@ -77,7 +120,7 @@ sub parse { $ns = 'tei'; $xpc->registerNs( $ns, $tei->namespaceURI ); } - make_tagnames( $ns ); + _make_tagnames( $ns ); # Then get the witnesses and create the witness objects. foreach my $wit_el ( $xpc->findnodes( "//$LISTWIT/$WITNESS" ) ) { @@ -89,7 +132,7 @@ sub parse { # Look for all word/seg node IDs and note their pre-existence. my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); - save_preexisting_nodeids( @attrs ); + _save_preexisting_nodeids( @attrs ); # Count up how many apps we have. my @apps = $xpc->findnodes( "//$APP" ); @@ -143,11 +186,8 @@ sub parse { $tradition->witness( $sig )->uncorrected_path( \@uncorrected ); } } - # Delete readings that are no longer part of the graph. - # TODO think this is useless actually - foreach ( keys %$substitutions ) { - $tradition->collation->del_reading( $tradition->collation->reading( $_ ) ); - } + + # Calculate the ranks for the nodes. $tradition->collation->calculate_ranks(); # Now that we have ranks, see if we have distinct nodes with identical @@ -205,6 +245,12 @@ sub _return_rdg { return $real; } +=begin testing + +## TODO test specific sorts of nodes of the parallel-seg XML. + +=end testing + ## Recursive helper function to help us navigate through nested XML, ## picking out the text. $tradition is the tradition, needed for ## making readings; $xn is the XML node currently being looked at, @@ -237,7 +283,7 @@ sub _return_rdg { foreach my $w ( split( /\s+/, $str ) ) { # For now, skip punctuation. next if $w !~ /[[:alnum:]]/; - my $rdg = make_reading( $tradition->collation, $w ); + my $rdg = _make_reading( $tradition->collation, $w ); push( @new_readings, $rdg ); unless( $in_var ) { $rdg->make_common; @@ -256,7 +302,7 @@ sub _return_rdg { warn "$c is not among active wits" unless $active_wits{$c}; } my $xml_id = $xn->getAttribute( 'xml:id' ); - my $rdg = make_reading( $tradition->collation, $xn->textContent, $xml_id ); + my $rdg = _make_reading( $tradition->collation, $xn->textContent, $xml_id ); push( @new_readings, $rdg ); unless( $in_var ) { $rdg->make_common; @@ -294,7 +340,7 @@ sub _return_rdg { #print STDERR "Handling reading for " . $xn->getAttribute( 'wit' ) . "\n"; # TODO handle p.c. and s.l. designations too $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; - my @rdg_wits = get_sigla( $xn ); + my @rdg_wits = _get_sigla( $xn ); @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings my @words; foreach ( $xn->childNodes ) { @@ -363,8 +409,26 @@ sub _return_rdg { } +=begin testing + +use XML::LibXML; +use XML::LibXML::XPathContext; +use Text::Tradition::Parser::TEI; + +my $xml_str = 'some text'; +my $el = XML::LibXML->new()->parse_string( $xml_str )->documentElement; +my $xpc = XML::LibXML::XPathContext->new( $el ); +my $obj = $xpc->find( '//rdg' ); + +my @wits = Text::Tradition::Parser::TEI::_get_sigla( $obj ); +is( join( ' ', @wits) , "A B C D", "correctly parsed reading wit string" ); + +=end testing + +=cut + # Helper to extract a list of witness sigla from a reading element. -sub get_sigla { +sub _get_sigla { my( $rdg ) = @_; # Cope if we have been handed a NodeList. There is only # one reading here. @@ -388,13 +452,13 @@ sub get_sigla { my $word_ctr = 0; my %used_nodeids; - sub save_preexisting_nodeids { + sub _save_preexisting_nodeids { foreach( @_ ) { $used_nodeids{$_->getValue()} = 1; } } - sub make_reading { + sub _make_reading { my( $graph, $word, $xml_id ) = @_; if( $xml_id ) { if( exists $used_nodeids{$xml_id} ) { @@ -421,3 +485,21 @@ sub get_sigla { } 1; + +=head1 BUGS / TODO + +=over + +=item * More unit testing + +=back + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index 2cf17d0..9daed14 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -8,6 +8,24 @@ use Text::CSV_XS; Text::Tradition::Parser::Tabular +=head1 SYNOPSIS + + use Text::Tradition; + + my $t_from_file = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Tabular', + 'file' => '/path/to/collation.csv', + 'sep_char' => ',' + ); + + my $t_from_string = Text::Tradition->new( + 'name' => 'my text', + 'input' => 'Tabular', + 'string' => $tab_separated_collation, + 'sep_char' => "\t", + ); + =head1 DESCRIPTION Parser module for Text::Tradition to read an alignment table format, such as CSV. @@ -16,13 +34,48 @@ Parser module for Text::Tradition to read an alignment table format, such as CSV =over -=item B +=item B( $tradition, $option_hash ) + +Takes an initialized tradition and a set of options; creates the +appropriate nodes and edges on the graph, as well as the appropriate +witness objects. The $option_hash must contain either a 'file' or a +'string' argument with the table to be parsed; it may also contain a +'sep_char' argument to specify how the fields are separated. + +The table should have witnesses arranged in columns, with the witness sigla +in the first row. Empty cells are interpreted as omissions (and thus +stemmatologically relevant.) Longer lacunae in the text, to be disregarded +in cladistic analysis, may be represented by filling the appropriate cells +with the tag '#LACUNA#'. + +If a witness name ends in the collation's ac_label, it will be treated as +an 'ante-correction' version of the 'main' witness whose sigil it shares. + +=begin testing + +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $csv = 't/data/florilegium.csv'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'Tabular', + 'file' => $csv, + 'sep_char' => ',', + ); -parse( $graph, $graphml_string ); +is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); -Takes an initialized Text::Tradition::Graph object and a string -containing the GraphML; creates the appropriate nodes and edges on the -graph. +### TODO Check these figures +if( $t ) { + is( scalar $t->collation->readings, 313, "Collation has all readings" ); + is( scalar $t->collation->paths, 2877, "Collation has all paths" ); + is( scalar $t->witnesses, 13, "Collation has all witnesses" ); +} + +=end testing =cut @@ -33,7 +86,6 @@ sub parse { binary => 1, # binary for UTF-8 sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" } ); - # TODO Handle being given a file my $alignment_table; if( exists $opts->{'string' } ) { @@ -59,10 +111,15 @@ sub parse { # Set up the witnesses we find in the first line my @witnesses; + my %ac_wits; # Track these for later removal foreach my $sigil ( @{$alignment_table->[0]} ) { my $wit = $tradition->add_witness( 'sigil' => $sigil ); $wit->path( [ $c->start ] ); push( @witnesses, $wit ); + my $aclabel = $c->ac_label; + if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { + $ac_wits{$1} = $wit; + } } # Now for the next rows, make nodes as necessary, assign their ranks, and @@ -109,6 +166,16 @@ sub parse { $wit->path( $new_p ); } + # Fold any a.c. witnesses into their main witness objects, and + # delete the independent a.c. versions. + foreach my $a ( keys %ac_wits ) { + my $main_wit = $tradition->witness( $a ); + next unless $main_wit; + my $ac_wit = $ac_wits{$a}; + $main_wit->uncorrected_path( $ac_wit->path ); + $tradition->del_witness( $ac_wit ); + } + # Join up the paths. $c->make_witness_paths; } @@ -130,4 +197,14 @@ sub make_nodes { return \%unique; } -1; \ No newline at end of file +1; + +=head1 LICENSE + +This package is free software and is provided "as is" without express +or implied warranty. You can redistribute it and/or modify it under +the same terms as Perl itself. + +=head1 AUTHOR + +Tara L Andrews Eaurum@cpan.orgE diff --git a/t/data/florilegium_graphml.csv b/t/data/florilegium.csv similarity index 100% rename from t/data/florilegium_graphml.csv rename to t/data/florilegium.csv diff --git a/t/text_tradition.t b/t/text_tradition.t index 7ddc7e1..4c763b9 100644 --- a/t/text_tradition.t +++ b/t/text_tradition.t @@ -30,6 +30,10 @@ is( ref( $w ), 'Text::Tradition::Witness', "new witness created" ); is( $w->sigil, 'D', "witness has correct sigil" ); is( scalar $s->witnesses, 4, "object now has four witnesses" ); +my $del = $s->del_witness( 'D' ); +is( $del, $w, "Deleted correct witness" ); +is( scalar $s->witnesses, 3, "object has three witnesses again" ); + # TODO test initialization by witness list when we have it } diff --git a/t/text_tradition_parser_tabular.t b/t/text_tradition_parser_tabular.t new file mode 100644 index 0000000..2696b52 --- /dev/null +++ b/t/text_tradition_parser_tabular.t @@ -0,0 +1,37 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $csv = 't/data/florilegium.csv'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'Tabular', + 'file' => $csv, + 'sep_char' => ',', + ); + +is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); + +### TODO Check these figures +if( $t ) { + is( scalar $t->collation->readings, 313, "Collation has all readings" ); + is( scalar $t->collation->paths, 2877, "Collation has all paths" ); + is( scalar $t->witnesses, 13, "Collation has all witnesses" ); +} +} + + + + +1; diff --git a/t/text_tradition_parser_tei.t b/t/text_tradition_parser_tei.t new file mode 100644 index 0000000..09e9f0b --- /dev/null +++ b/t/text_tradition_parser_tei.t @@ -0,0 +1,57 @@ +#!/usr/bin/perl -w + +use strict; +use Test::More 'no_plan'; +$| = 1; + + + +# =begin testing +{ +use Text::Tradition; +binmode STDOUT, ":utf8"; +binmode STDERR, ":utf8"; +eval { no warnings; binmode $DB::OUT, ":utf8"; }; + +my $par_seg = 't/data/florilegium_tei_ps.xml'; +my $t = Text::Tradition->new( + 'name' => 'inline', + 'input' => 'TEI', + 'file' => $par_seg, + ); + +is( ref( $t ), 'Text::Tradition', "Parsed parallel-segmentation TEI" ); +if( $t ) { + is( scalar $t->collation->readings, 319, "Collation has all readings" ); + is( scalar $t->collation->paths, 2854, "Collation has all paths" ); +} +} + + + +# =begin testing +{ +## TODO test specific sorts of nodes of the parallel-seg XML. +} + + + +# =begin testing +{ +use XML::LibXML; +use XML::LibXML::XPathContext; +use Text::Tradition::Parser::TEI; + +my $xml_str = 'some text'; +my $el = XML::LibXML->new()->parse_string( $xml_str )->documentElement; +my $xpc = XML::LibXML::XPathContext->new( $el ); +my $obj = $xpc->find( '//rdg' ); + +my @wits = Text::Tradition::Parser::TEI::_get_sigla( $obj ); +is( join( ' ', @wits) , "A B C D", "correctly parsed reading wit string" ); +} + + + + +1;