From: Tara L Andrews Date: Tue, 20 Dec 2011 13:39:27 +0000 (+0100) Subject: cope with some parsing niggles X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a7fb313395e449d44e91da2bb6a217451cae57ec;p=scpubgit%2Fstemmatology.git cope with some parsing niggles --- diff --git a/META.yml b/META.yml index 18a6c3b..64a5c0f 100644 --- a/META.yml +++ b/META.yml @@ -1,13 +1,14 @@ --- -abstract: ~ +abstract: 'a software model for a set of collated texts' author: - 'Tara L Andrews ' build_requires: - ExtUtils::MakeMaker: 6.42 + ExtUtils::MakeMaker: 6.56 configure_requires: - ExtUtils::MakeMaker: 6.42 + ExtUtils::MakeMaker: 6.56 distribution_type: module -generated_by: 'Module::Install version 0.91' +dynamic_config: 1 +generated_by: 'Module::Install version 1.00' license: perl meta-spec: url: http://module-build.sourceforge.net/META-spec-v1.4.html @@ -35,7 +36,7 @@ requires: Text::CSV_XS: 0 XML::LibXML: 0 XML::LibXML::XPathContext: 0 - perl: 5.010 + perl: 5.12.0 resources: license: http://dev.perl.org/licenses/ -version: undef +version: 0.1 diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index a86adf7..db18da6 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -129,7 +129,7 @@ sub parse { map { $text->{$_->sigil} = [] } $tradition->witnesses; # Look for all word/seg node IDs and note their pre-existence. - my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); + my @attrs = $xpc->findnodes( "//$W/attribute::xml:id" ); _save_preexisting_nodeids( @attrs ); # Count up how many apps we have. @@ -147,7 +147,6 @@ sub parse { # Join them up. my $c = $tradition->collation; foreach my $sig ( keys %$text ) { - next if $sig eq 'base'; # Skip base text readings with no witnesses. # Determine the list of readings for my $sequence = $text->{$sig}; my @real_sequence = ( $c->start ); @@ -186,7 +185,7 @@ sub parse { } # Calculate the ranks for the nodes. - $tradition->collation->calculate_ranks(); + $tradition->collation->calculate_ranks(); # Now that we have ranks, see if we have distinct nodes with identical # text and identical rank that can be merged. @@ -335,7 +334,7 @@ sub _return_rdg { # TODO handle p.c. and s.l. designations too $ac = $xn->getAttribute( 'type' ) && $xn->getAttribute( 'type' ) eq 'a.c.'; my @rdg_wits = _get_sigla( $xn ); - @rdg_wits = ( 'base' ) unless @rdg_wits; # Allow for editorially-supplied readings + return unless @rdg_wits; # Skip readings that appear in no witnesses my @words; foreach ( $xn->childNodes ) { my @rdg_set = _get_readings( $tradition, $_, 1, $ac, @rdg_wits ); @@ -388,7 +387,8 @@ sub _return_rdg { push( @{$text->{$w}}, $l ); } } - } elsif( $xn->nodeName eq 'witDetail' ) { + } elsif( $xn->nodeName eq 'witDetail' + || $xn->nodeName eq 'note' ) { # Ignore these for now. return; } else { @@ -433,6 +433,7 @@ sub _get_sigla { my @wits; if( ref( $rdg ) eq 'XML::LibXML::Element' ) { my $witstr = $rdg->getAttribute( 'wit' ); + return () unless $witstr; $witstr =~ s/^\s+//; $witstr =~ s/\s+$//; @wits = split( /\s+/, $witstr ); diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index e241d56..28a9109 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -145,14 +145,25 @@ sub convert_characters { my %unique = ( '__UNDEF__' => 'X', '#LACUNA#' => '?', ); + my %count; my $ctr = 0; foreach my $word ( @$row ) { if( $word && !exists $unique{$word} ) { $unique{$word} = chr( 65 + $ctr ); $ctr++; } + $count{$word}++ if $word; } + # Try to keep variants under 8 by lacunizing any singletons. if( scalar( keys %unique ) > 8 ) { + foreach my $word ( keys %count ) { + if( $count{$word} == 1 ) { + $unique{$word} = '?'; + } + } + } + my %u = reverse %unique; + if( scalar( keys %u ) > 8 ) { warn "Have more than 8 variants on this location; phylip will break"; } my @chars = map { $_ ? $unique{$_} : $unique{'__UNDEF__' } } @$row; diff --git a/script/make_tradition.pl b/script/make_tradition.pl index f928c33..bb7cb33 100755 --- a/script/make_tradition.pl +++ b/script/make_tradition.pl @@ -11,8 +11,8 @@ binmode STDERR, ":utf8"; binmode STDOUT, ":utf8"; eval { no warnings; binmode $DB::OUT, ":utf8"; }; -my( $informat, $inbase, $outformat, $help, $linear, $name, $HACK ) - = ( '', '', '', '', 1, 'Tradition', 0 ); +my( $informat, $inbase, $outformat, $help, $linear, $name, $HACK, $sep ) + = ( '', '', '', '', 1, 'Tradition', 0, ',' ); GetOptions( 'i|in=s' => \$informat, 'b|base=s' => \$inbase, @@ -20,6 +20,7 @@ GetOptions( 'i|in=s' => \$informat, 'l|linear!' => \$linear, 'n|name' => \$name, 'h|help' => \$help, + 'sep=s' => \$sep, 'hack' => \$HACK, ); @@ -57,6 +58,7 @@ my %args = ( 'input' => $informat, 'linear' => $linear ); $args{'base'} = $inbase if $inbase; $args{'name'} = $name if $name; +$args{'sep_char'} = $sep if $informat eq 'Tabular'; ### Custom hacking for Stone if( $informat eq 'CollateText' ) { $args{'sigla'} = [ qw/ S M X V Z Bb B K W L / ];