From: Tara L Andrews Date: Wed, 21 Sep 2011 13:35:55 +0000 (+0200) Subject: add support for lacunas within the witnesses X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=eca160573f4f28f53faa7dd97ee5cb6740a4b517;p=scpubgit%2Fstemmatology.git add support for lacunas within the witnesses --- diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 7d48cfc..ea55378 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -17,6 +17,7 @@ has 'graph' => ( isa => 'Graph::Easy', handles => { add_reading => 'add_node', + add_lacuna => 'add_node', del_reading => 'del_node', del_segment => 'del_node', add_path => 'add_edge', @@ -121,6 +122,15 @@ sub BUILD { $self->graph->set_attribute( 'node', 'shape', $shape ); } +around add_lacuna => sub { + my $orig = shift; + my $self = shift; + my $id = shift @_; + my $l = $self->$orig( '#LACUNA_' . $id . '#' ); + $l->is_lacuna( 1 ); + return $l; +}; + # Wrapper around add_path around add_path => sub { @@ -296,7 +306,7 @@ sub as_svg { my( $svg, $err ); my $dotfile = File::Temp->new(); ## TODO REMOVE - $dotfile->unlink_on_destroy(0); + # $dotfile->unlink_on_destroy(0); binmode $dotfile, ':utf8'; print $dotfile $self->as_dot(); push( @cmd, $dotfile->filename ); @@ -518,6 +528,7 @@ sub make_alignment_table { my $table; my @all_pos = sort { $a <=> $b } $self->possible_positions; foreach my $wit ( $self->tradition->witnesses ) { + # print STDERR "Making witness row(s) for " . $wit->sigil . "\n"; my @row = _make_witness_row( $wit->path, \@all_pos ); unshift( @row, $wit->sigil ); push( @$table, \@row ); @@ -537,10 +548,22 @@ sub _make_witness_row { my %char_hash; map { $char_hash{$_} = undef } @$positions; foreach my $rdg ( @$path ) { - $char_hash{$rdg->rank} = $rdg->text; + my $rtext = $rdg->text; + $rtext = '#LACUNA#' if $rdg->is_lacuna; + $char_hash{$rdg->rank} = $rtext; } my @row = map { $char_hash{$_} } @$positions; - return @row; + # Fill in lacuna markers for undef spots in the row + my $last_el = shift @row; + my @filled_row = ( $last_el ); + foreach my $el ( @row ) { + if( $last_el && $last_el eq '#LACUNA#' && !defined $el ) { + $el = '#LACUNA#'; + } + push( @filled_row, $el ); + $last_el = $el; + } + return @filled_row; } # Helper to turn the witnesses along columns rather than rows. Assumes diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 5c1d866..f598418 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -17,6 +17,11 @@ has 'rank' => ( isa => 'Int', predicate => 'has_rank', ); + +has 'is_lacuna' => ( + is => 'rw', + isa => 'Bool', + ); # This contains an array of reading objects; the array is a pool, # shared by the reading objects inside the pool. When a reading is @@ -63,7 +68,11 @@ sub text { # Wrapper function around 'label' attribute. my $self = shift; if( @_ ) { - $self->set_attribute( 'label', $_[0] ); + if( defined $_[0] ) { + $self->set_attribute( 'label', $_[0] ); + } else { + $self->del_attribute( 'label' ); + } } return $self->label; } diff --git a/lib/Text/Tradition/Parser/TEI.pm b/lib/Text/Tradition/Parser/TEI.pm index 68172f7..0af2e64 100644 --- a/lib/Text/Tradition/Parser/TEI.pm +++ b/lib/Text/Tradition/Parser/TEI.pm @@ -33,6 +33,7 @@ my $text = {}; # Hash of arrays, one per eventual witness we find. my $substitutions = {}; # Keep track of merged readings my $app_anchors = {}; # Track apparatus references my $app_ac = {}; # Save a.c. readings +my $app_count; # Keep track of how many apps we have # Create the package variables for tag names. @@ -76,12 +77,16 @@ sub parse { my $source = $wit_el->toString(); $tradition->add_witness( sigil => $sig, source => $source ); } - map { $text->{$_->sigil} = [] } $tradition->witnesses; + # Look for all word/seg node IDs and note their pre-existence. my @attrs = $xpc->findnodes( "//$W|$SEG/attribute::xml:id" ); save_preexisting_nodeids( @attrs ); + # Count up how many apps we have. + my @apps = $xpc->findnodes( "//$APP" ); + $app_count = scalar( @apps ); + # Now go through the children of the text element and pull out the # actual text. foreach my $xml_el ( $xpc->findnodes( "//$TEXT" ) ) { @@ -197,6 +202,7 @@ sub _return_rdg { { my @active_wits; my $current_app; + my $seen_apps; sub _get_readings { my( $tradition, $xn, $in_var, $ac, @cur_wits ) = @_; @@ -249,6 +255,7 @@ sub _return_rdg { push( @{$text->{$_}}, $rdg ) unless $ac; } } elsif ( $xn->nodeName eq 'app' ) { + $seen_apps++; $current_app = $xn->getAttribute( 'xml:id' ); # print STDERR "Handling app $current_app\n"; # Keep the reading sets in this app. @@ -310,6 +317,14 @@ sub _return_rdg { #print STDERR "Handling witEnd\n"; my $regexp = '^(' . join( '|', @cur_wits ) . ')$'; @active_wits = grep { $_ !~ /$regexp/ } @active_wits; + # Record a lacuna, unless this is the last app. + unless( $seen_apps == $app_count ) { + foreach my $i ( 0 .. $#cur_wits ) { + my $w = $cur_wits[$i]; + my $l = $tradition->collation->add_lacuna( $current_app . "_$i" ); + push( @{$text->{$w}}, $l ); + } + } } elsif( $xn->nodeName eq 'witDetail' ) { # Ignore these for now. return; diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index 10d0730..16e2863 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -30,7 +30,8 @@ sub parse { my( $tradition, $tab_str ) = @_; # TODO Allow setting of sep_char my $c = $tradition->collation; # shorthand - my $csv = Text::CSV_XS->new( { binary => 1 } ); # binary for UTF-8 + my $csv = Text::CSV_XS->new( { binary => 1, # binary for UTF-8 + sep_char => "\t" } ); my @lines = split( "\n", $tab_str ); # Conveniently, we are basically receiving exactly the sort of alignment table # we might want to produce later. May as well save it. @@ -69,10 +70,32 @@ sub parse { } } - # Push the end node onto all paths. + + # Collapse our lacunae into a single node and + # push the end node onto all paths. $c->end->rank( scalar @$alignment_table ); foreach my $wit ( @witnesses ) { - push( @{$wit->path}, $c->end ); + my $p = $wit->path; + my $last_rdg = shift @$p; + my $new_p = [ $last_rdg ]; + foreach my $rdg ( @$p ) { + if( $rdg->text eq '#LACUNA#' ) { + # If we are in a lacuna already, drop this node. + # Otherwise make a lacuna node and drop this node. + unless( $last_rdg->is_lacuna ) { + my $l = $c->add_lacuna( $rdg->name ); + $l->rank( $rdg->rank ); + push( @$new_p, $l ); + $last_rdg = $l; + } + $c->del_reading( $rdg ); + } else { + # No lacuna, save the reading. + push( @$new_p, $rdg ); + } + } + push( @$new_p, $c->end ); + $wit->path( $new_p ); } # Join up the paths. diff --git a/lib/Text/Tradition/Stemma.pm b/lib/Text/Tradition/Stemma.pm index d4466aa..5928016 100644 --- a/lib/Text/Tradition/Stemma.pm +++ b/lib/Text/Tradition/Stemma.pm @@ -54,7 +54,9 @@ sub convert_characters { # This is a simple algorithm that treats every reading as different. # Eventually we will want to be able to specify how relationships # affect the character matrix. - my %unique = ( '__UNDEF__' => 'X' ); + my %unique = ( '__UNDEF__' => 'X', + '#LACUNA#' => '?', + ); my $ctr = 0; foreach my $word ( @$row ) { if( $word && !exists $unique{$word} ) { @@ -87,6 +89,8 @@ sub run_pars { # Set up a temporary directory for all the default Phylip files. my $phylip_dir = File::Temp->newdir(); + print STDERR $phylip_dir . "\n"; + # $phylip_dir->unlink_on_destroy(0); # We need an infile, and we need a command input file. open( MATRIX, ">$phylip_dir/infile" ) or die "Could not write $phylip_dir/infile"; print MATRIX $self->pars_input();