X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FTabular.pm;h=a9ce519ac9f003e90bbf8ec1ffec0638b9d36094;hb=9bdf9d67ee6007981f4cc59e844f02de4df7a289;hp=4c1e5114238612b79917d0fb555d7ce00e41a3f7;hpb=1d3104950074a3d7470f01ef0ec8e9046d95b124;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/Tabular.pm b/lib/Text/Tradition/Parser/Tabular.pm index 4c1e511..a9ce519 100644 --- a/lib/Text/Tradition/Parser/Tabular.pm +++ b/lib/Text/Tradition/Parser/Tabular.pm @@ -2,7 +2,7 @@ package Text::Tradition::Parser::Tabular; use strict; use warnings; -use Text::CSV_XS; +use Text::CSV; =head1 NAME @@ -68,11 +68,49 @@ is( ref( $t ), 'Text::Tradition', "Parsed florilegium CSV file" ); ### TODO Check these figures if( $t ) { - is( scalar $t->collation->readings, 312, "Collation has all readings" ); - is( scalar $t->collation->paths, 363, "Collation has all paths" ); + is( scalar $t->collation->readings, 311, "Collation has all readings" ); + is( scalar $t->collation->paths, 361, "Collation has all paths" ); is( scalar $t->witnesses, 13, "Collation has all witnesses" ); } +# Check that we have the right witnesses +my %seen_wits; +map { $seen_wits{$_} = 0 } qw/ A B C D E F G H K P Q S T /; +foreach my $wit ( $t->witnesses ) { + $seen_wits{$wit->sigil} = 1; +} +is( scalar keys %seen_wits, 13, "No extra witnesses were made" ); +foreach my $k ( keys %seen_wits ) { + ok( $seen_wits{$k}, "Witness $k still exists" ); +} + +# Check that the witnesses have the right texts +foreach my $wit ( $t->witnesses ) { + my $origtext = join( ' ', @{$wit->text} ); + my $graphtext = $t->collation->path_text( $wit->sigil ); + is( $graphtext, $origtext, "Collation matches original for witness " . $wit->sigil ); +} + +# Check that the a.c. witnesses have the right text +map { $seen_wits{$_} = 0 } qw/ A B C D F G H K S /; +foreach my $k ( keys %seen_wits ) { + my $wit = $t->witness( $k ); + if( $seen_wits{$k} ) { + ok( $wit->is_layered, "Witness $k got marked as layered" ); + ok( $wit->has_layertext, "Witness $k has an a.c. version" ); + my $origtext = join( ' ', @{$wit->layertext} ); + my $acsig = $wit->sigil . $t->collation->ac_label; + my $graphtext = $t->collation->path_text( $acsig ); + is( $graphtext, $origtext, "Collation matches original a.c. for witness $k" ); + } else { + ok( !$wit->is_layered, "Witness $k not marked as layered" ); + ok( !$wit->has_layertext, "Witness $k has no a.c. version" ); + } +} + +# Check that we only have collation relationships where we need them +is( scalar $t->collation->relationships, 3, "Redundant collations were removed" ); + =end testing =cut @@ -80,10 +118,14 @@ if( $t ) { sub parse { my( $tradition, $opts ) = @_; my $c = $tradition->collation; # shorthand - my $csv = Text::CSV_XS->new( { - binary => 1, # binary for UTF-8 - sep_char => exists $opts->{'sep_char'} ? $opts->{'sep_char'} : "\t" } - ); + my $csv_options = { 'binary' => 1 }; + $csv_options->{'sep_char'} = $opts->{'sep_char'} || "\t"; + if( $csv_options->{'sep_char'} eq "\t" ) { + # If it is really tab separated, nothing is an escape char. + $csv_options->{'quote_char'} = undef; + $csv_options->{'escape_char'} = undef; + } + my $csv = Text::CSV->new( $csv_options ); my $alignment_table; if( exists $opts->{'string' } ) { @@ -111,22 +153,41 @@ sub parse { # Set up the witnesses we find in the first line my @witnesses; - my %ac_wits; # Track these for later removal + my %ac_wits; # Track layered witness -> main witness mapping + my $aclabel = $c->ac_label; foreach my $sigil ( @{$alignment_table->[0]} ) { - my $wit = $tradition->add_witness( 'sigil' => $sigil ); + if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { + # Sanitize the sigil name to an XML name + $sigil = $1 . '_layered'; + $ac_wits{$sigil} = $1; + } + my $wit = $tradition->add_witness( + 'sigil' => $sigil, 'sourcetype' => 'collation' ); $wit->path( [ $c->start ] ); push( @witnesses, $wit ); my $aclabel = $c->ac_label; - if( $sigil =~ /^(.*)\Q$aclabel\E$/ ) { - $ac_wits{$1} = $wit; - } } + # Save the original witness text sequences. Have to loop back through + # the witness columns after we have identified all the a.c. witnesses. + foreach my $idx ( 0 .. $#{$alignment_table->[0]} ) { + my @sequence = map { $_->[$idx] } @{$alignment_table}; + my $sigil = shift @sequence; + my $is_layer = exists( $ac_wits{$sigil} ); + my $wit = $tradition->witness( $is_layer ? $ac_wits{$sigil} : $sigil ); + # Now get rid of gaps and meta-readings like #LACUNA# + my @words = grep { $_ && $_ !~ /^\#.*\#$/ } @sequence; + $is_layer ? $wit->layertext( \@words ) : $wit->text( \@words ); + } + + my $nocollate = ( scalar( @witnesses ) * scalar @$alignment_table ) > 150000; + print STDERR "Tradition too big for row collation\n" if $nocollate; + # Now for the next rows, make nodes as necessary, assign their ranks, and # add them to the witness paths. foreach my $idx ( 1 .. $#{$alignment_table} ) { my $row = $alignment_table->[$idx]; - my $nodes = make_nodes( $c, $row, $idx ); + my $nodes = _make_nodes( $c, $row, $idx, $nocollate ); foreach my $w ( 0 .. $#{$row} ) { # push the appropriate node onto the appropriate witness path my $word = $row->[$w]; @@ -146,30 +207,11 @@ sub parse { my $last_rdg = shift @$p; my $new_p = [ $last_rdg ]; foreach my $rdg ( @$p ) { - $DB::single = 1 if $rdg->id eq '228,1'; - if( $rdg->text eq '#LACUNA#' ) { - # If we are in a lacuna already, drop this node. - # Otherwise make a lacuna node and drop this node. - unless( $last_rdg->is_lacuna ) { - my $l_id = 'l' . $rdg->id; - my $l; - if( $c->has_reading( $l_id ) ) { - $l = $c->reading( $l_id ); - } else { - $l = $c->add_reading( { - 'collation' => $c, - 'id' => $l_id, - 'is_lacuna' => 1, - } ); - } - push( @$new_p, $l ); - $last_rdg = $l; - } - $c->del_reading( $rdg ); - } else { - # No lacuna, save the reading. - push( @$new_p, $rdg ); - } + # Omit the reading if we are in a lacuna already. + next if $rdg->is_lacuna && $last_rdg->is_lacuna; + # Save the reading otherwise. + push( @$new_p, $rdg ); + $last_rdg = $rdg; } push( @$new_p, $c->end ); $wit->path( $new_p ); @@ -178,35 +220,88 @@ sub parse { # Fold any a.c. witnesses into their main witness objects, and # delete the independent a.c. versions. foreach my $a ( keys %ac_wits ) { - my $main_wit = $tradition->witness( $a ); + my $ac_wit = $tradition->witness( $a ); + my $main_wit = $tradition->witness( $ac_wits{$a} ); next unless $main_wit; - my $ac_wit = $ac_wits{$a}; + $main_wit->is_layered(1); $main_wit->uncorrected_path( $ac_wit->path ); $tradition->del_witness( $ac_wit ); } - + # Join up the paths. $c->make_witness_paths; + # Delete our unused lacuna nodes. + foreach my $rdg ( grep { $_->is_lacuna } $c->readings ) { + $c->del_reading( $rdg ) unless $c->reading_witnesses( $rdg ); + } + + # Do a consistency check. + foreach my $wit ( $tradition->witnesses ) { + my $pathtext = $c->path_text( $wit->sigil ); + my $origtext = join( ' ', @{$wit->text} ); + warn "Text differs for witness " . $wit->sigil + unless $pathtext eq $origtext; + if( $wit->is_layered ) { + $pathtext = $c->path_text( $wit->sigil.$c->ac_label ); + $origtext = join( ' ', @{$wit->layertext} ); + warn "Ante-corr text differs for witness " . $wit->sigil + unless $pathtext eq $origtext; + } else { + warn "Text " . $wit->sigil . " has a layered text but is not marked as layered" + if $wit->has_layertext; + } + } + + # Note that our ranks and common readings are set. + $c->_graphcalc_done(1); + # Remove redundant collation relationships. + $c->relations->filter_collations() unless $nocollate; } -sub make_nodes { - my( $collation, $row, $index ) = @_; +sub _make_nodes { + my( $collation, $row, $index, $nocollate ) = @_; my %unique; + my $commonctr = 0; # Holds the number of unique readings + gaps, ex. lacunae. foreach my $w ( @$row ) { $unique{$w} = 1 if $w; + $commonctr +=1 unless ( $w && $w eq '#LACUNA#' ); } my $ctr = 1; foreach my $w ( keys %unique ) { my $rargs = { - 'collation' => $collation, - 'id' => "$index,$ctr", + 'id' => "r$index.$ctr", 'rank' => $index, 'text' => $w, }; + if( $w eq '#LACUNA#' ) { + $rargs->{'is_lacuna'} = 1; + } elsif( $commonctr == 1 ) { + $rargs->{'is_common'} = 1; + } my $r = $collation->add_reading( $rargs ); $unique{$w} = $r; $ctr++; } + # Collate this sequence of readings via a single 'collation' relationship. + unless( $nocollate ) { + my @rankrdgs = values %unique; + my $collation_rel; + while( @rankrdgs ) { + my $r = shift @rankrdgs; + next if $r->is_meta; + foreach my $nr ( @rankrdgs ) { + next if $nr->is_meta; + if( $collation_rel ) { + $collation->add_relationship( $r, $nr, $collation_rel ); + } else { + $collation->add_relationship( $r, $nr, + { 'type' => 'collated', + 'annotation' => "Parsed together for rank $index" } ); + $collation_rel = $collation->get_relationship( $r, $nr ); + } + } + } + } return \%unique; }