X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FParser%2FBaseText.pm;h=e07cdec4f037aceb16d2648d20031bff2d95bb04;hb=861c3e272c65c7553ad7c03cca51cbdd561f126c;hp=a8ea38ff48444b4fa7a49787a122ecace5014b4f;hpb=910a0a6d9f858731358772a45e52817b039cf019;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index a8ea38f..e07cdec 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -17,7 +17,7 @@ merge_base( $graph, 'reference.txt', @apparatus_entries ) =head1 DESCRIPTION For an overview of the package, see the documentation for the -Text::Tradition::Graph module. +Text::Tradition module. This module is meant for use with certain of the other Parser classes - whenever a list of variants is given with reference to a base text, @@ -31,9 +31,9 @@ will join those listed variants onto the reference text. =item B -parse( $graph, %opts ); +parse( $graph, $opts ); -Takes an initialized graph and a set of options, which must include: +Takes an initialized graph and a hashref of options, which must include: - 'base' - the base text referenced by the variants - 'format' - the format of the variant list - 'data' - the variants, in the given format. @@ -41,12 +41,13 @@ Takes an initialized graph and a set of options, which must include: =cut sub parse { - my( $tradition, %opts ) = @_; + my( $tradition, $opts ) = @_; - my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; + my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'}; load( $format_mod ); - my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); - merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries ); + # TODO Handle a string someday if we ever have a format other than KUL + my @apparatus_entries = $format_mod->can('read')->( $opts ); + merge_base( $tradition->collation, $opts, @apparatus_entries ); } =item B @@ -84,11 +85,10 @@ my $edits_required = {}; # edits_required -> wit -> [ { start_idx, end_idx, items } ] sub merge_base { - my( $collation, $base_file, @app_entries ) = @_; - my @base_line_starts = read_base( $base_file, $collation ); + my( $collation, $opts, @app_entries ) = @_; + my @base_line_starts = read_base( $opts->{'base'}, $collation ); my %all_witnesses; - my @unwitnessed_lemma_nodes; foreach my $app ( @app_entries ) { my( $line, $num ) = split( /\./, $app->{_id} ); # DEBUG with a short graph @@ -112,12 +112,12 @@ sub merge_base { my %seen; while( $lemma_start ne $too_far ) { # Loop detection - if( $seen{ $lemma_start->name() } ) { - warn "Detected loop at " . $lemma_start->name() . + if( $seen{ $lemma_start->id() } ) { + warn "Detected loop at " . $lemma_start->id() . ", ref $line,$num"; last; } - $seen{ $lemma_start->name() } = 1; + $seen{ $lemma_start->id() } = 1; # Try to match the lemma. my $unmatch = 0; @@ -168,7 +168,7 @@ sub merge_base { my @lemma_set = $collation->reading_sequence( $lemma_start, $lemma_end ); my @reading_sets = [ @lemma_set ]; - + # For each reading that is not rdg_0, we create the variant # reading nodes, and store the range as an edit operation on # the base text. @@ -177,11 +177,6 @@ sub merge_base { foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { my @mss = grep { $app->{$_} eq $k } keys( %$app ); - # Keep track of lemma nodes that don't actually appear in - # any MSS; we will want to remove them from the collation. - push( @unwitnessed_lemma_nodes, @lemma_set ) - if !@mss && $k eq 'rdg_0'; - # Keep track of what witnesses we have seen. @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); # Keep track of which witnesses bear corrected readings here. @@ -201,8 +196,9 @@ sub merge_base { my $ctr = 0; foreach my $vw ( @variant ) { my $vwname = "$k/$line.$num.$ctr"; $ctr++; - my $vwreading = $collation->add_reading( $vwname ); - $vwreading->text( $vw ); + my $vwreading = $collation->add_reading( { + 'id' => $vwname, + 'text' => $vw } ); push( @variant_readings, $vwreading ); } @@ -225,7 +221,7 @@ sub merge_base { foreach my $rkey ( keys %$variant_objects ) { # Object is argument list for splice, so: # offset, length, replacements - my $edit_object = [ $lemma_start->name, + my $edit_object = [ $lemma_start->id, scalar( @lemma_set ), $variant_objects->{$rkey}->{reading} ]; foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) { @@ -277,80 +273,32 @@ sub merge_base { # ones we have created so far. Also remove any unwitnessed # lemma nodes (TODO unless we are treating base as witness) foreach ( $collation->paths() ) { - $collation->del_path( $_ ); - } - foreach( @unwitnessed_lemma_nodes ) { - $collation->del_reading( $_ ); - # TODO do we need to delete any relationship paths here? + $collation->del_path( $_, $collation->baselabel ); } ### HACKY HACKY Do some one-off path corrections here. - if( $collation->linear ) { - my $c = $collation; - my $end = $SHORTEND ? $SHORTEND : 155; - # Vb11 - my $path; - if( $end > 16 ) { - $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); - $path = $c->tradition->witness('Vb11')->path; - splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); - $path = $c->tradition->witness('Vb11')->uncorrected_path; - splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) ); - } - # What else? - # Vb26: - $path = $c->tradition->witness('Vb26')->path; - splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46; - # Vb13: - $path = $c->tradition->witness('Vb13')->path; - splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; - $path = $c->tradition->witness('Vb13')->uncorrected_path; - splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58; - # Vb20 a.c.: - $path = $c->tradition->witness('Vb20')->uncorrected_path; - splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94; - # Vb5: - $path = $c->tradition->witness('Vb5')->path; - splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106; - # extraneous: - $c->del_reading( 'rdg_2/147.6.13' ); - $c->del_reading( 'rdg_2/147.6.14' ); - $c->del_reading( 'rdg_2/147.6.15' ); - - } else { - my $c = $collation; - my $end = $SHORTEND ? $SHORTEND : 155; - # Vb5: - my $path = $c->tradition->witness('Vb5')->path; - splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106; - # Vb11: - $path = $c->tradition->witness('Vb11')->path; - if( $end > 16 ) { - $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') ); - splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) ); - } - # Vb13: - $path = $c->tradition->witness('Vb13')->path; - splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58; - $path = $c->tradition->witness('Vb13')->uncorrected_path; - splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58; - # Vb20 a.c.: - $path = $c->tradition->witness('Vb20')->uncorrected_path; - splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94; - # Vb26: - $path = $c->tradition->witness('Vb26')->path; - splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46; - } - + if( $opts->{'input'} eq 'KUL' ) { + require 'data/boodts/s158.HACK'; + KUL::HACK::pre_path_hack( $collation ); + } + # Now walk paths and calculate positional rank. - my @common_readings = $collation->make_witness_paths(); + $collation->make_witness_paths(); + # Now delete any orphaned readings. + foreach my $r ( $collation->sequence->isolated_vertices ) { + print STDERR "Deleting unconnected reading $r / " . + $collation->reading( $r )->text . "\n"; + $collation->del_reading( $r ); + } + + KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL'; # Have to check relationship validity at this point, because before that # we had no paths. # foreach my $rel ( $collation->relationships ) { # next unless $rel->equal_rank; # unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) { # warn sprintf( "Relationship type %s between %s and %s is invalid, deleting", -# $rel->type, $rel->from->name, $rel->to->name ); +# $rel->type, $rel->from->id, $rel->to->id ); # } # } $collation->calculate_ranks(); @@ -373,8 +321,8 @@ sub read_base { # This array gives the first reading for each line. We put the # common starting point in line zero. - my $last_reading = $collation->start(); - $base_text_index{$last_reading->name} = 0; + my $last_reading = $collation->start; + $base_text_index{$last_reading->id} = 0; my $lineref_array = [ $last_reading ]; # There is no line zero. open( BASE, $base_file ) or die "Could not open file $base_file: $!"; @@ -392,8 +340,7 @@ sub read_base { last if $SHORTEND && $lineref > $SHORTEND; foreach my $w ( @words ) { my $readingref = join( ',', $lineref, ++$wordref ); - my $reading = $collation->add_reading( $readingref ); - $reading->text( $w ); + my $reading = $collation->add_reading( { id => $readingref, text => $w } ); unless( $started ) { push( @$lineref_array, $reading ); $started = 1; @@ -413,7 +360,7 @@ sub read_base { # Ending point for all texts $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); push( @$lineref_array, $collation->end ); - $base_text_index{$collation->end->name} = $i; + $base_text_index{$collation->end->id} = $i; return( @$lineref_array ); } @@ -441,14 +388,14 @@ sub set_relationships { $labels{cmp_str( $r )} = $r; } foreach my $r( @$var ) { - if( exists $labels{$r->label} && - $r->name ne $labels{$r->label}->name ) { + if( exists $labels{$r->text} && + $r->id ne $labels{$r->text}->id ) { if( $type eq 'repetition' ) { # Repetition - $collation->add_relationship( $r, $labels{$r->label}, \%rel_options ); + $collation->add_relationship( $r, $labels{$r->text}, \%rel_options ); } else { # Transposition - $r->set_identical( $labels{$r->label} ); + $r->set_identical( $labels{$r->text} ); } } } @@ -489,15 +436,15 @@ sub set_relationships { sub apply_edits { my( $collation, $edit_sequence, $debug ) = @_; - my @lemma_text = $collation->reading_sequence( $collation->start, - $collation->reading( '#END#' ) ); + my @lemma_text = $collation->reading_sequence( + $collation->start, $collation->end ); my $drift = 0; foreach my $correction ( @$edit_sequence ) { my( $lemma_start, $length, $items ) = @$correction; my $offset = $base_text_index{$lemma_start}; my $realoffset = $offset + $drift; if( $debug || - $lemma_text[$realoffset]->name ne $lemma_start ) { + $lemma_text[$realoffset]->id ne $lemma_start ) { my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1]; my @base_phrase; my $i = $realoffset; @@ -510,23 +457,23 @@ sub apply_edits { print STDERR sprintf( "Trying to replace %s (%s) starting at %d " . "with %s (%s) with drift %d\n", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), + join( ' ', map {$_->text} @base_phrase ), + join( ' ', map {$_->id} @base_phrase ), $realoffset, - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), + join( ' ', map {$_->text} @$items ), + join( ' ', map {$_->id} @$items ), $drift, ) if $debug; - if( $lemma_text[$realoffset]->name ne $lemma_start ) { + if( $lemma_text[$realoffset]->id ne $lemma_start ) { warn( sprintf( "Should be replacing %s (%s) with %s (%s) " . "but %s (%s) is there instead", - join( ' ', map {$_->label} @base_phrase ), - join( ' ', map {$_->name} @base_phrase ), - join( ' ', map {$_->label} @$items ), - join( ' ', map {$_->name} @$items ), - join( ' ', map {$_->label} @this_phrase ), - join( ' ', map {$_->name} @this_phrase ), + join( ' ', map {$_->text} @base_phrase ), + join( ' ', map {$_->id} @base_phrase ), + join( ' ', map {$_->text} @$items ), + join( ' ', map {$_->id} @$items ), + join( ' ', map {$_->text} @this_phrase ), + join( ' ', map {$_->id} @this_phrase ), ) ); # next; }