retain original order of collation sets

[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm
diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm

index e0e6ec2..76e4f36 100644 (file)
--- a/lib/Text/Tradition/Parser/BaseText.pm
+++ b/lib/Text/Tradition/Parser/BaseText.pm
@@ -3,7 +3,9 @@ package Text::Tradition::Parser::BaseText;
 use strict;
 use warnings;
 use Module::Load;
-use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
+use TryCatch;
+use Text::Tradition::Parser::Util qw( collate_variants cmp_str 
+       check_for_repeated add_hash_entry );
 
 =head1 NAME
 
@@ -17,7 +19,7 @@ merge_base( $graph, 'reference.txt', @apparatus_entries )
 =head1 DESCRIPTION
 
 For an overview of the package, see the documentation for the
-Text::Tradition::Graph module.
+Text::Tradition module.
 
 This module is meant for use with certain of the other Parser classes
 - whenever a list of variants is given with reference to a base text,
@@ -31,9 +33,9 @@ will join those listed variants onto the reference text.
 
 =item B<parse>
 
-parse( $graph, %opts );
+parse( $graph, $opts );
 
-Takes an initialized graph and a set of options, which must include:
+Takes an initialized graph and a hashref of options, which must include:
 - 'base' - the base text referenced by the variants
 - 'format' - the format of the variant list
 - 'data' - the variants, in the given format.
@@ -45,8 +47,9 @@ sub parse {
 
     my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
     load( $format_mod );
-    my @apparatus_entries = $format_mod->can('read')->( $opts->{'file'} );
-    merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
+    # TODO Handle a string someday if we ever have a format other than KUL
+    my @apparatus_entries = $format_mod->can('read')->( $opts );
+    merge_base( $tradition->collation, $opts, @apparatus_entries );
 }
 
 =item B<merge_base>
@@ -84,11 +87,10 @@ my $edits_required = {};
 # edits_required -> wit -> [ { start_idx, end_idx, items } ]
 
 sub merge_base {
-    my( $collation, $base_file, @app_entries ) = @_;
-    my @base_line_starts = read_base( $base_file, $collation );
+    my( $collation, $opts, @app_entries ) = @_;
+    my @base_line_starts = read_base( $opts->{'base'}, $collation );
 
     my %all_witnesses;
-    my @unwitnessed_lemma_nodes;
     foreach my $app ( @app_entries ) {
         my( $line, $num ) = split( /\./, $app->{_id} );
         # DEBUG with a short graph
@@ -112,12 +114,12 @@ sub merge_base {
         my %seen;
         while( $lemma_start ne $too_far ) {
             # Loop detection
-            if( $seen{ $lemma_start->name() } ) {
-                warn "Detected loop at " . $lemma_start->name() . 
+            if( $seen{ $lemma_start->id() } ) {
+                warn "Detected loop at " . $lemma_start->id() . 
                     ", ref $line,$num";
                 last;
             }
-            $seen{ $lemma_start->name() } = 1;
+            $seen{ $lemma_start->id() } = 1;
             
             # Try to match the lemma.
             my $unmatch = 0;
@@ -168,7 +170,7 @@ sub merge_base {
         my @lemma_set = $collation->reading_sequence( $lemma_start, 
                                                       $lemma_end );
         my @reading_sets = [ @lemma_set ];
-
+        
         # For each reading that is not rdg_0, we create the variant
         # reading nodes, and store the range as an edit operation on
         # the base text.
@@ -177,11 +179,6 @@ sub merge_base {
         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 
-            # Keep track of lemma nodes that don't actually appear in
-            # any MSS; we will want to remove them from the collation.
-            push( @unwitnessed_lemma_nodes, @lemma_set )
-                if !@mss && $k eq 'rdg_0';
-
             # Keep track of what witnesses we have seen.
             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
             # Keep track of which witnesses bear corrected readings here.
@@ -201,8 +198,9 @@ sub merge_base {
             my $ctr = 0;
             foreach my $vw ( @variant ) {
                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
-                my $vwreading = $collation->add_reading( $vwname );
-                $vwreading->text( $vw );
+                my $vwreading = $collation->add_reading( {
+                       'id' => $vwname,
+                       'text' => $vw } );
                 push( @variant_readings, $vwreading );
             }
 
@@ -225,7 +223,7 @@ sub merge_base {
         foreach my $rkey ( keys %$variant_objects ) {
             # Object is argument list for splice, so:
             # offset, length, replacements
-            my $edit_object = [ $lemma_start->name,
+            my $edit_object = [ $lemma_start->id,
                                 scalar( @lemma_set ),
                                 $variant_objects->{$rkey}->{reading} ];
             foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
@@ -251,7 +249,8 @@ sub merge_base {
     # Now make the witness objects, and create their text sequences
     foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
         print STDERR "Creating witness $w\n";
-        my $witness_obj = $collation->tradition->add_witness( sigil => $w );
+        my $witness_obj = $collation->tradition->add_witness( 
+               sigil => $w, sourcetype => 'collation' );
         my $debug; #  = $w eq 'Vb11';
         my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
         my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
@@ -277,83 +276,35 @@ sub merge_base {
     # ones we have created so far.  Also remove any unwitnessed
     # lemma nodes (TODO unless we are treating base as witness)
     foreach ( $collation->paths() ) {
-        $collation->del_path( $_ );
-    }
-    foreach( @unwitnessed_lemma_nodes ) {
-        $collation->del_reading( $_ );
-        # TODO do we need to delete any relationship paths here?
+        $collation->del_path( $_, $collation->baselabel );
     }
 
     ### HACKY HACKY Do some one-off path corrections here.
-    if( $collation->linear ) {
-       my $c = $collation;
-       my $end = $SHORTEND ? $SHORTEND : 155;
-       # Vb11
-       my $path;
-       if( $end > 16 ) {
-           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
-           $path = $c->tradition->witness('Vb11')->path;
-           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
-           $path = $c->tradition->witness('Vb11')->uncorrected_path;
-           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
-       }
-       # What else?
-       # Vb26:
-       $path = $c->tradition->witness('Vb26')->path;
-       splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46;
-       # Vb13:
-       $path = $c->tradition->witness('Vb13')->path;
-       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
-       $path = $c->tradition->witness('Vb13')->uncorrected_path;
-       splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
-       # Vb20 a.c.: 
-       $path = $c->tradition->witness('Vb20')->uncorrected_path;
-       splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
-       # Vb5:
-       $path = $c->tradition->witness('Vb5')->path;
-       splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106;
-       # extraneous:
-       $c->del_reading( 'rdg_2/147.6.13' );
-       $c->del_reading( 'rdg_2/147.6.14' );
-       $c->del_reading( 'rdg_2/147.6.15' );
-       
-    } else {
-       my $c = $collation;
-       my $end = $SHORTEND ? $SHORTEND : 155;
-       # Vb5:
-       my $path = $c->tradition->witness('Vb5')->path;
-       splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
-       # Vb11: 
-       $path = $c->tradition->witness('Vb11')->path;
-       if( $end > 16 ) {
-           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
-           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
-       }
-       # Vb13:
-       $path = $c->tradition->witness('Vb13')->path;
-       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
-       $path = $c->tradition->witness('Vb13')->uncorrected_path;
-       splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
-       # Vb20 a.c.: 
-       $path = $c->tradition->witness('Vb20')->uncorrected_path;
-       splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
-       # Vb26: 
-       $path = $c->tradition->witness('Vb26')->path;
-       splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
-    }
-
+    if( $opts->{'input'} eq 'KUL' ) {
+               require 'data/boodts/s158.HACK';
+               KUL::HACK::pre_path_hack( $collation );
+       }
+       
     # Now walk paths and calculate positional rank.
     $collation->make_witness_paths();
+    # Now delete any orphaned readings.
+       foreach my $r ( $collation->sequence->isolated_vertices ) {
+               print STDERR "Deleting unconnected reading $r / " . 
+                       $collation->reading( $r )->text . "\n";
+               $collation->del_reading( $r );
+       }
+       
+    KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
     # Have to check relationship validity at this point, because before that
     # we had no paths.
 #     foreach my $rel ( $collation->relationships ) {
 #         next unless $rel->equal_rank;
 #         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
 #             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
-#                             $rel->type, $rel->from->name, $rel->to->name );
+#                             $rel->type, $rel->from->id, $rel->to->id );
 #         }
 #     }
-    $collation->calculate_ranks();
+    $collation->calculate_common_readings(); # will implicitly rank
 }
 
 =item B<read_base>
@@ -373,8 +324,8 @@ sub read_base {
     
     # This array gives the first reading for each line.  We put the
     # common starting point in line zero.
-    my $last_reading = $collation->start();
-    $base_text_index{$last_reading->name} = 0;
+    my $last_reading = $collation->start;
+    $base_text_index{$last_reading->id} = 0;
     my $lineref_array = [ $last_reading ]; # There is no line zero.
 
     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
@@ -392,8 +343,7 @@ sub read_base {
         last if $SHORTEND && $lineref > $SHORTEND;
         foreach my $w ( @words ) {
             my $readingref = join( ',', $lineref, ++$wordref );
-            my $reading = $collation->add_reading( $readingref );
-            $reading->text( $w );
+            my $reading = $collation->add_reading( { id => $readingref, text => $w } );
             unless( $started ) {
                 push( @$lineref_array, $reading );
                 $started = 1;
@@ -413,7 +363,7 @@ sub read_base {
     # Ending point for all texts
     $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
     push( @$lineref_array, $collation->end );
-    $base_text_index{$collation->end->name} = $i;
+    $base_text_index{$collation->end->id} = $i;
 
     return( @$lineref_array );
 }
@@ -441,14 +391,24 @@ sub set_relationships {
                 $labels{cmp_str( $r )} = $r;
             }
             foreach my $r( @$var ) {
-                if( exists $labels{$r->label} &&
-                    $r->name ne $labels{$r->label}->name ) {
+                if( exists $labels{$r->text} &&
+                    $r->id ne $labels{$r->text}->id ) {
                     if( $type eq 'repetition' ) {
                         # Repetition
-                        $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
+                        try {
+                               $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
+                        } catch( Text::Tradition::Error $e ) {
+                               warn "Could not set repetition relationship $r -> " 
+                                       . $labels{$r->text} . ": " . $e->message;
+                        }
                     } else {
                         # Transposition
-                        $r->set_identical( $labels{$r->label} );
+                       try {
+                                       $r->set_identical( $labels{$r->text} );
+                        } catch( Text::Tradition::Error $e ) {
+                               warn "Could not set transposition relationship $r -> " 
+                                       . $labels{$r->text} . ": " . $e->message;
+                        }
                     }
                 }
             }
@@ -466,8 +426,13 @@ sub set_relationships {
             $rel_options{'equal_rank'} = 1;
             if( @$lemma == @$var ) {
                 foreach my $i ( 0 .. $#{$lemma} ) {
-                    $collation->add_relationship( $var->[$i], $lemma->[$i],
-                        \%rel_options );
+                       try {
+                                               $collation->add_relationship( $var->[$i], $lemma->[$i],
+                                                       \%rel_options );
+                                       } catch( Text::Tradition::Error $e ) {
+                                               warn "Could not set $type relationship " . $var->[$i] . " -> " 
+                                                       . $lemma->[$i] . ": " . $e->message;
+                                       }
                 } 
             } else {
                 # An uneven many-to-many mapping.  Skip for now.
@@ -475,9 +440,9 @@ sub set_relationships {
                 # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
                 # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
                 # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
-                if( @$lemma == 1 && @$var == 1 ) {
-                    $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
-                }
+                # if( @$lemma == 1 && @$var == 1 ) {
+                #     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
+                # }
             }
         } elsif( $type !~ /^(add|om|lex)$/i ) {
             warn "Unrecognized type $type";
@@ -489,15 +454,15 @@ sub set_relationships {
 
 sub apply_edits {
     my( $collation, $edit_sequence, $debug ) = @_;
-    my @lemma_text = $collation->reading_sequence( $collation->start,
-                                           $collation->reading( '#END#' ) );
+    my @lemma_text = $collation->reading_sequence( 
+       $collation->start, $collation->end );
     my $drift = 0;
     foreach my $correction ( @$edit_sequence ) {
         my( $lemma_start, $length, $items ) = @$correction;
         my $offset = $base_text_index{$lemma_start};
         my $realoffset = $offset + $drift;
         if( $debug ||
-            $lemma_text[$realoffset]->name ne $lemma_start ) {
+            $lemma_text[$realoffset]->id ne $lemma_start ) {
             my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
             my @base_phrase;
             my $i = $realoffset;
@@ -510,23 +475,23 @@ sub apply_edits {
             
             print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
                                   "with %s (%s) with drift %d\n",
-                                  join( ' ', map {$_->label} @base_phrase ),
-                                  join( ' ', map {$_->name} @base_phrase ),
+                                  join( ' ', map {$_->text} @base_phrase ),
+                                  join( ' ', map {$_->id} @base_phrase ),
                                   $realoffset,
-                                  join( ' ', map {$_->label} @$items ),
-                                  join( ' ', map {$_->name} @$items ),
+                                  join( ' ', map {$_->text} @$items ),
+                                  join( ' ', map {$_->id} @$items ),
                                   $drift,
                                   ) if $debug;
                                   
-            if( $lemma_text[$realoffset]->name ne $lemma_start ) {
+            if( $lemma_text[$realoffset]->id ne $lemma_start ) {
                 warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
                                "but %s (%s) is there instead", 
-                               join( ' ', map {$_->label} @base_phrase ),
-                               join( ' ', map {$_->name} @base_phrase ),
-                               join( ' ', map {$_->label} @$items ),
-                               join( ' ', map {$_->name} @$items ),
-                               join( ' ', map {$_->label} @this_phrase ),
-                               join( ' ', map {$_->name} @this_phrase ),
+                               join( ' ', map {$_->text} @base_phrase ),
+                               join( ' ', map {$_->id} @base_phrase ),
+                               join( ' ', map {$_->text} @$items ),
+                               join( ' ', map {$_->id} @$items ),
+                               join( ' ', map {$_->text} @this_phrase ),
+                               join( ' ', map {$_->id} @this_phrase ),
                       ) );
                 # next;
             }