[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

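parse( $tradition, $opts );

Takes an initialized tradition object and a reference to a hash of
options, which must include:
- 'base' - the file containing the base text referenced by the variants
- 'input' - the format of the variant list; this names the
  Text::Tradition::Parser::* module whose read() function is used
- 'file' - the variants, in the given format, as passed to that read()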

=cut

# Entry point: read the apparatus entries with the format-specific parser
# module and merge them onto the base text.
#
# Arguments:
#   $tradition - object whose collation() receives the merged text
#   $opts      - hash reference with the keys 'input' (format name, appended
#                to 'Text::Tradition::Parser::' to find the parser module),
#                'file' (passed to that module's read()) and 'base' (the
#                base text file handed to merge_base)
#
# Dies if no input format is given, if the parser module cannot be loaded,
# or if the parser module has no read() function.
sub parse {
    my( $tradition, $opts ) = @_;

    # Fail early with a readable message rather than trying to load the
    # incomplete module name 'Text::Tradition::Parser::'.
    die "No 'input' format given for the apparatus data"
        unless defined $opts->{'input'} && length $opts->{'input'};

    my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
    load( $format_mod );

    # can() returns undef when the module lacks read(); calling that
    # directly would die with an unhelpful "undefined value as a CODE
    # reference" error.
    my $reader = $format_mod->can('read')
        or die "$format_mod does not provide a read() function";

    my @apparatus_entries = $reader->( $opts->{'file'} );
    merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed

# Maps a reading name to its position in the base text sequence; filled in
# by read_base and consulted by apply_edits to locate splice offsets.
my %base_text_index;
# Pending edits per witness key, filled in by merge_base.
my $edits_required = {};

# edits_required -> witness key -> list of
#     [ name of first lemma reading, number of lemma readings, [ replacement readings ] ]
# Post-correction edits for a witness are stored under the key "<sigil>_post".

# Merge a list of apparatus entries onto the base text found in $base_file.
#
# Arguments:
#   $collation   - collation object that receives the readings and paths
#   $base_file   - file containing the base (reference) text
#   @app_entries - apparatus entries, one hash reference each, in the form
#                  described in the POD for this function
#
# For every entry the lemma is located in the base text, reading objects are
# created for each variant, and a splice-style edit is recorded for every
# witness that carries the variant.  Afterwards one witness object is created
# per sigil, its path(s) are built by applying its edits to the base text,
# the temporary base-text paths are removed, and ranks are calculated.
sub merge_base {
    my( $collation, $base_file, @app_entries ) = @_;

    # The index and the edit table are file-scoped.  Start every merge with
    # empty ones so that a second call in the same process does not inherit
    # offsets, edits or witnesses from an earlier call.
    %base_text_index = ();
    $edits_required = {};

    my @base_line_starts = read_base( $base_file, $collation );

    my %all_witnesses;
    my @unwitnessed_lemma_nodes;
    foreach my $app ( @app_entries ) {
        # The entry id has the form "<line>.<number within line>".
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
        last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
        my $scrutinize = '';
        my $first_line_reading = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];
        
        my $lemma = $app->{rdg_0};
        my $seq = 1; 
        # Is this the Nth occurrence of this reading in the line?
        # A trailing digit (optionally preceded by an underscore) is
        # stripped from the lemma and used as the occurrence number.
        if( $lemma =~ s/(_)?(\d)$// ) {
            $seq = $2;
        }
        my @lemma_words = split( /\s+/, $lemma );
        
        # Now search for the lemma words within this line.
        my $lemma_start = $first_line_reading;
        my $lemma_end;
        my %seen;
        while( $lemma_start ne $too_far ) {
            # Loop detection
            if( $seen{ $lemma_start->name() } ) {
                warn "Detected loop at " . $lemma_start->name() . 
                    ", ref $line,$num";
                last;
            }
            $seen{ $lemma_start->name() } = 1;
            
            # Try to match the lemma.
            my $unmatch = 0;
            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
                $lemma_words[0] . "...\n"
                if "$line.$num" eq $scrutinize;
            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
                # Skip it if we need a match that is not the first.
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
                        my $next_reading = 
                            $collation->next_reading( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n", 
                                    cmp_str($next_reading), $w
                                if "$line.$num" eq $scrutinize;
                            if( $w ne cmp_str($next_reading) ) {
                                $unmatch = 1;
                                last;
                            } else {
                                $lemma_end = $next_reading;
                                $next_reading = 
                                    $collation->next_reading( $lemma_end );
                            }
                        }
                    } else {
                        $lemma_end = $lemma_start;
                    }
                } else {
                    $unmatch = 1;
                }
            }
            # Stop as soon as the whole lemma has matched; otherwise forget
            # any partial match and try again from the next reading.
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
            $lemma_start = $collation->next_reading( $lemma_start );
        }
        
        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
        }
        
        # Now we have found the lemma; we will record an 'edit', in
        # terms of a splice operation, for each subsequent reading.
        # We also note which witnesses take the given edit.

        my @lemma_set = $collation->reading_sequence( $lemma_start, 
                                                      $lemma_end );
        my @reading_sets = [ @lemma_set ];

        # For each reading that is not rdg_0, we create the variant
        # reading nodes, and store the range as an edit operation on
        # the base text.
        my $variant_objects;
        my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
            # The witnesses for a reading are the keys whose value is the
            # name of that reading (e.g. 'WitnessB' => 'rdg_1').
            my @mss = grep { $app->{$_} eq $k } keys( %$app );

            # Keep track of lemma nodes that don't actually appear in
            # any MSS; we will want to remove them from the collation.
            push( @unwitnessed_lemma_nodes, @lemma_set )
                if !@mss && $k eq 'rdg_0';

            # Keep track of what witnesses we have seen.
            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
            # Keep track of which witnesses bear corrected readings here.
            foreach my $m ( @mss ) {
                my $base = _is_post_corr( $m );
                next unless $base;
                $pc_seen{$base} = 1;
            }
            next if $k eq 'rdg_0';

            # Parse the variant into reading tokens.
            # TODO don't hardcode the reading split operation
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            
            # Variant readings are named "<rdg key>/<line>.<num>.<word index>".
            my @variant_readings;
            my $ctr = 0;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
                my $vwreading = $collation->add_reading( $vwname );
                $vwreading->text( $vw );
                push( @variant_readings, $vwreading );
            }

            $variant_objects->{$k} = { 'mss' => \@mss,
                                       'reading' => \@variant_readings,
            };
            push( @reading_sets, \@variant_readings );
        }

        # Now collate and collapse the identical readings within the
        # collated sets.  Modifies the reading sets that were passed.
        collate_variants( $collation, @reading_sets );

        # Record any stated relationships between the nodes and the lemma.
        set_relationships( $collation, $app, \@lemma_set, $variant_objects );

        # Now create the splice-edit objects that will be used
        # to reconstruct each witness.

        foreach my $rkey ( keys %$variant_objects ) {
            # Object is argument list for splice, so:
            # offset, length, replacements
            my $edit_object = [ $lemma_start->name,
                                scalar( @lemma_set ),
                                $variant_objects->{$rkey}->{reading} ];
            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
                # Is this a p.c. entry?
                my $base = _is_post_corr( $ms );
                if( $base ) { # this is a post-corr witness
                    my $pc_key = $base . "_post";
                    add_hash_entry( $edits_required, $pc_key, $edit_object );
                } else { # this is an ante-corr witness
                    my $pc_key = $ms . "_post";
                    add_hash_entry( $edits_required, $ms, $edit_object );
                    unless( $pc_seen{$ms} ) {
                        # If this witness carries no correction, add this 
                        # same object to its post-corrected state.
                        add_hash_entry( $edits_required, $pc_key, 
                                         $edit_object );
                    }
                }
            }
        }
    } # Finished going through the apparatus entries

    # Now make the witness objects, and create their text sequences
    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
        print STDERR "Creating witness $w\n";
        my $witness_obj = $collation->tradition->add_witness( sigil => $w );
        my $debug; #  = $w eq 'Vb11';
        my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
        # Declare unconditionally and assign conditionally: the behaviour of
        # 'my ... if COND' is undefined according to perlsyn.
        my @post_corr_seq;
        @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
            if exists( $edits_required->{$w."_post"} );

        my @repeated = check_for_repeated( @ante_corr_seq );
        warn "Repeated elements @repeated in $w a.c."
            if @repeated;
        @repeated = check_for_repeated( @post_corr_seq );
        warn "Repeated elements @repeated in $w p.c."
            if @repeated;

        # Now save these paths in my witness object
        if( @post_corr_seq ) {
            $witness_obj->path( \@post_corr_seq );
            $witness_obj->uncorrected_path( \@ante_corr_seq );
        } else {
            $witness_obj->path( \@ante_corr_seq );
        }
    }

    # Now remove our 'base text' edges, which is to say, the only
    # ones we have created so far.  Also remove any unwitnessed
    # lemma nodes (TODO unless we are treating base as witness)
    foreach ( $collation->paths() ) {
        $collation->del_path( $_ );
    }
    foreach( @unwitnessed_lemma_nodes ) {
        $collation->del_reading( $_ );
        # TODO do we need to delete any relationship paths here?
    }

    ### HACKY HACKY Do some one-off path corrections here.
    # NOTE(review): the witness sigils, reading names and array offsets
    # below are hard-coded, presumably for one specific data set; they will
    # fail on input that lacks these witnesses - confirm before reuse.
    if( $collation->linear ) {
       my $c = $collation;
       my $end = $SHORTEND ? $SHORTEND : 155;
       # Vb11
       my $path;
       if( $end > 16 ) {
           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
           $path = $c->tradition->witness('Vb11')->path;
           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
           $path = $c->tradition->witness('Vb11')->uncorrected_path;
           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
       }
       # What else?
       # Vb26:
       $path = $c->tradition->witness('Vb26')->path;
       splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46;
       # Vb13:
       $path = $c->tradition->witness('Vb13')->path;
       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
       $path = $c->tradition->witness('Vb13')->uncorrected_path;
       splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
       # Vb20 a.c.: 
       $path = $c->tradition->witness('Vb20')->uncorrected_path;
       splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
       # Vb5:
       $path = $c->tradition->witness('Vb5')->path;
       splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106;
       # extraneous:
       $c->del_reading( 'rdg_2/147.6.13' );
       $c->del_reading( 'rdg_2/147.6.14' );
       $c->del_reading( 'rdg_2/147.6.15' );
       
    } else {
       my $c = $collation;
       my $end = $SHORTEND ? $SHORTEND : 155;
       # Vb5:
       my $path = $c->tradition->witness('Vb5')->path;
       splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
       # Vb11: 
       $path = $c->tradition->witness('Vb11')->path;
       if( $end > 16 ) {
           $c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
           splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
       }
       # Vb13:
       $path = $c->tradition->witness('Vb13')->path;
       splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
       $path = $c->tradition->witness('Vb13')->uncorrected_path;
       splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
       # Vb20 a.c.: 
       $path = $c->tradition->witness('Vb20')->uncorrected_path;
       splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
       # Vb26: 
       $path = $c->tradition->witness('Vb26')->path;
       splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
    }

    # Now walk paths and calculate positional rank.
    $collation->make_witness_paths();
    # Have to check relationship validity at this point, because before that
    # we had no paths.
#     foreach my $rel ( $collation->relationships ) {
#         next unless $rel->equal_rank;
#         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
#             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
#                             $rel->type, $rel->from->name, $rel->to->name );
#         }
#     }
    $collation->calculate_ranks();
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

# Load the base text into the collation as a simple chain of readings.
#
# Arguments:
#   $base_file - file containing the base text, one text line per line
#   $collation - collation object that receives the readings and paths
#
# Returns a list of readings: element 0 is the collation's start reading,
# element N is the first reading of the Nth line that contains words, and
# the final element is the collation's end reading.
#
# Side effects: records each reading's position in %base_text_index, and
# joins the readings with paths labelled with the collation's baselabel.
# Dies if the file cannot be opened.
sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start();
    $base_text_index{$last_reading->name} = 0;
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    # Three-argument open with a lexical handle: the file name can never be
    # read as a mode or a command, and no global handle is left behind.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";
    my $i = 1;
    while( my $text_line = <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp $text_line;
        my @words = split( ' ', $text_line );
        my $started = 0;
        my $wordref = 0;
        # The line number is the number of entries recorded so far, because
        # index 0 holds the start reading.  NOTE: a line without any words
        # adds no entry, so blank lines are not counted.
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            # Base readings are named "<line>,<word number within line>".
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( $readingref );
            $reading->text( $w );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading, 
                                  $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close $base_fh;
    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->name} = $i;

    return( @$lineref_array );
}

# Record the relationships that an apparatus entry states between its
# variant readings and the lemma.
#
# Arguments:
#   $collation - collation object; add_relationship() is called on it
#   $app       - the apparatus entry (hash reference).  For a reading key
#                such as 'rdg_1' the keys '_rdg_1_type', '_rdg_1_non_corr'
#                and '_rdg_1_non_indep' are consulted.
#   $lemma     - array reference of the lemma readings
#   $variants  - hash reference: reading key -> { 'reading' => [ readings ] }
#
# Types handled: inv / tr / rep (transposition or repetition), gr / sp /
# spel (grammatical or spelling).  add / om / lex are accepted and ignored;
# anything else, including a missing type, produces a warning.
sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        # An entry may carry no type for this reading.  Treat that as an
        # empty, unrecognized type rather than pattern-matching undef.
        $type = '' unless defined $type;
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
        
        # The two flags are only passed on when they are a single non-zero
        # digit ('0' fails the truth test before the pattern is tried).
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
        
        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r( @$var ) {
                if( exists $labels{$r->label} &&
                    $r->name ne $labels{$r->label}->name ) {
                    if( $type eq 'repetition' ) {
                        # Repetition
                        $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
                    } else {
                        # Transposition
                        $r->set_identical( $labels{$r->label} );
                    }
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            $type = 'grammatical' if $type =~ /gr/i;
            $type = 'spelling' if $type =~ /sp/i;
            # $type = 'lexical' if $type =~ /lex/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                # Same number of readings: relate them pairwise, in order.
                foreach my $i ( 0 .. $#{$lemma} ) {
                    $collation->add_relationship( $var->[$i], $lemma->[$i],
                        \%rel_options );
                } 
            } else {
                # An uneven many-to-many mapping.  Skip for now.
                # We really want to make a segment out of whatever we have.
                # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
                # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
                # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
            }
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
        

# Build the reading sequence of one witness by applying its edits to the
# base text.
#
# Arguments:
#   $collation     - collation object holding the base text readings
#   $edit_sequence - array reference of edits, each of the form
#                    [ name of first lemma reading, length, [ replacements ] ]
#   $debug         - when true, a trace line is printed to STDERR per edit
#
# Returns the resulting list of readings.  A warning is issued when the
# reading found at the computed position is not the expected lemma start;
# the edit is applied regardless.
sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;
    my @lemma_text = $collation->reading_sequence( $collation->start,
                                           $collation->reading( '#END#' ) );

    # Small formatters for the diagnostic messages below.
    my $labels_of = sub { join( ' ', map {$_->label} @_ ) };
    my $names_of  = sub { join( ' ', map {$_->name} @_ ) };

    # Net change in sequence length caused by the edits applied so far; it
    # converts a base-text index into an index into @lemma_text.
    my $drift = 0;
    foreach my $edit ( @$edit_sequence ) {
        my( $start_name, $span, $replacements ) = @$edit;
        my $base_offset = $base_text_index{$start_name};
        my $position = $base_offset + $drift;

        if( $debug ||
            $lemma_text[$position]->name ne $start_name ) {
            # What is actually at the target position right now...
            my @found = @lemma_text[$position..$position+$span-1];

            # ...and what the base text holds from the lemma start onwards.
            my @expected;
            my $cursor = $collation->reading( $start_name );
            foreach my $step ( 1 .. $span ) {
                push( @expected, $cursor );
                $cursor = $collation->next_reading( $cursor );
            }

            if( $debug ) {
                my $trace = sprintf( "Trying to replace %s (%s) starting at %d " .
                                     "with %s (%s) with drift %d\n",
                                     $labels_of->( @expected ),
                                     $names_of->( @expected ),
                                     $position,
                                     $labels_of->( @$replacements ),
                                     $names_of->( @$replacements ),
                                     $drift,
                                     );
                print STDERR $trace;
            }

            if( $lemma_text[$position]->name ne $start_name ) {
                warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
                               "but %s (%s) is there instead", 
                               $labels_of->( @expected ),
                               $names_of->( @expected ),
                               $labels_of->( @$replacements ),
                               $names_of->( @$replacements ),
                               $labels_of->( @found ),
                               $names_of->( @found ),
                      ) );
                # next;
            }
        }

        splice( @lemma_text, $position, $span, @$replacements );
        $drift += @$replacements - $span;
    }
    return @lemma_text;
}
        

# Helper function.  Given a witness sigil, return the base witness sigil if
# the sigil ends in a post-correction marker ("p.c.", optionally in
# parentheses and with optional whitespace).  Otherwise return undef.
sub _is_post_corr {
    my( $sigil ) = @_;
    return $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ? $1 : undef;
}


=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
910a0a6d	6	use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
b49c4318	7
2ceca8c3	8	=head1 NAME
	9
	10	Text::Tradition::Parser::BaseText
	11
	12	=head1 SYNOPSIS
	13
	14	use Text::Tradition::Parser::BaseText qw( merge_base );
	15	merge_base( $graph, 'reference.txt', @apparatus_entries )
	16
	17	=head1 DESCRIPTION
	18
	19	For an overview of the package, see the documentation for the
	20	Text::Tradition::Graph module.
	21
	22	This module is meant for use with certain of the other Parser classes
	23	- whenever a list of variants is given with reference to a base text,
	24	these must be joined into a single collation. The parser should
	25	therefore make a list of variants and their locations, and BaseText
	26	will join those listed variants onto the reference text.
	27
	28	=head1 SUBROUTINES
	29
	30	=over
	31
52ce987f	32	=item B<parse>
	33
	34	parse( $graph, %opts );
	35
	36	Takes an initialized graph and a set of options, which must include:
	37	- 'base' - the base text referenced by the variants
	38	- 'format' - the format of the variant list
	39	- 'data' - the variants, in the given format.
	40
	41	=cut
	42
	43	sub parse {
dfc37e38	44	my( $tradition, $opts ) = @_;
52ce987f	45
dfc37e38	46	my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f	47	load( $format_mod );
dfc37e38	48	my @apparatus_entries = $format_mod->can('read')->( $opts->{'file'} );
dfc37e38	49	merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
52ce987f	50	}
52ce987f	51
2ceca8c3	52	=item B<merge_base>
	53
	54	merge_base( $graph, 'reference.txt', @apparatus_entries )
	55
	56	Takes three arguments: a newly-initialized Text::Tradition::Graph
	57	object, a text file containing the reference text, and a list of
	58	variants (apparatus entries). Adds the base text to the graph, and
	59	joins the variants to that.
	60
	61	The list of variants is an array of hash references; each hash takes
	62	the form
	63	{ '_id' => line reference,
	64	'rdg_0' => lemma reading,
	65	'rdg_1' => first variant,
	66	... # and so on until all distinct readings are listed
	67	'WitnessA' => 'rdg_0',
	68	'WitnessB' => 'rdg_1',
	69	... # and so on until all witnesses are listed with their readings
	70	}
	71
	72	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	73	begin with an underscore is assumed to be a witness name. Any 'meta'
	74	information to be passed must be passed in a key with a leading
	75	underscore in its name.
	76
	77	=cut
	78
b15511bf	79	my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca	80
4ca00eca	81	my %base_text_index;
6a222840	82	my $edits_required = {};
4ca00eca	83
4ca00eca	84	# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666	85
b49c4318	86	sub merge_base {
e2902068	87	my( $collation, $base_file, @app_entries ) = @_;
e2902068	88	my @base_line_starts = read_base( $base_file, $collation );
b49c4318	89
52ce987f	90	my %all_witnesses;
6a222840	91	my @unwitnessed_lemma_nodes;
b49c4318	92	foreach my $app ( @app_entries ) {
910a0a6d	93	my( $line, $num ) = split( /\./, $app->{_id} );
	94	# DEBUG with a short graph
	95	last if $SHORTEND && $line > $SHORTEND;
	96	# DEBUG for problematic entries
	97	my $scrutinize = '';
	98	my $first_line_reading = $base_line_starts[ $line ];
	99	my $too_far = $base_line_starts[ $line+1 ];
	100
	101	my $lemma = $app->{rdg_0};
	102	my $seq = 1;
	103	# Is this the Nth occurrence of this reading in the line?
	104	if( $lemma =~ s/(_)?(\d)$// ) {
	105	$seq = $2;
	106	}
	107	my @lemma_words = split( /\s+/, $lemma );
	108
	109	# Now search for the lemma words within this line.
	110	my $lemma_start = $first_line_reading;
	111	my $lemma_end;
	112	my %seen;
	113	while( $lemma_start ne $too_far ) {
	114	# Loop detection
	115	if( $seen{ $lemma_start->name() } ) {
	116	warn "Detected loop at " . $lemma_start->name() .
	117	", ref $line,$num";
	118	last;
	119	}
	120	$seen{ $lemma_start->name() } = 1;
	121
	122	# Try to match the lemma.
	123	my $unmatch = 0;
	124	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	125	$lemma_words[0] . "...\n"
	126	if "$line.$num" eq $scrutinize;
	127	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	128	# Skip it if we need a match that is not the first.
	129	if( --$seq < 1 ) {
	130	# Now we have to compare the rest of the words here.
	131	if( scalar( @lemma_words ) > 1 ) {
	132	my $next_reading =
	133	$collation->next_reading( $lemma_start );
	134	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	135	printf STDERR "Now matching %s against %s\n",
	136	cmp_str($next_reading), $w
	137	if "$line.$num" eq $scrutinize;
	138	if( $w ne cmp_str($next_reading) ) {
	139	$unmatch = 1;
	140	last;
	141	} else {
	142	$lemma_end = $next_reading;
	143	$next_reading =
	144	$collation->next_reading( $lemma_end );
	145	}
	146	}
	147	} else {
	148	$lemma_end = $lemma_start;
	149	}
	150	} else {
	151	$unmatch = 1;
	152	}
	153	}
	154	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	155	$lemma_end = undef;
	156	$lemma_start = $collation->next_reading( $lemma_start );
157	}
158
159	unless( $lemma_end ) {
160	warn "No match found for @lemma_words at $line.$num";
161	next;
162	}
163
164	# Now we have found the lemma; we will record an 'edit', in
165	# terms of a splice operation, for each subsequent reading.
166	# We also note which witnesses take the given edit.
167
168	my @lemma_set = $collation->reading_sequence( $lemma_start,
169	$lemma_end );
170	my @reading_sets = [ @lemma_set ];
171
172	# For each reading that is not rdg_0, we create the variant
173	# reading nodes, and store the range as an edit operation on
174	# the base text.
175	my $variant_objects;
176	my %pc_seen; # Keep track of mss with explicit post-corr data
177	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
178	my @mss = grep { $app->{$_} eq $k } keys( %$app );
179
180	# Keep track of lemma nodes that don't actually appear in
181	# any MSS; we will want to remove them from the collation.
182	push( @unwitnessed_lemma_nodes, @lemma_set )
183	if !@mss && $k eq 'rdg_0';
184
185	# Keep track of what witnesses we have seen.
186	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
187	# Keep track of which witnesses bear corrected readings here.
188	foreach my $m ( @mss ) {
189	my $base = _is_post_corr( $m );
190	next unless $base;
191	$pc_seen{$base} = 1;
192	}
193	next if $k eq 'rdg_0';
194
195	# Parse the variant into reading tokens.
196	# TODO don't hardcode the reading split operation
197	my @variant = split( /\s+/, $app->{$k} );
198	@variant = () if $app->{$k} eq '/'; # This is an omission.
199
200	my @variant_readings;
201	my $ctr = 0;
202	foreach my $vw ( @variant ) {
203	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
204	my $vwreading = $collation->add_reading( $vwname );
205	$vwreading->text( $vw );
206	push( @variant_readings, $vwreading );
207	}
208
209	$variant_objects->{$k} = { 'mss' => \@mss,
210	'reading' => \@variant_readings,
211	};
212	push( @reading_sets, \@variant_readings );
213	}
214
215	# Now collate and collapse the identical readings within the
216	# collated sets. Modifies the reading sets that were passed.
217	collate_variants( $collation, @reading_sets );
218
219	# Record any stated relationships between the nodes and the lemma.
220	set_relationships( $collation, $app, \@lemma_set, $variant_objects );
221
222	# Now create the splice-edit objects that will be used
223	# to reconstruct each witness.
224
225	foreach my $rkey ( keys %$variant_objects ) {
226	# Object is argument list for splice, so:
227	# offset, length, replacements
228	my $edit_object = [ $lemma_start->name,
229	scalar( @lemma_set ),
230	$variant_objects->{$rkey}->{reading} ];
231	foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
232	# Is this a p.c. entry?
233	my $base = _is_post_corr( $ms );
234	if( $base ) { # this is a post-corr witness
235	my $pc_key = $base . "_post";
236	add_hash_entry( $edits_required, $pc_key, $edit_object );
237	} else { # this is an ante-corr witness
238	my $pc_key = $ms . "_post";
239	add_hash_entry( $edits_required, $ms, $edit_object );
240	unless( $pc_seen{$ms} ) {
241	# If this witness carries no correction, add this
242	# same object to its post-corrected state.
243	add_hash_entry( $edits_required, $pc_key,
244	$edit_object );
245	}
246	}
247	}
248	}
4ca00eca	249	} # Finished going through the apparatus entries
	250
	251	# Now make the witness objects, and create their text sequences
6a222840	252	foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d	253	print STDERR "Creating witness $w\n";
	254	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
	255	my $debug; # = $w eq 'Vb11';
	256	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
	257	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
	258	if exists( $edits_required->{$w."_post"} );
	259
	260	my @repeated = check_for_repeated( @ante_corr_seq );
	261	warn "Repeated elements @repeated in $w a.c."
	262	if @repeated;
	263	@repeated = check_for_repeated( @post_corr_seq );
	264	warn "Repeated elements @repeated in $w p.c."
	265	if @repeated;
	266
	267	# Now save these paths in my witness object
	268	if( @post_corr_seq ) {
	269	$witness_obj->path( \@post_corr_seq );
	270	$witness_obj->uncorrected_path( \@ante_corr_seq );
	271	} else {
	272	$witness_obj->path( \@ante_corr_seq );
	273	}
b49c4318	274	}
e2902068	275
6a222840	276	# Now remove our 'base text' edges, which is to say, the only
1ed3973e	277	# ones we have created so far. Also remove any unwitnessed
1ed3973e	278	# lemma nodes (TODO unless we are treating base as witness)
6a222840	279	foreach ( $collation->paths() ) {
910a0a6d	280	$collation->del_path( $_ );
6a222840	281	}
6a222840	282	foreach( @unwitnessed_lemma_nodes ) {
910a0a6d	283	$collation->del_reading( $_ );
910a0a6d	284	# TODO do we need to delete any relationship paths here?
6a222840	285	}
4ca00eca	286
b15511bf	287	### HACKY HACKY Do some one-off path corrections here.
b15511bf	288	if( $collation->linear ) {
910a0a6d	289	my $c = $collation;
	290	my $end = $SHORTEND ? $SHORTEND : 155;
	291	# Vb11
	292	my $path;
	293	if( $end > 16 ) {
	294	$c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
	295	$path = $c->tradition->witness('Vb11')->path;
	296	splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
	297	$path = $c->tradition->witness('Vb11')->uncorrected_path;
	298	splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( 'rdg_1/16.2.2' ) );
	299	}
	300	# What else?
	301	# Vb26:
	302	$path = $c->tradition->witness('Vb26')->path;
	303	splice( @$path, 618, 0, $c->reading('rdg_1/46.1.1') ) if $end > 46;
	304	# Vb13:
	305	$path = $c->tradition->witness('Vb13')->path;
	306	splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
	307	$path = $c->tradition->witness('Vb13')->uncorrected_path;
	308	splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
	309	# Vb20 a.c.:
	310	$path = $c->tradition->witness('Vb20')->uncorrected_path;
	311	splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
	312	# Vb5:
	313	$path = $c->tradition->witness('Vb5')->path;
	314	splice( @$path, 1436, 0, $c->reading('rdg_1/106.5.1') ) if $end > 106;
	315	# extraneous:
	316	$c->del_reading( 'rdg_2/147.6.13' );
	317	$c->del_reading( 'rdg_2/147.6.14' );
	318	$c->del_reading( 'rdg_2/147.6.15' );
	319
b15511bf	320	} else {
910a0a6d	321	my $c = $collation;
	322	my $end = $SHORTEND ? $SHORTEND : 155;
	323	# Vb5:
	324	my $path = $c->tradition->witness('Vb5')->path;
	325	splice( @$path, 1436, 0, $c->reading('106,14') ) if $end > 106;
	326	# Vb11:
	327	$path = $c->tradition->witness('Vb11')->path;
	328	if( $end > 16 ) {
	329	$c->merge_readings( $c->reading('rdg_1/16.3.0'), $c->reading('rdg_1/16.2.1') );
	330	splice( @$path, 209, 2, $c->reading( 'rdg_1/16.3.0' ), $c->reading( '16,1' ) );
	331	}
	332	# Vb13:
	333	$path = $c->tradition->witness('Vb13')->path;
	334	splice( @$path, 782, 0, $c->reading( '58,5' ) ) if $end > 58;
	335	$path = $c->tradition->witness('Vb13')->uncorrected_path;
	336	splice( @$path, 758, 0, $c->reading( '58,5' ) ) if $end > 58;
	337	# Vb20 a.c.:
	338	$path = $c->tradition->witness('Vb20')->uncorrected_path;
	339	splice( @$path, 1251, 1, $c->reading( '94,4' ) ) if $end > 94;
	340	# Vb26:
	341	$path = $c->tradition->witness('Vb26')->path;
	342	splice( @$path, 618, 0, $c->reading('46,2') ) if $end > 46;
b15511bf	343	}
b15511bf	344
910a0a6d	345	# Now walk paths and calculate positional rank.
7e450e44	346	$collation->make_witness_paths();
910a0a6d	347	# Have to check relationship validity at this point, because before that
	348	# we had no paths.
	349	# foreach my $rel ( $collation->relationships ) {
	350	# next unless $rel->equal_rank;
	351	# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
	352	# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
	353	# $rel->type, $rel->from->name, $rel->to->name );
	354	# }
	355	# }
	356	$collation->calculate_ranks();
15d2d3df	357	}
15d2d3df	358
2ceca8c3	359	=item B<read_base>
2ceca8c3	360
e2902068	361	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	362
e2902068	363	Takes a text file and a (presumed empty) collation object, adds the
	364	words as simple linear readings to the collation, and returns a
	365	list of readings that represent the beginning of lines. This collation
	366	is now the starting point for application of apparatus entries in
	367	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	368
2ceca8c3	369	=cut
b49c4318	370
# Read the base text file into the collation as one linear chain of readings.
#
# Arguments:
#   $base_file - path of the file holding the base text
#   $collation - collation object; this sub calls its start, end,
#                add_reading, add_path and baselabel methods
#
# Returns a list in which element 0 is the collation's start reading,
# element N is the first reading of the Nth line that contains words
# (lines without words are not counted), and the last element is the
# collation's end reading.
#
# Side effects: every reading created here is named "LINE,WORD", is chained
# to its predecessor with a path labelled $collation->baselabel, and has
# its position in the sequence stored in the file-level %base_text_index.
# When the file-level $SHORTEND is set, reading stops as soon as the next
# line number would exceed it.  Dies if the file cannot be opened.
sub read_base {
    my( $base_file, $collation ) = @_;

    # This array gives the first reading for each line.  We put the
    # common starting point in slot zero, since there is no line zero.
    my $last_reading = $collation->start();
    $base_text_index{$last_reading->name} = 0;
    my $lineref_array = [ $last_reading ];

    # Three-argument open with a lexical handle: the file name can never be
    # interpreted as an open mode or a pipe, and the handle is not a global.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";

    my $i = 1;
    while( my $line = <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp $line;
        my @words = split( ' ', $line );
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;

        my $wordref = 0;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( $readingref );
            $reading->text( $w );

            # The first word of a line marks the beginning of that line.
            push( @$lineref_array, $reading ) if $wordref == 1;

            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading,
                                  $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction
            # splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close $base_fh;

    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end,
                          $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->name} = $i;

    return( @$lineref_array );
}
	420
# Record the relationships that an apparatus entry states between each
# variant reading sequence and the lemma.
#
# Arguments:
#   $collation - collation object; add_relationship is called on it
#   $app       - apparatus entry hashref; for each reading key RKEY the
#                entries "_RKEY_type", "_RKEY_non_corr" and
#                "_RKEY_non_indep" are consulted
#   $lemma     - arrayref of the lemma readings
#   $variants  - hashref keyed by reading key; each value is a hashref
#                whose 'reading' entry is an arrayref of variant readings
#
# Types handled (case-insensitively):
#   inv / tr      - transposition: same-label readings are set identical
#   rep           - repetition: a 'repetition' relationship is added
#   gr / sp / spel - 'grammatical' / 'spelling' relationships, added pairwise
#                   when lemma and variant have the same number of readings
#   add / om / lex - accepted, nothing recorded
# Anything else produces an "Unrecognized type" warning.
sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};

        # A missing type is reported as unrecognized below; normalising it
        # here avoids a string of uninitialized-value warnings on the way.
        $type = '' unless defined $type;

        # Only single-digit values are passed on as options.
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr
            if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep
            if $nonindep && $nonindep =~ /^\d$/;

        if( $type =~ /^(?:inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;

            # NOTE(review): the table is keyed by cmp_str() of the lemma
            # reading but looked up by the variant's raw label, so a match
            # needs the two to coincide -- confirm this is intended.
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r ( @$var ) {
                next unless exists $labels{$r->label};
                my $lemma_rdg = $labels{$r->label};
                next if $r->name eq $lemma_rdg->name;
                if( $type eq 'repetition' ) {
                    # Repetition
                    $collation->add_relationship( $r, $lemma_rdg,
                                                  \%rel_options );
                } else {
                    # Transposition
                    $r->set_identical( $lemma_rdg );
                }
            }
        } elsif( $type =~ /^(?:gr|sp(?:el)?)$/i ) {
            # Grammar/spelling: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.
            $type = $type =~ /gr/i ? 'grammatical' : 'spelling';
            # $type = 'lexical' if $type =~ /lex/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                foreach my $i ( 0 .. $#{$lemma} ) {
                    $collation->add_relationship( $var->[$i], $lemma->[$i],
                                                  \%rel_options );
                }
            }
            # Otherwise: an uneven many-to-many mapping.  Skip for now.
            # We really want to make a segment out of whatever we have.
            # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
            # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
            # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
        } elsif( $type !~ /^(?:add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
910a0a6d	487
15d2d3df	488
15d2d3df	489
# Build one witness's reading sequence by applying its edits to the base.
#
# Arguments:
#   $collation     - collation object; reading_sequence, start, reading and
#                    next_reading are called on it
#   $edit_sequence - arrayref of edits, each an arrayref of
#                    [ name of first lemma reading, lemma length,
#                      arrayref of replacement readings ]
#   $debug         - when true, print a trace line to STDERR for every edit
#
# Returns the resulting list of readings.
#
# Each edit's position is looked up in the file-level %base_text_index and
# shifted by $drift, the net length change of the edits applied so far.  If
# the reading found at that position is not the expected lemma start, a
# warning is issued and the splice is applied anyway.
sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;
    my @lemma_text = $collation->reading_sequence( $collation->start,
        $collation->reading( '#END#' ) );
    my $drift = 0;
    foreach my $correction ( @$edit_sequence ) {
        my( $lemma_start, $length, $items ) = @$correction;
        my $offset = $base_text_index{$lemma_start};
        my $realoffset = $offset + $drift;
        my $mismatch = $lemma_text[$realoffset]->name ne $lemma_start;
        if( $debug || $mismatch ) {
            # What is actually sitting at the splice position.  The slice
            # may run past the end of the sequence, so drop the undefined
            # slots rather than let the diagnostics themselves die.
            my @this_phrase = grep { defined $_ }
                @lemma_text[$realoffset..$realoffset+$length-1];

            # What the edit expects to find: $length readings walked from
            # the lemma start.
            my @base_phrase;
            my $base_rdg = $collation->reading( $lemma_start );
            foreach ( 1 .. $length ) {
                push( @base_phrase, $base_rdg );
                $base_rdg = $collation->next_reading( $base_rdg );
            }

            print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
                                  "with %s (%s) with drift %d\n",
                                  join( ' ', map {$_->label} @base_phrase ),
                                  join( ' ', map {$_->name} @base_phrase ),
                                  $realoffset,
                                  join( ' ', map {$_->label} @$items ),
                                  join( ' ', map {$_->name} @$items ),
                                  $drift,
                ) if $debug;

            if( $mismatch ) {
                warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
                               "but %s (%s) is there instead",
                               join( ' ', map {$_->label} @base_phrase ),
                               join( ' ', map {$_->name} @base_phrase ),
                               join( ' ', map {$_->label} @$items ),
                               join( ' ', map {$_->name} @$items ),
                               join( ' ', map {$_->label} @this_phrase ),
                               join( ' ', map {$_->name} @this_phrase ),
                      ) );
                # next;
            }
        }
        splice( @lemma_text, $realoffset, $length, @$items );
        $drift += @$items - $length;
    }
    return @lemma_text;
}
910a0a6d	539
4ca00eca	540
# Helper function.  Given a witness sigil, if it is a post-correctionem
# sigil (the base sigil followed by "p.c.", optionally parenthesised and
# optionally with whitespace, e.g. "Vb11 (p.c.)" or "Vb11 p. c."), return
# the base witness sigil.  If not, return undef.
sub _is_post_corr {
    my( $sigil ) = @_;
    # The base sigil may be any length, and the whitespace before the
    # marker is optional.
    if( $sigil =~ /^(.*?)\s*\(?p\.\s*c\.\)?$/ ) {
        return $1;
    }
    # Explicit undef kept so that the return is a single value in any
    # calling context, as before.
    return undef;
}
	550
b49c4318	551
2ceca8c3	552	=back
	553
	554	=head1 LICENSE
	555
	556	This package is free software and is provided "as is" without express
	557	or implied warranty. You can redistribute it and/or modify it under
	558	the same terms as Perl itself.
	559
	560	=head1 AUTHOR
	561
	562	Tara L Andrews, aurum@cpan.org
	563
	564	=cut
	565
b49c4318	566	1;