[scpubgit/stemmatology.git] / base / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;
use TryCatch;
use Text::Tradition::Parser::Util qw( collate_variants cmp_str 
	check_for_repeated add_hash_entry );

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, $opts );

Takes an initialized graph and a hashref of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

sub parse {
    my( $tradition, $opts ) = @_;

    my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
    load( $format_mod );
    # TODO Handle a string someday if we ever have a format other than KUL
    my @apparatus_entries = $format_mod->can('read')->( $opts );
    merge_base( $tradition->collation, $opts, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed

my %base_text_index;
my $edits_required = {};

# edits_required -> wit -> [ { start_idx, end_idx, items } ]

sub merge_base {
    my( $collation, $opts, @app_entries ) = @_;
    my @base_line_starts = read_base( $opts->{'base'}, $collation );

    my %all_witnesses;
    foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
        last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
        my $scrutinize = '';
        my $first_line_reading = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];
        
        my $lemma = $app->{rdg_0};
        my $seq = 1; 
        # Is this the Nth occurrence of this reading in the line?
        if( $lemma =~ s/(_)?(\d)$// ) {
            $seq = $2;
        }
        my @lemma_words = split( /\s+/, $lemma );
        
        # Now search for the lemma words within this line.
        my $lemma_start = $first_line_reading;
        my $lemma_end;
        my %seen;
        while( $lemma_start ne $too_far ) {
            # Loop detection
            if( $seen{ $lemma_start->id() } ) {
                warn "Detected loop at " . $lemma_start->id() . 
                    ", ref $line,$num";
                last;
            }
            $seen{ $lemma_start->id() } = 1;
            
            # Try to match the lemma.
            my $unmatch = 0;
            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
                $lemma_words[0] . "...\n"
                if "$line.$num" eq $scrutinize;
            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
                # Skip it if we need a match that is not the first.
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
                        my $next_reading = 
                            $collation->next_reading( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n", 
                                    cmp_str($next_reading), $w
                                if "$line.$num" eq $scrutinize;
                            if( $w ne cmp_str($next_reading) ) {
                                $unmatch = 1;
                                last;
                            } else {
                                $lemma_end = $next_reading;
                                $next_reading = 
                                    $collation->next_reading( $lemma_end );
                            }
                        }
                    } else {
                        $lemma_end = $lemma_start;
                    }
                } else {
                    $unmatch = 1;
                }
            }
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
            $lemma_start = $collation->next_reading( $lemma_start );
        }
        
        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
        }
        
        # Now we have found the lemma; we will record an 'edit', in
        # terms of a splice operation, for each subsequent reading.
        # We also note which witnesses take the given edit.

        my @lemma_set = $collation->reading_sequence( $lemma_start, 
                                                      $lemma_end );
        my @reading_sets = [ @lemma_set ];
        
        # For each reading that is not rdg_0, we create the variant
        # reading nodes, and store the range as an edit operation on
        # the base text.
        my $variant_objects;
        my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
            my @mss = grep { $app->{$_} eq $k } keys( %$app );

            # Keep track of what witnesses we have seen.
            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
            # Keep track of which witnesses bear corrected readings here.
            foreach my $m ( @mss ) {
                my $base = _is_post_corr( $m );
                next unless $base;
                $pc_seen{$base} = 1;
            }
            next if $k eq 'rdg_0';

            # Parse the variant into reading tokens.
            # TODO don't hardcode the reading split operation
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            
            my @variant_readings;
            my $ctr = 0;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
                my $vwreading = $collation->add_reading( {
                	'id' => $vwname,
                	'text' => $vw } );
                push( @variant_readings, $vwreading );
            }

            $variant_objects->{$k} = { 'mss' => \@mss,
                                       'reading' => \@variant_readings,
            };
            push( @reading_sets, \@variant_readings );
        }

        # Now collate and collapse the identical readings within the
        # collated sets.  Modifies the reading sets that were passed.
        collate_variants( $collation, @reading_sets );

        # Record any stated relationships between the nodes and the lemma.
        set_relationships( $collation, $app, \@lemma_set, $variant_objects );

        # Now create the splice-edit objects that will be used
        # to reconstruct each witness.

        foreach my $rkey ( keys %$variant_objects ) {
            # Object is argument list for splice, so:
            # offset, length, replacements
            my $edit_object = [ $lemma_start->id,
                                scalar( @lemma_set ),
                                $variant_objects->{$rkey}->{reading} ];
            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
                # Is this a p.c. entry?
                my $base = _is_post_corr( $ms );
                if( $base ) { # this is a post-corr witness
                    my $pc_key = $base . "_post";
                    add_hash_entry( $edits_required, $pc_key, $edit_object );
                } else { # this is an ante-corr witness
                    my $pc_key = $ms . "_post";
                    add_hash_entry( $edits_required, $ms, $edit_object );
                    unless( $pc_seen{$ms} ) {
                        # If this witness carries no correction, add this 
                        # same object to its post-corrected state.
                        add_hash_entry( $edits_required, $pc_key, 
                                         $edit_object );
                    }
                }
            }
        }
    } # Finished going through the apparatus entries

    # Now make the witness objects, and create their text sequences
    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
        print STDERR "Creating witness $w\n";
        my $witness_obj = $collation->tradition->add_witness( 
        	sigil => $w, sourcetype => 'collation' );
        my $debug; #  = $w eq 'Vb11';
        my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
        my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
            if exists( $edits_required->{$w."_post"} );

        my @repeated = check_for_repeated( @ante_corr_seq );
        warn "Repeated elements @repeated in $w a.c."
            if @repeated;
        @repeated = check_for_repeated( @post_corr_seq );
        warn "Repeated elements @repeated in $w p.c."
            if @repeated;

        # Now save these paths in my witness object
        if( @post_corr_seq ) {
            $witness_obj->path( \@post_corr_seq );
            $witness_obj->uncorrected_path( \@ante_corr_seq );
        } else {
            $witness_obj->path( \@ante_corr_seq );
        }
    }

    # Now remove our 'base text' edges, which is to say, the only
    # ones we have created so far.  Also remove any unwitnessed
    # lemma nodes (TODO unless we are treating base as witness)
    foreach ( $collation->paths() ) {
        $collation->del_path( $_, $collation->baselabel );
    }

    ### HACKY HACKY Do some one-off path corrections here.
    if( $opts->{'input'} eq 'KUL' ) {
		require 'data/boodts/s158.HACK';
		KUL::HACK::pre_path_hack( $collation );
	}
	
    # Now walk paths and calculate positional rank.
    $collation->make_witness_paths();
    # Now delete any orphaned readings.
	foreach my $r ( $collation->sequence->isolated_vertices ) {
		print STDERR "Deleting unconnected reading $r / " . 
			$collation->reading( $r )->text . "\n";
		$collation->del_reading( $r );
	}
	
    KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
    # Have to check relationship validity at this point, because before that
    # we had no paths.
#     foreach my $rel ( $collation->relationships ) {
#         next unless $rel->equal_rank;
#         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
#             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
#                             $rel->type, $rel->from->id, $rel->to->id );
#         }
#     }
	unless( $opts->{'nocalc'} ) {
	    $collation->calculate_common_readings(); # will implicitly rank
	}
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start;
    $base_text_index{$last_reading->id} = 0;
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    my $i = 1;
    while(<BASE>) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp;
        my @words = split;
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( { id => $readingref, text => $w } );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            my $path = $collation->add_path( $last_reading, $reading, 
                                             $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close BASE;
    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->id} = $i;

    return( @$lineref_array );
}

sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
        
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
        
        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r( @$var ) {
                if( exists $labels{$r->text} &&
                    $r->id ne $labels{$r->text}->id ) {
                    if( $type eq 'repetition' ) {
                        # Repetition
                        try {
                        	$collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
                        } catch( Text::Tradition::Error $e ) {
                        	warn "Could not set repetition relationship $r -> " 
                        		. $labels{$r->text} . ": " . $e->message;
                        }
                    } else {
                        # Transposition
                    	try {
                       		$r->set_identical( $labels{$r->text} );
                        } catch( Text::Tradition::Error $e ) {
                        	warn "Could not set transposition relationship $r -> " 
                        		. $labels{$r->text} . ": " . $e->message;
                        }
                    }
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling/lexical: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            $type = 'grammatical' if $type =~ /gr/i;
            $type = 'spelling' if $type =~ /sp/i;
            $type = 'repetition' if $type =~ /rep/i;
            # $type = 'lexical' if $type =~ /lex/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                foreach my $i ( 0 .. $#{$lemma} ) {
                	try {
						$collation->add_relationship( $var->[$i], $lemma->[$i],
							\%rel_options );
					} catch( Text::Tradition::Error $e ) {
						warn "Could not set $type relationship " . $var->[$i] . " -> " 
							. $lemma->[$i] . ": " . $e->message;
					}
                } 
            } else {
                # An uneven many-to-many mapping.  Skip for now.
                # We really want to make a segment out of whatever we have.
                # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
                # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
                # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
                # if( @$lemma == 1 && @$var == 1 ) {
                #     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
                # }
            }
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
        

sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;
    my @lemma_text = $collation->reading_sequence( 
    	$collation->start, $collation->end );
    my $drift = 0;
    foreach my $correction ( @$edit_sequence ) {
        my( $lemma_start, $length, $items ) = @$correction;
        my $offset = $base_text_index{$lemma_start};
        my $realoffset = $offset + $drift;
        if( $debug ||
            $lemma_text[$realoffset]->id ne $lemma_start ) {
            my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
            my @base_phrase;
            my $i = $realoffset;
            my $l = $collation->reading( $lemma_start );
            while( $i < $realoffset+$length ) {
                push( @base_phrase, $l );
                $l = $collation->next_reading( $l );
                $i++;
            }
            
            print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
                                  "with %s (%s) with drift %d\n",
                                  join( ' ', map {$_->text} @base_phrase ),
                                  join( ' ', map {$_->id} @base_phrase ),
                                  $realoffset,
                                  join( ' ', map {$_->text} @$items ),
                                  join( ' ', map {$_->id} @$items ),
                                  $drift,
                                  ) if $debug;
                                  
            if( $lemma_text[$realoffset]->id ne $lemma_start ) {
                warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
                               "but %s (%s) is there instead", 
                               join( ' ', map {$_->text} @base_phrase ),
                               join( ' ', map {$_->id} @base_phrase ),
                               join( ' ', map {$_->text} @$items ),
                               join( ' ', map {$_->id} @$items ),
                               join( ' ', map {$_->text} @this_phrase ),
                               join( ' ', map {$_->id} @this_phrase ),
                      ) );
                # next;
            }
        }
        splice( @lemma_text, $realoffset, $length, @$items );
        $drift += @$items - $length;
    }
    return @lemma_text;
}
        

# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness.  If not, return a false value.
sub _is_post_corr {
    my( $sigil ) = @_;
    if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
        return $1;
    }
    return undef;
}


=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
63778331	6	use TryCatch;
	7	use Text::Tradition::Parser::Util qw( collate_variants cmp_str
	8	check_for_repeated add_hash_entry );
b49c4318	9
2ceca8c3	10	=head1 NAME
	11
	12	Text::Tradition::Parser::BaseText
	13
	14	=head1 SYNOPSIS
	15
	16	use Text::Tradition::Parser::BaseText qw( merge_base );
	17	merge_base( $graph, 'reference.txt', @apparatus_entries )
	18
	19	=head1 DESCRIPTION
	20
	21	For an overview of the package, see the documentation for the
408449b7	22	Text::Tradition module.
2ceca8c3	23
	24	This module is meant for use with certain of the other Parser classes
	25	- whenever a list of variants is given with reference to a base text,
	26	these must be joined into a single collation. The parser should
	27	therefore make a list of variants and their locations, and BaseText
	28	will join those listed variants onto the reference text.
	29
	30	=head1 SUBROUTINES
	31
	32	=over
	33
52ce987f	34	=item B<parse>
52ce987f	35
408449b7	36	parse( $graph, $opts );
52ce987f	37
408449b7	38	Takes an initialized graph and a hashref of options, which must include:
52ce987f	39	- 'base' - the base text referenced by the variants
	40	- 'format' - the format of the variant list
	41	- 'data' - the variants, in the given format.
	42
	43	=cut
	44
	45	sub parse {
dfc37e38	46	my( $tradition, $opts ) = @_;
52ce987f	47
dfc37e38	48	my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f	49	load( $format_mod );
408449b7	50	# TODO Handle a string someday if we ever have a format other than KUL
408449b7	51	my @apparatus_entries = $format_mod->can('read')->( $opts );
b0b4421a	52	merge_base( $tradition->collation, $opts, @apparatus_entries );
52ce987f	53	}
52ce987f	54
2ceca8c3	55	=item B<merge_base>
	56
	57	merge_base( $graph, 'reference.txt', @apparatus_entries )
	58
	59	Takes three arguments: a newly-initialized Text::Tradition::Graph
	60	object, a text file containing the reference text, and a list of
	61	variants (apparatus entries). Adds the base text to the graph, and
	62	joins the variants to that.
	63
	64	The list of variants is an array of hash references; each hash takes
	65	the form
	66	{ '_id' => line reference,
	67	'rdg_0' => lemma reading,
	68	'rdg_1' => first variant,
	69	... # and so on until all distinct readings are listed
	70	'WitnessA' => 'rdg_0',
	71	'WitnessB' => 'rdg_1',
	72	... # and so on until all witnesses are listed with their readings
	73	}
	74
	75	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	76	begin with an underscore is assumed to be a witness name. Any 'meta'
	77	information to be passed must be passed in a key with a leading
	78	underscore in its name.
	79
	80	=cut
	81
b15511bf	82	my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca	83
4ca00eca	84	my %base_text_index;
6a222840	85	my $edits_required = {};
4ca00eca	86
4ca00eca	87	# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666	88
b49c4318	89	sub merge_base {
b0b4421a	90	my( $collation, $opts, @app_entries ) = @_;
b0b4421a	91	my @base_line_starts = read_base( $opts->{'base'}, $collation );
b49c4318	92
52ce987f	93	my %all_witnesses;
b49c4318	94	foreach my $app ( @app_entries ) {
910a0a6d	95	my( $line, $num ) = split( /\./, $app->{_id} );
	96	# DEBUG with a short graph
	97	last if $SHORTEND && $line > $SHORTEND;
	98	# DEBUG for problematic entries
	99	my $scrutinize = '';
	100	my $first_line_reading = $base_line_starts[ $line ];
	101	my $too_far = $base_line_starts[ $line+1 ];
	102
	103	my $lemma = $app->{rdg_0};
	104	my $seq = 1;
	105	# Is this the Nth occurrence of this reading in the line?
	106	if( $lemma =~ s/(_)?(\d)$// ) {
	107	$seq = $2;
	108	}
	109	my @lemma_words = split( /\s+/, $lemma );
	110
	111	# Now search for the lemma words within this line.
	112	my $lemma_start = $first_line_reading;
	113	my $lemma_end;
	114	my %seen;
	115	while( $lemma_start ne $too_far ) {
	116	# Loop detection
49d4f2ac	117	if( $seen{ $lemma_start->id() } ) {
49d4f2ac	118	warn "Detected loop at " . $lemma_start->id() .
910a0a6d	119	", ref $line,$num";
	120	last;
	121	}
49d4f2ac	122	$seen{ $lemma_start->id() } = 1;
910a0a6d	123
	124	# Try to match the lemma.
	125	my $unmatch = 0;
	126	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	127	$lemma_words[0] . "...\n"
	128	if "$line.$num" eq $scrutinize;
	129	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	130	# Skip it if we need a match that is not the first.
	131	if( --$seq < 1 ) {
	132	# Now we have to compare the rest of the words here.
	133	if( scalar( @lemma_words ) > 1 ) {
	134	my $next_reading =
	135	$collation->next_reading( $lemma_start );
	136	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	137	printf STDERR "Now matching %s against %s\n",
	138	cmp_str($next_reading), $w
	139	if "$line.$num" eq $scrutinize;
	140	if( $w ne cmp_str($next_reading) ) {
	141	$unmatch = 1;
	142	last;
	143	} else {
	144	$lemma_end = $next_reading;
	145	$next_reading =
	146	$collation->next_reading( $lemma_end );
	147	}
	148	}
	149	} else {
	150	$lemma_end = $lemma_start;
	151	}
	152	} else {
	153	$unmatch = 1;
	154	}
	155	}
	156	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	157	$lemma_end = undef;
	158	$lemma_start = $collation->next_reading( $lemma_start );
	159	}
	160
	161	unless( $lemma_end ) {
	162	warn "No match found for @lemma_words at $line.$num";
	163	next;
	164	}
	165
	166	# Now we have found the lemma; we will record an 'edit', in
	167	# terms of a splice operation, for each subsequent reading.
	168	# We also note which witnesses take the given edit.
	169
	170	my @lemma_set = $collation->reading_sequence( $lemma_start,
	171	$lemma_end );
	172	my @reading_sets = [ @lemma_set ];
b0b4421a	173
910a0a6d	174	# For each reading that is not rdg_0, we create the variant
	175	# reading nodes, and store the range as an edit operation on
	176	# the base text.
	177	my $variant_objects;
	178	my %pc_seen; # Keep track of mss with explicit post-corr data
	179	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	180	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	181
910a0a6d	182	# Keep track of what witnesses we have seen.
	183	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	184	# Keep track of which witnesses bear corrected readings here.
	185	foreach my $m ( @mss ) {
	186	my $base = _is_post_corr( $m );
	187	next unless $base;
	188	$pc_seen{$base} = 1;
	189	}
	190	next if $k eq 'rdg_0';
	191
	192	# Parse the variant into reading tokens.
	193	# TODO don't hardcode the reading split operation
	194	my @variant = split( /\s+/, $app->{$k} );
	195	@variant = () if $app->{$k} eq '/'; # This is an omission.
	196
	197	my @variant_readings;
	198	my $ctr = 0;
	199	foreach my $vw ( @variant ) {
	200	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
49d4f2ac	201	my $vwreading = $collation->add_reading( {
	202	'id' => $vwname,
	203	'text' => $vw } );
910a0a6d	204	push( @variant_readings, $vwreading );
	205	}
	206
	207	$variant_objects->{$k} = { 'mss' => \@mss,
	208	'reading' => \@variant_readings,
	209	};
	210	push( @reading_sets, \@variant_readings );
	211	}
	212
	213	# Now collate and collapse the identical readings within the
	214	# collated sets. Modifies the reading sets that were passed.
	215	collate_variants( $collation, @reading_sets );
	216
	217	# Record any stated relationships between the nodes and the lemma.
	218	set_relationships( $collation, $app, \@lemma_set, $variant_objects );
	219
	220	# Now create the splice-edit objects that will be used
	221	# to reconstruct each witness.
	222
	223	foreach my $rkey ( keys %$variant_objects ) {
	224	# Object is argument list for splice, so:
	225	# offset, length, replacements
49d4f2ac	226	my $edit_object = [ $lemma_start->id,
910a0a6d	227	scalar( @lemma_set ),
	228	$variant_objects->{$rkey}->{reading} ];
	229	foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
	230	# Is this a p.c. entry?
	231	my $base = _is_post_corr( $ms );
	232	if( $base ) { # this is a post-corr witness
	233	my $pc_key = $base . "_post";
	234	add_hash_entry( $edits_required, $pc_key, $edit_object );
	235	} else { # this is an ante-corr witness
	236	my $pc_key = $ms . "_post";
	237	add_hash_entry( $edits_required, $ms, $edit_object );
	238	unless( $pc_seen{$ms} ) {
	239	# If this witness carries no correction, add this
	240	# same object to its post-corrected state.
	241	add_hash_entry( $edits_required, $pc_key,
	242	$edit_object );
	243	}
	244	}
	245	}
	246	}
4ca00eca	247	} # Finished going through the apparatus entries
	248
	249	# Now make the witness objects, and create their text sequences
6a222840	250	foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d	251	print STDERR "Creating witness $w\n";
82fa4d57	252	my $witness_obj = $collation->tradition->add_witness(
82fa4d57	253	sigil => $w, sourcetype => 'collation' );
910a0a6d	254	my $debug; # = $w eq 'Vb11';
	255	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
	256	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
	257	if exists( $edits_required->{$w."_post"} );
	258
	259	my @repeated = check_for_repeated( @ante_corr_seq );
	260	warn "Repeated elements @repeated in $w a.c."
	261	if @repeated;
	262	@repeated = check_for_repeated( @post_corr_seq );
	263	warn "Repeated elements @repeated in $w p.c."
	264	if @repeated;
	265
	266	# Now save these paths in my witness object
	267	if( @post_corr_seq ) {
	268	$witness_obj->path( \@post_corr_seq );
	269	$witness_obj->uncorrected_path( \@ante_corr_seq );
	270	} else {
	271	$witness_obj->path( \@ante_corr_seq );
	272	}
b49c4318	273	}
e2902068	274
6a222840	275	# Now remove our 'base text' edges, which is to say, the only
1ed3973e	276	# ones we have created so far. Also remove any unwitnessed
1ed3973e	277	# lemma nodes (TODO unless we are treating base as witness)
6a222840	278	foreach ( $collation->paths() ) {
49d4f2ac	279	$collation->del_path( $_, $collation->baselabel );
6a222840	280	}
4ca00eca	281
b15511bf	282	### HACKY HACKY Do some one-off path corrections here.
b0b4421a	283	if( $opts->{'input'} eq 'KUL' ) {
	284	require 'data/boodts/s158.HACK';
	285	KUL::HACK::pre_path_hack( $collation );
	286	}
	287
910a0a6d	288	# Now walk paths and calculate positional rank.
7e450e44	289	$collation->make_witness_paths();
49d4f2ac	290	# Now delete any orphaned readings.
	291	foreach my $r ( $collation->sequence->isolated_vertices ) {
	292	print STDERR "Deleting unconnected reading $r / " .
	293	$collation->reading( $r )->text . "\n";
	294	$collation->del_reading( $r );
	295	}
	296
b0b4421a	297	KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
910a0a6d	298	# Have to check relationship validity at this point, because before that
	299	# we had no paths.
	300	# foreach my $rel ( $collation->relationships ) {
	301	# next unless $rel->equal_rank;
	302	# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
	303	# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
49d4f2ac	304	# $rel->type, $rel->from->id, $rel->to->id );
910a0a6d	305	# }
910a0a6d	306	# }
82a45078	307	unless( $opts->{'nocalc'} ) {
	308	$collation->calculate_common_readings(); # will implicitly rank
	309	}
15d2d3df	310	}
15d2d3df	311
2ceca8c3	312	=item B<read_base>
2ceca8c3	313
e2902068	314	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	315
e2902068	316	Takes a text file and a (presumed empty) collation object, adds the
	317	words as simple linear readings to the collation, and returns a
	318	list of readings that represent the beginning of lines. This collation
	319	is now the starting point for application of apparatus entries in
	320	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	321
2ceca8c3	322	=cut
b49c4318	323
b49c4318	324	sub read_base {
e2902068	325	my( $base_file, $collation ) = @_;
b49c4318	326
e2902068	327	# This array gives the first reading for each line. We put the
b49c4318	328	# common starting point in line zero.
49d4f2ac	329	my $last_reading = $collation->start;
49d4f2ac	330	$base_text_index{$last_reading->id} = 0;
e2902068	331	my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318	332
b49c4318	333	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
6a222840	334	my $i = 1;
b49c4318	335	while(<BASE>) {
910a0a6d	336	# Make the readings, and connect them up for the base, but
	337	# also save the first reading of each line in an array for the
	338	# purpose.
	339	# TODO use configurable reading separator
	340	chomp;
	341	my @words = split;
	342	my $started = 0;
	343	my $wordref = 0;
	344	my $lineref = scalar @$lineref_array;
	345	last if $SHORTEND && $lineref > $SHORTEND;
	346	foreach my $w ( @words ) {
	347	my $readingref = join( ',', $lineref, ++$wordref );
49d4f2ac	348	my $reading = $collation->add_reading( { id => $readingref, text => $w } );
910a0a6d	349	unless( $started ) {
	350	push( @$lineref_array, $reading );
	351	$started = 1;
	352	}
	353	# Add edge paths in the graph, for easier tracking when
	354	# we start applying corrections. These paths will be
	355	# removed when we're done.
	356	my $path = $collation->add_path( $last_reading, $reading,
	357	$collation->baselabel );
	358	$last_reading = $reading;
	359
	360	# Note an array index for the reading, for later correction splices.
	361	$base_text_index{$readingref} = $i++;
	362	}
b49c4318	363	}
	364	close BASE;
	365	# Ending point for all texts
910a0a6d	366	$collation->add_path( $last_reading, $collation->end, $collation->baselabel );
910a0a6d	367	push( @$lineref_array, $collation->end );
49d4f2ac	368	$base_text_index{$collation->end->id} = $i;
b49c4318	369
	370	return( @$lineref_array );
	371	}
	372
15d2d3df	373	sub set_relationships {
3265b0ce	374	my( $collation, $app, $lemma, $variants ) = @_;
15d2d3df	375	foreach my $rkey ( keys %$variants ) {
910a0a6d	376	my $var = $variants->{$rkey}->{'reading'};
	377	my $type = $app->{sprintf( "_%s_type", $rkey )};
	378	my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
	379	my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
	380
	381	my %rel_options = ();
	382	$rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
	383	$rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
	384
	385	if( $type =~ /^(inv\|tr\|rep)$/i ) {
	386	# Transposition or repetition: look for nodes with the
	387	# same label but different IDs and mark them.
	388	$type = 'repetition' if $type =~ /^rep/i;
	389	$rel_options{'type'} = $type;
	390	$rel_options{'equal_rank'} = undef;
	391	my %labels;
	392	foreach my $r ( @$lemma ) {
	393	$labels{cmp_str( $r )} = $r;
	394	}
	395	foreach my $r( @$var ) {
49d4f2ac	396	if( exists $labels{$r->text} &&
49d4f2ac	397	$r->id ne $labels{$r->text}->id ) {
910a0a6d	398	if( $type eq 'repetition' ) {
910a0a6d	399	# Repetition
63778331	400	try {
	401	$collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
	402	} catch( Text::Tradition::Error $e ) {
	403	warn "Could not set repetition relationship $r -> "
	404	. $labels{$r->text} . ": " . $e->message;
	405	}
910a0a6d	406	} else {
910a0a6d	407	# Transposition
63778331	408	try {
	409	$r->set_identical( $labels{$r->text} );
	410	} catch( Text::Tradition::Error $e ) {
	411	warn "Could not set transposition relationship $r -> "
	412	. $labels{$r->text} . ": " . $e->message;
	413	}
910a0a6d	414	}
	415	}
	416	}
	417	} elsif( $type =~ /^(gr\|sp(el)?)$/i ) {
	418
	419	# Grammar/spelling/lexical: this can be a one-to-one or
	420	# one-to-many mapping. We should think about merging
	421	# readings if it is one-to-many.
	422
	423	$type = 'grammatical' if $type =~ /gr/i;
	424	$type = 'spelling' if $type =~ /sp/i;
	425	$type = 'repetition' if $type =~ /rep/i;
	426	# $type = 'lexical' if $type =~ /lex/i;
	427	$rel_options{'type'} = $type;
	428	$rel_options{'equal_rank'} = 1;
	429	if( @$lemma == @$var ) {
	430	foreach my $i ( 0 .. $#{$lemma} ) {
63778331	431	try {
	432	$collation->add_relationship( $var->[$i], $lemma->[$i],
	433	\%rel_options );
	434	} catch( Text::Tradition::Error $e ) {
	435	warn "Could not set $type relationship " . $var->[$i] . " -> "
	436	. $lemma->[$i] . ": " . $e->message;
	437	}
910a0a6d	438	}
	439	} else {
	440	# An uneven many-to-many mapping. Skip for now.
	441	# We really want to make a segment out of whatever we have.
	442	# my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
	443	# my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
	444	# $collation->add_relationship( $varseg, $lemseg, \%rel_options );
63778331	445	# if( @$lemma == 1 && @$var == 1 ) {
	446	# $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
	447	# }
910a0a6d	448	}
	449	} elsif( $type !~ /^(add\|om\|lex)$/i ) {
	450	warn "Unrecognized type $type";
	451	}
15d2d3df	452	}
15d2d3df	453	}
910a0a6d	454
15d2d3df	455
15d2d3df	456
4ca00eca	457	sub apply_edits {
b15511bf	458	my( $collation, $edit_sequence, $debug ) = @_;
49d4f2ac	459	my @lemma_text = $collation->reading_sequence(
49d4f2ac	460	$collation->start, $collation->end );
4ca00eca	461	my $drift = 0;
b15511bf	462	foreach my $correction ( @$edit_sequence ) {
910a0a6d	463	my( $lemma_start, $length, $items ) = @$correction;
	464	my $offset = $base_text_index{$lemma_start};
	465	my $realoffset = $offset + $drift;
	466	if( $debug \|\|
49d4f2ac	467	$lemma_text[$realoffset]->id ne $lemma_start ) {
910a0a6d	468	my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
	469	my @base_phrase;
	470	my $i = $realoffset;
	471	my $l = $collation->reading( $lemma_start );
	472	while( $i < $realoffset+$length ) {
	473	push( @base_phrase, $l );
	474	$l = $collation->next_reading( $l );
	475	$i++;
	476	}
	477
	478	print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
	479	"with %s (%s) with drift %d\n",
49d4f2ac	480	join( ' ', map {$_->text} @base_phrase ),
49d4f2ac	481	join( ' ', map {$_->id} @base_phrase ),
910a0a6d	482	$realoffset,
49d4f2ac	483	join( ' ', map {$_->text} @$items ),
49d4f2ac	484	join( ' ', map {$_->id} @$items ),
910a0a6d	485	$drift,
	486	) if $debug;
	487
49d4f2ac	488	if( $lemma_text[$realoffset]->id ne $lemma_start ) {
910a0a6d	489	warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
910a0a6d	490	"but %s (%s) is there instead",
49d4f2ac	491	join( ' ', map {$_->text} @base_phrase ),
	492	join( ' ', map {$_->id} @base_phrase ),
	493	join( ' ', map {$_->text} @$items ),
	494	join( ' ', map {$_->id} @$items ),
	495	join( ' ', map {$_->text} @this_phrase ),
	496	join( ' ', map {$_->id} @this_phrase ),
910a0a6d	497	) );
	498	# next;
	499	}
	500	}
	501	splice( @lemma_text, $realoffset, $length, @$items );
	502	$drift += @$items - $length;
b49c4318	503	}
b15511bf	504	return @lemma_text;
b49c4318	505	}
910a0a6d	506
4ca00eca	507
e2902068	508	# Helper function. Given a witness sigil, if it is a post-correctione
	509	# sigil,return the base witness. If not, return a false value.
	510	sub _is_post_corr {
	511	my( $sigil ) = @_;
930ff666	512	if( $sigil =~ /^(.?)(\s\(?p\.\s*c\.\)?)$/ ) {
910a0a6d	513	return $1;
e2902068	514	}
	515	return undef;
	516	}
	517
b49c4318	518
2ceca8c3	519	=back
	520
	521	=head1 LICENSE
	522
	523	This package is free software and is provided "as is" without express
	524	or implied warranty. You can redistribute it and/or modify it under
	525	the same terms as Perl itself.
	526
	527	=head1 AUTHOR
	528
	529	Tara L Andrews, aurum@cpan.org
	530
	531	=cut
	532
b49c4318	533	1;