[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;
use TryCatch;
use Text::Tradition::Parser::Util qw( collate_variants cmp_str 
	check_for_repeated add_hash_entry );

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText;
Text::Tradition::Parser::BaseText::merge_base( $collation, $opts, @apparatus_entries );

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $tradition, $opts );

Takes an initialized tradition object and a hashref of options, which must include:
- 'base' - the file containing the base text referenced by the variants
- 'input' - the format of the variant list; this names the
  Text::Tradition::Parser::* module whose read() function is called
- 'data' - the variants, in the given format.

=cut

sub parse {
    my $tradition = shift;
    my $opts      = shift;

    # The apparatus format is handled by the parser class named after
    # the 'input' option; load it on demand.
    my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
    load( $format_mod );

    # TODO Handle a string someday if we ever have a format other than KUL
    # The format module's read() is called as a plain function and is
    # handed the whole options hashref.
    my $reader            = $format_mod->can('read');
    my @apparatus_entries = $reader->( $opts );

    # Join the entries onto the base text of the tradition's collation.
    merge_base( $tradition->collation, $opts, @apparatus_entries );
}

=item B<merge_base>

merge_base( $collation, $opts, @apparatus_entries )

Takes three arguments: a newly-initialized collation object, the
hashref of parser options (whose 'base' entry names the text file
containing the reference text), and a list of variants (apparatus
entries).  Adds the base text to the collation, and joins the
variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

# Checked by both read_base and merge_base: when true, processing stops
# once the line number exceeds this value.  The empty string disables it.
my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed

# Maps a reading ID to its position in the base text.  Filled by
# read_base; apply_edits uses it to turn the starting reading ID of an
# edit into a splice offset.
my %base_text_index;
# Edits collected by merge_base, keyed by witness sigil, with a second
# "<sigil>_post" key holding the post-correction state of that witness.
my $edits_required = {};

# edits_required -> wit -> [ [ start_reading_id, length, [ items ] ], ... ]

# Merge a list of apparatus entries onto the base text of a collation.
#
# Arguments:
#   $collation   - the collation object to populate
#   $opts        - parser options hashref; 'base' is the base text file
#                  and 'input' is the apparatus format name
#   @app_entries - apparatus entry hashrefs (see the POD above for their
#                  layout)
#
# Steps: read the base text into the collation, locate each entry's lemma
# in its line, create readings for each variant, record one splice-style
# edit per witness, build every witness path from those edits, and then
# remove the temporary base-text paths.
sub merge_base {
    my( $collation, $opts, @app_entries ) = @_;

    # The edit table lives at file scope.  Start every merge with an empty
    # one, so that witnesses and edits collected by an earlier call in the
    # same process cannot leak into this collation.
    $edits_required = {};

    my @base_line_starts = read_base( $opts->{'base'}, $collation );

    foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
        last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
        my $scrutinize = '';
        my $first_line_reading = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];
        
        my $lemma = $app->{rdg_0};
        my $seq = 1; 
        # Is this the Nth occurrence of this reading in the line?
        if( $lemma =~ s/(_)?(\d)$// ) {
            $seq = $2;
        }
        my @lemma_words = split( /\s+/, $lemma );
        
        # Now search for the lemma words within this line.
        my $lemma_start = $first_line_reading;
        my $lemma_end;
        my %seen;
        while( $lemma_start ne $too_far ) {
            # Loop detection
            if( $seen{ $lemma_start->id() } ) {
                warn "Detected loop at " . $lemma_start->id() . 
                    ", ref $line,$num";
                last;
            }
            $seen{ $lemma_start->id() } = 1;
            
            # Try to match the lemma.
            my $unmatch = 0;
            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
                $lemma_words[0] . "...\n"
                if "$line.$num" eq $scrutinize;
            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
                # Skip it if we need a match that is not the first.
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
                        my $next_reading = 
                            $collation->next_reading( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n", 
                                    cmp_str($next_reading), $w
                                if "$line.$num" eq $scrutinize;
                            if( $w ne cmp_str($next_reading) ) {
                                $unmatch = 1;
                                last;
                            } else {
                                $lemma_end = $next_reading;
                                $next_reading = 
                                    $collation->next_reading( $lemma_end );
                            }
                        }
                    } else {
                        $lemma_end = $lemma_start;
                    }
                } else {
                    $unmatch = 1;
                }
            }
            # Stop as soon as a complete match has been recorded; otherwise
            # forget any partial match and move on to the next reading.
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
            $lemma_start = $collation->next_reading( $lemma_start );
        }
        
        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
        }
        
        # Now we have found the lemma; we will record an 'edit', in
        # terms of a splice operation, for each subsequent reading.
        # We also note which witnesses take the given edit.

        my @lemma_set = $collation->reading_sequence( $lemma_start, 
                                                      $lemma_end );
        my @reading_sets = [ @lemma_set ];
        
        # For each reading that is not rdg_0, we create the variant
        # reading nodes, and store the range as an edit operation on
        # the base text.
        my $variant_objects = {};
        my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
            # The witnesses of a reading are the keys whose value is the
            # reading's own key.
            my @mss = grep { $app->{$_} eq $k } keys( %$app );

            # Keep track of which witnesses bear corrected readings here.
            foreach my $m ( @mss ) {
                my $base = _is_post_corr( $m );
                next unless $base;
                $pc_seen{$base} = 1;
            }
            next if $k eq 'rdg_0';

            # Parse the variant into reading tokens.
            # TODO don't hardcode the reading split operation
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            
            my @variant_readings;
            my $ctr = 0;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
                my $vwreading = $collation->add_reading( {
                    'id' => $vwname,
                    'text' => $vw } );
                push( @variant_readings, $vwreading );
            }

            $variant_objects->{$k} = { 'mss' => \@mss,
                                       'reading' => \@variant_readings,
            };
            push( @reading_sets, \@variant_readings );
        }

        # Now collate and collapse the identical readings within the
        # collated sets.  Modifies the reading sets that were passed.
        collate_variants( $collation, @reading_sets );

        # Record any stated relationships between the nodes and the lemma.
        set_relationships( $collation, $app, \@lemma_set, $variant_objects );

        # Now create the splice-edit objects that will be used
        # to reconstruct each witness.

        foreach my $rkey ( keys %$variant_objects ) {
            # Object is argument list for splice, so:
            # offset, length, replacements
            # (the offset is stored as a reading ID and translated into a
            # numeric position by apply_edits)
            my $edit_object = [ $lemma_start->id,
                                scalar( @lemma_set ),
                                $variant_objects->{$rkey}->{reading} ];
            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
                # Is this a p.c. entry?
                my $base = _is_post_corr( $ms );
                if( $base ) { # this is a post-corr witness
                    my $pc_key = $base . "_post";
                    add_hash_entry( $edits_required, $pc_key, $edit_object );
                } else { # this is an ante-corr witness
                    my $pc_key = $ms . "_post";
                    add_hash_entry( $edits_required, $ms, $edit_object );
                    unless( $pc_seen{$ms} ) {
                        # If this witness carries no correction, add this 
                        # same object to its post-corrected state.
                        add_hash_entry( $edits_required, $pc_key, 
                                         $edit_object );
                    }
                }
            }
        }
    } # Finished going through the apparatus entries

    # Now make the witness objects, and create their text sequences
    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
        print STDERR "Creating witness $w\n";
        my $witness_obj = $collation->tradition->add_witness( 
            sigil => $w, sourcetype => 'collation' );
        my $debug; #  = $w eq 'Vb11';
        my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
        # Declare first and assign conditionally: a "my" under a statement
        # modifier has undefined behaviour and can keep the value from a
        # previous iteration of this loop.
        my @post_corr_seq;
        @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
            if exists( $edits_required->{$w."_post"} );

        my @repeated = check_for_repeated( @ante_corr_seq );
        warn "Repeated elements @repeated in $w a.c."
            if @repeated;
        @repeated = check_for_repeated( @post_corr_seq );
        warn "Repeated elements @repeated in $w p.c."
            if @repeated;

        # Now save these paths in my witness object
        if( @post_corr_seq ) {
            $witness_obj->path( \@post_corr_seq );
            $witness_obj->uncorrected_path( \@ante_corr_seq );
        } else {
            $witness_obj->path( \@ante_corr_seq );
        }
    }

    # Now remove our 'base text' edges, which is to say, the only
    # ones we have created so far.  Also remove any unwitnessed
    # lemma nodes (TODO unless we are treating base as witness)
    foreach ( $collation->paths() ) {
        $collation->del_path( $_, $collation->baselabel );
    }

    ### HACKY HACKY Do some one-off path corrections here.
    if( $opts->{'input'} eq 'KUL' ) {
        require 'data/boodts/s158.HACK';
        KUL::HACK::pre_path_hack( $collation );
    }
    
    # Now walk paths and calculate positional rank.
    $collation->make_witness_paths();
    # Now delete any orphaned readings.
    foreach my $r ( $collation->sequence->isolated_vertices ) {
        print STDERR "Deleting unconnected reading $r / " . 
            $collation->reading( $r )->text . "\n";
        $collation->del_reading( $r );
    }
    
    KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
    # Have to check relationship validity at this point, because before that
    # we had no paths.
#     foreach my $rel ( $collation->relationships ) {
#         next unless $rel->equal_rank;
#         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
#             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
#                             $rel->type, $rel->from->id, $rel->to->id );
#         }
#     }
    $collation->calculate_common_readings(); # will implicitly rank
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

# Read the base text file into the collation as one linear chain of
# readings joined by base-label paths.
#
# Arguments:
#   $base_file - name of the text file holding the base text
#   $collation - collation object that receives the readings and paths
#
# Returns a list of readings: the collation's start reading, then the
# first reading of every line that contained at least one word, then the
# collation's end reading.  Dies if the file cannot be opened.
#
# Side effect: %base_text_index is rebuilt so that every reading ID
# (start and end included) maps to its position in the sequence.
sub read_base {
    my( $base_file, $collation ) = @_;

    # The index describes one base text only; drop whatever an earlier
    # call may have left behind.
    %base_text_index = ();
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start;
    $base_text_index{$last_reading->id} = 0;
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    # Three-argument open with a lexical handle, so that the file name is
    # never interpreted as a mode or a command, and no global bareword
    # handle is left behind.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";
    my $i = 1;
    # A named lexical is used for the line so that the caller's $_ is
    # left alone.
    while( my $line = <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp $line;
        my @words = split( ' ', $line );
        my $started = 0;
        my $wordref = 0;
        # A line without any words pushes nothing onto the array, so it
        # does not consume a line number.
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            # Reading IDs have the form "<line>,<word>", both counted from 1.
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( { id => $readingref, text => $w } );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading, 
                                  $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close $base_fh;
    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->id} = $i;

    return( @$lineref_array );
}

# Record the relationships that an apparatus entry declares between its
# lemma readings and the readings of each of its variants.
#
# Arguments:
#   $collation - collation object; add_relationship is called on it
#   $app       - the apparatus entry hashref; for each variant key <rdg>
#                the entries "_<rdg>_type", "_<rdg>_non_corr" and
#                "_<rdg>_non_indep" are consulted
#   $lemma     - arrayref of the lemma readings
#   $variants  - hashref keyed by variant key; the 'reading' entry of each
#                value is an arrayref of that variant's readings
#
# The return value is not meaningful.  Failures to set a relationship and
# unrecognized types are reported with warn.
sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};

        # A variant with no declared type would otherwise produce an
        # "uninitialized value" warning at every pattern match below.
        # It still ends up in the "Unrecognized type" warning.
        $type = '' unless defined $type;
        
        # The two flags are only passed on when they are a single digit
        # other than zero.
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
        
        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            # NOTE(review): the lemma side is keyed by cmp_str() while the
            # variant side is looked up by raw ->text; if cmp_str()
            # normalizes its input these can fail to meet.  Confirm
            # against Text::Tradition::Parser::Util before changing.
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r( @$var ) {
                if( exists $labels{$r->text} &&
                    $r->id ne $labels{$r->text}->id ) {
                    if( $type eq 'repetition' ) {
                        # Repetition
                        try {
                            $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
                        } catch( Text::Tradition::Error $e ) {
                            warn "Could not set repetition relationship $r -> " 
                                . $labels{$r->text} . ": " . $e->message;
                        }
                    } else {
                        # Transposition
                        try {
                            $r->set_identical( $labels{$r->text} );
                        } catch( Text::Tradition::Error $e ) {
                            warn "Could not set transposition relationship $r -> " 
                                . $labels{$r->text} . ": " . $e->message;
                        }
                    }
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling/lexical: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            # Only 'gr', 'sp' and 'spel' can reach this branch, so these
            # two expansions cover every case.
            $type = 'grammatical' if $type =~ /gr/i;
            $type = 'spelling' if $type =~ /sp/i;
            # $type = 'lexical' if $type =~ /lex/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                # Same number of readings on both sides: pair them up by
                # position.
                foreach my $i ( 0 .. $#{$lemma} ) {
                    try {
                        $collation->add_relationship( $var->[$i], $lemma->[$i],
                            \%rel_options );
                    } catch( Text::Tradition::Error $e ) {
                        warn "Could not set $type relationship " . $var->[$i] . " -> " 
                            . $lemma->[$i] . ": " . $e->message;
                    }
                } 
            } else {
                # An uneven many-to-many mapping.  Skip for now.
                # We really want to make a segment out of whatever we have.
                # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
                # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
                # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
                # if( @$lemma == 1 && @$var == 1 ) {
                #     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
                # }
            }
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            # add / om / lex are accepted without recording a relationship.
            warn "Unrecognized type $type";
        }
    }
    return;
}
        

# Build one witness's reading sequence by applying its recorded edits to
# the sequence of readings between the collation's start and end.
#
# Arguments:
#   $collation     - the collation object
#   $edit_sequence - arrayref of edits, each [ start_reading_id, length,
#                    arrayref_of_replacement_readings ]
#   $debug         - when true, every edit is described on STDERR
#
# Returns the edited list of readings.
sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;

    my @witness_text = $collation->reading_sequence( 
        $collation->start, $collation->end );

    # Running difference in length between the edited sequence and the
    # base text, caused by the edits applied so far.
    my $shift = 0;

    for my $edit ( @$edit_sequence ) {
        my( $start_id, $span, $replacement ) = @$edit;

        # Position in the base text, corrected for the accumulated shift.
        my $pos = $base_text_index{$start_id} + $shift;

        if( $debug ||
            $witness_text[$pos]->id ne $start_id ) {
            # What currently occupies the target range...
            my @found = @witness_text[ $pos .. $pos + $span - 1 ];

            # ...and what was expected there, obtained by following
            # next_reading from the edit's starting reading.
            my @expected;
            my $cursor = $collation->reading( $start_id );
            for ( 1 .. $span ) {
                push( @expected, $cursor );
                $cursor = $collation->next_reading( $cursor );
            }

            if( $debug ) {
                print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
                                      "with %s (%s) with drift %d\n",
                                      join( ' ', map {$_->text} @expected ),
                                      join( ' ', map {$_->id} @expected ),
                                      $pos,
                                      join( ' ', map {$_->text} @$replacement ),
                                      join( ' ', map {$_->id} @$replacement ),
                                      $shift,
                                      );
            }

            if( $witness_text[$pos]->id ne $start_id ) {
                my $complaint = sprintf( "Should be replacing %s (%s) with %s (%s) " .
                                         "but %s (%s) is there instead", 
                                         join( ' ', map {$_->text} @expected ),
                                         join( ' ', map {$_->id} @expected ),
                                         join( ' ', map {$_->text} @$replacement ),
                                         join( ' ', map {$_->id} @$replacement ),
                                         join( ' ', map {$_->text} @found ),
                                         join( ' ', map {$_->id} @found ),
                                         );
                warn( $complaint );
                # The edit is applied regardless of the mismatch.
            }
        }

        splice( @witness_text, $pos, $span, @$replacement );
        $shift += scalar( @$replacement ) - $span;
    }

    return @witness_text;
}
        

# Helper function.  Recognizes a post-correction witness sigil, i.e. one
# that ends in "p.c.", optionally parenthesized and optionally with
# whitespace before it or after the "p.".  Returns the part of the sigil
# before that suffix, or undef when the sigil carries no such suffix.
sub _is_post_corr {
    my $sigil = shift;
    return $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ? $1 : undef;
}


=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
63778331	6	use TryCatch;
	7	use Text::Tradition::Parser::Util qw( collate_variants cmp_str
	8	check_for_repeated add_hash_entry );
b49c4318	9
2ceca8c3	10	=head1 NAME
	11
	12	Text::Tradition::Parser::BaseText
	13
	14	=head1 SYNOPSIS
	15
	16	use Text::Tradition::Parser::BaseText qw( merge_base );
	17	merge_base( $graph, 'reference.txt', @apparatus_entries )
	18
	19	=head1 DESCRIPTION
	20
	21	For an overview of the package, see the documentation for the
408449b7	22	Text::Tradition module.
2ceca8c3	23
	24	This module is meant for use with certain of the other Parser classes
	25	- whenever a list of variants is given with reference to a base text,
	26	these must be joined into a single collation. The parser should
	27	therefore make a list of variants and their locations, and BaseText
	28	will join those listed variants onto the reference text.
	29
	30	=head1 SUBROUTINES
	31
	32	=over
	33
52ce987f	34	=item B<parse>
52ce987f	35
408449b7	36	parse( $graph, $opts );
52ce987f	37
408449b7	38	Takes an initialized graph and a hashref of options, which must include:
52ce987f	39	- 'base' - the base text referenced by the variants
	40	- 'format' - the format of the variant list
	41	- 'data' - the variants, in the given format.
	42
	43	=cut
	44
	45	sub parse {
dfc37e38	46	my( $tradition, $opts ) = @_;
52ce987f	47
dfc37e38	48	my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f	49	load( $format_mod );
408449b7	50	# TODO Handle a string someday if we ever have a format other than KUL
408449b7	51	my @apparatus_entries = $format_mod->can('read')->( $opts );
b0b4421a	52	merge_base( $tradition->collation, $opts, @apparatus_entries );
52ce987f	53	}
52ce987f	54
2ceca8c3	55	=item B<merge_base>
	56
	57	merge_base( $graph, 'reference.txt', @apparatus_entries )
	58
	59	Takes three arguments: a newly-initialized Text::Tradition::Graph
	60	object, a text file containing the reference text, and a list of
	61	variants (apparatus entries). Adds the base text to the graph, and
	62	joins the variants to that.
	63
	64	The list of variants is an array of hash references; each hash takes
	65	the form
	66	{ '_id' => line reference,
	67	'rdg_0' => lemma reading,
	68	'rdg_1' => first variant,
	69	... # and so on until all distinct readings are listed
	70	'WitnessA' => 'rdg_0',
	71	'WitnessB' => 'rdg_1',
	72	... # and so on until all witnesses are listed with their readings
	73	}
	74
	75	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	76	begin with an underscore is assumed to be a witness name. Any 'meta'
	77	information to be passed must be passed in a key with a leading
	78	underscore in its name.
	79
	80	=cut
	81
b15511bf	82	my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca	83
4ca00eca	84	my %base_text_index;
6a222840	85	my $edits_required = {};
4ca00eca	86
4ca00eca	87	# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666	88
b49c4318	89	sub merge_base {
b0b4421a	90	my( $collation, $opts, @app_entries ) = @_;
b0b4421a	91	my @base_line_starts = read_base( $opts->{'base'}, $collation );
b49c4318	92
52ce987f	93	my %all_witnesses;
b49c4318	94	foreach my $app ( @app_entries ) {
910a0a6d	95	my( $line, $num ) = split( /\./, $app->{_id} );
	96	# DEBUG with a short graph
	97	last if $SHORTEND && $line > $SHORTEND;
	98	# DEBUG for problematic entries
	99	my $scrutinize = '';
	100	my $first_line_reading = $base_line_starts[ $line ];
	101	my $too_far = $base_line_starts[ $line+1 ];
	102
	103	my $lemma = $app->{rdg_0};
	104	my $seq = 1;
	105	# Is this the Nth occurrence of this reading in the line?
	106	if( $lemma =~ s/(_)?(\d)$// ) {
	107	$seq = $2;
	108	}
	109	my @lemma_words = split( /\s+/, $lemma );
	110
	111	# Now search for the lemma words within this line.
	112	my $lemma_start = $first_line_reading;
	113	my $lemma_end;
	114	my %seen;
	115	while( $lemma_start ne $too_far ) {
	116	# Loop detection
49d4f2ac	117	if( $seen{ $lemma_start->id() } ) {
49d4f2ac	118	warn "Detected loop at " . $lemma_start->id() .
910a0a6d	119	", ref $line,$num";
	120	last;
	121	}
49d4f2ac	122	$seen{ $lemma_start->id() } = 1;
910a0a6d	123
	124	# Try to match the lemma.
	125	my $unmatch = 0;
	126	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	127	$lemma_words[0] . "...\n"
	128	if "$line.$num" eq $scrutinize;
	129	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	130	# Skip it if we need a match that is not the first.
	131	if( --$seq < 1 ) {
	132	# Now we have to compare the rest of the words here.
	133	if( scalar( @lemma_words ) > 1 ) {
	134	my $next_reading =
	135	$collation->next_reading( $lemma_start );
	136	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	137	printf STDERR "Now matching %s against %s\n",
	138	cmp_str($next_reading), $w
	139	if "$line.$num" eq $scrutinize;
	140	if( $w ne cmp_str($next_reading) ) {
	141	$unmatch = 1;
	142	last;
	143	} else {
	144	$lemma_end = $next_reading;
	145	$next_reading =
	146	$collation->next_reading( $lemma_end );
	147	}
	148	}
	149	} else {
	150	$lemma_end = $lemma_start;
	151	}
	152	} else {
	153	$unmatch = 1;
	154	}
	155	}
	156	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	157	$lemma_end = undef;
	158	$lemma_start = $collation->next_reading( $lemma_start );
	159	}
	160
	161	unless( $lemma_end ) {
	162	warn "No match found for @lemma_words at $line.$num";
	163	next;
	164	}
	165
	166	# Now we have found the lemma; we will record an 'edit', in
	167	# terms of a splice operation, for each subsequent reading.
	168	# We also note which witnesses take the given edit.
	169
	170	my @lemma_set = $collation->reading_sequence( $lemma_start,
	171	$lemma_end );
	172	my @reading_sets = [ @lemma_set ];
b0b4421a	173
910a0a6d	174	# For each reading that is not rdg_0, we create the variant
	175	# reading nodes, and store the range as an edit operation on
	176	# the base text.
	177	my $variant_objects;
	178	my %pc_seen; # Keep track of mss with explicit post-corr data
	179	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	180	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	181
910a0a6d	182	# Keep track of what witnesses we have seen.
	183	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	184	# Keep track of which witnesses bear corrected readings here.
	185	foreach my $m ( @mss ) {
	186	my $base = _is_post_corr( $m );
	187	next unless $base;
	188	$pc_seen{$base} = 1;
	189	}
	190	next if $k eq 'rdg_0';
	191
	192	# Parse the variant into reading tokens.
	193	# TODO don't hardcode the reading split operation
	194	my @variant = split( /\s+/, $app->{$k} );
	195	@variant = () if $app->{$k} eq '/'; # This is an omission.
	196
	197	my @variant_readings;
	198	my $ctr = 0;
	199	foreach my $vw ( @variant ) {
	200	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
49d4f2ac	201	my $vwreading = $collation->add_reading( {
	202	'id' => $vwname,
	203	'text' => $vw } );
910a0a6d	204	push( @variant_readings, $vwreading );
	205	}
	206
	207	$variant_objects->{$k} = { 'mss' => \@mss,
	208	'reading' => \@variant_readings,
	209	};
	210	push( @reading_sets, \@variant_readings );
	211	}
	212
	213	# Now collate and collapse the identical readings within the
	214	# collated sets. Modifies the reading sets that were passed.
	215	collate_variants( $collation, @reading_sets );
	216
	217	# Record any stated relationships between the nodes and the lemma.
	218	set_relationships( $collation, $app, \@lemma_set, $variant_objects );
	219
	220	# Now create the splice-edit objects that will be used
	221	# to reconstruct each witness.
	222
	223	foreach my $rkey ( keys %$variant_objects ) {
	224	# Object is argument list for splice, so:
	225	# offset, length, replacements
49d4f2ac	226	my $edit_object = [ $lemma_start->id,
910a0a6d	227	scalar( @lemma_set ),
	228	$variant_objects->{$rkey}->{reading} ];
	229	foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
	230	# Is this a p.c. entry?
	231	my $base = _is_post_corr( $ms );
	232	if( $base ) { # this is a post-corr witness
	233	my $pc_key = $base . "_post";
	234	add_hash_entry( $edits_required, $pc_key, $edit_object );
	235	} else { # this is an ante-corr witness
	236	my $pc_key = $ms . "_post";
	237	add_hash_entry( $edits_required, $ms, $edit_object );
	238	unless( $pc_seen{$ms} ) {
	239	# If this witness carries no correction, add this
	240	# same object to its post-corrected state.
	241	add_hash_entry( $edits_required, $pc_key,
	242	$edit_object );
	243	}
	244	}
	245	}
	246	}
4ca00eca	247	} # Finished going through the apparatus entries
	248
	249	# Now make the witness objects, and create their text sequences
6a222840	250	foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d	251	print STDERR "Creating witness $w\n";
82fa4d57	252	my $witness_obj = $collation->tradition->add_witness(
82fa4d57	253	sigil => $w, sourcetype => 'collation' );
910a0a6d	254	my $debug; # = $w eq 'Vb11';
	255	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
	256	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
	257	if exists( $edits_required->{$w."_post"} );
	258
	259	my @repeated = check_for_repeated( @ante_corr_seq );
	260	warn "Repeated elements @repeated in $w a.c."
	261	if @repeated;
	262	@repeated = check_for_repeated( @post_corr_seq );
	263	warn "Repeated elements @repeated in $w p.c."
	264	if @repeated;
	265
	266	# Now save these paths in my witness object
	267	if( @post_corr_seq ) {
	268	$witness_obj->path( \@post_corr_seq );
	269	$witness_obj->uncorrected_path( \@ante_corr_seq );
	270	} else {
	271	$witness_obj->path( \@ante_corr_seq );
	272	}
b49c4318	273	}
e2902068	274
6a222840	275	# Now remove our 'base text' edges, which is to say, the only
1ed3973e	276	# ones we have created so far. Also remove any unwitnessed
1ed3973e	277	# lemma nodes (TODO unless we are treating base as witness)
6a222840	278	foreach ( $collation->paths() ) {
49d4f2ac	279	$collation->del_path( $_, $collation->baselabel );
6a222840	280	}
4ca00eca	281
b15511bf	282	### HACKY HACKY Do some one-off path corrections here.
b0b4421a	283	if( $opts->{'input'} eq 'KUL' ) {
	284	require 'data/boodts/s158.HACK';
	285	KUL::HACK::pre_path_hack( $collation );
	286	}
	287
910a0a6d	288	# Now walk paths and calculate positional rank.
7e450e44	289	$collation->make_witness_paths();
49d4f2ac	290	# Now delete any orphaned readings.
	291	foreach my $r ( $collation->sequence->isolated_vertices ) {
	292	print STDERR "Deleting unconnected reading $r / " .
	293	$collation->reading( $r )->text . "\n";
	294	$collation->del_reading( $r );
	295	}
	296
b0b4421a	297	KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
910a0a6d	298	# Have to check relationship validity at this point, because before that
	299	# we had no paths.
	300	# foreach my $rel ( $collation->relationships ) {
	301	# next unless $rel->equal_rank;
	302	# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
	303	# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
49d4f2ac	304	# $rel->type, $rel->from->id, $rel->to->id );
910a0a6d	305	# }
910a0a6d	306	# }
15db7774	307	$collation->calculate_common_readings(); # will implicitly rank
15d2d3df	308	}
15d2d3df	309
2ceca8c3	310	=item B<read_base>
2ceca8c3	311
e2902068	312	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	313
e2902068	314	Takes a text file and a (presumed empty) collation object, adds the
	315	words as simple linear readings to the collation, and returns a
	316	list of readings that represent the beginning of lines. This collation
	317	is now the starting point for application of apparatus entries in
	318	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	319
2ceca8c3	320	=cut
b49c4318	321
# read_base( $base_file, $collation )
#
# Reads the base text from $base_file, one line at a time, and adds every
# whitespace-separated word to $collation as a reading whose id is
# "<line>,<word>" (both counted from 1).  Consecutive readings, from the
# collation's start reading through to its end reading, are joined by paths
# labelled with $collation->baselabel.
#
# Side effects: fills the file-level %base_text_index, mapping each reading
# id to its position in the base sequence (start reading = 0).
#
# Returns a list: the start reading, then the first reading of every
# non-empty line, then the end reading.  Lines without any words add no
# element, so they do not advance the line numbering.
#
# Dies if $base_file cannot be opened.
sub read_base {
    my( $base_file, $collation ) = @_;

    # Element zero of the line array is the common starting point, so the
    # first real line of text is line 1.
    my $last_reading = $collation->start;
    $base_text_index{$last_reading->id} = 0;
    my $lineref_array = [ $last_reading ];

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle cannot collide with another
    # package's bareword BASE.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";
    my $i = 1;
    while( my $line = <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp $line;
        my @words = split( ' ', $line );
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
        # Optional truncation for testing: stop once past line $SHORTEND.
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading(
                { id => $readingref, text => $w } );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading,
                                  $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction
            # splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close $base_fh;

    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end,
                          $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->id} = $i;

    return( @$lineref_array );
}
	370
# set_relationships( $collation, $app, $lemma, $variants )
#
# Records the relationships that one apparatus entry declares between its
# lemma and each of its variants.
#
#   $collation - collation object; add_relationship is called on it
#   $app       - apparatus entry hashref; for each variant key the entries
#                "_<key>_type", "_<key>_non_corr" and "_<key>_non_indep"
#                are consulted
#   $lemma     - arrayref of the lemma's reading objects
#   $variants  - hashref keyed by variant key; each value is a hashref whose
#                'reading' entry is an arrayref of reading objects
#
# Handling by type (case-insensitive):
#   inv, tr     - transposition: a variant reading matching a lemma reading
#                 with a different id is marked identical to it
#   rep         - repetition: as above, but a 'repetition' relationship
#                 is added instead
#   gr, sp, spel - 'grammatical' / 'spelling' relationship, set pairwise and
#                 only when lemma and variant have the same length
#   add, om, lex - nothing to record
#   other       - warns "Unrecognized type"
#
# A Text::Tradition::Error raised while setting a relationship is reported
# with warn and does not stop processing.
sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};

        # An entry without a type still ends up in the "Unrecognized type"
        # branch, but without uninitialized-value noise on the way there.
        $type = '' unless defined $type;

        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr
            if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep
            if $nonindep && $nonindep =~ /^\d$/;

        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r ( @$var ) {
                # Look the variant up under the same cmp_str key that the
                # lemma readings were stored under; using the raw text here
                # would miss every lemma whose key differs from its text.
                my $match = $labels{cmp_str( $r )};
                next unless defined $match;
                next unless $r->id ne $match->id;
                if( $type eq 'repetition' ) {
                    # Repetition
                    try {
                        $collation->add_relationship( $r, $match,
                                                      \%rel_options );
                    } catch( Text::Tradition::Error $e ) {
                        warn "Could not set repetition relationship $r -> "
                            . $match . ": " . $e->message;
                    }
                } else {
                    # Transposition
                    try {
                        $r->set_identical( $match );
                    } catch( Text::Tradition::Error $e ) {
                        warn "Could not set transposition relationship $r -> "
                            . $match . ": " . $e->message;
                    }
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            # Only 'gr', 'sp' and 'spel' can reach this branch.
            $type = $type =~ /^gr/i ? 'grammatical' : 'spelling';
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                foreach my $i ( 0 .. $#{$lemma} ) {
                    try {
                        $collation->add_relationship( $var->[$i], $lemma->[$i],
                                                      \%rel_options );
                    } catch( Text::Tradition::Error $e ) {
                        warn "Could not set $type relationship " . $var->[$i] . " -> "
                            . $lemma->[$i] . ": " . $e->message;
                    }
                }
            } else {
                # An uneven many-to-many mapping.  Skip for now.
                # We really want to make a segment out of whatever we have.
                # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
                # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
                # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
                # if( @$lemma == 1 && @$var == 1 ) {
                #     $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
                # }
            }
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
910a0a6d	452
15d2d3df	453
15d2d3df	454
# apply_edits( $collation, $edit_sequence, $debug )
#
# Builds one witness's reading sequence by applying a list of edits to the
# base text.
#
#   $collation     - collation object holding the base text
#   $edit_sequence - arrayref of edits, each [ $lemma_start, $length, $items ]:
#                    the id of the first base reading to replace, the number
#                    of readings to replace, and an arrayref of the readings
#                    to put in their place
#   $debug         - if true, every edit is described on STDERR
#
# Positions come from the file-level %base_text_index and are shifted by the
# net size change of the edits already applied.  If the reading found at the
# computed position is not the expected lemma start, a warning is issued and
# the edit is applied anyway.
#
# Returns the resulting list of readings.
sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;

    # Small formatters for the diagnostics below.
    my $texts_of = sub { return join( ' ', map { $_->text } @_ ) };
    my $ids_of   = sub { return join( ' ', map { $_->id } @_ ) };

    my @lemma_text = $collation->reading_sequence(
        $collation->start, $collation->end );

    # Net number of readings added (or removed, if negative) so far.
    my $drift = 0;
    for my $edit ( @$edit_sequence ) {
        my( $lemma_start, $length, $items ) = @$edit;
        my $realoffset = $base_text_index{$lemma_start} + $drift;

        if( $debug || $lemma_text[$realoffset]->id ne $lemma_start ) {
            # What currently sits where the edit is about to land...
            my $last_index = $realoffset + $length - 1;
            my @this_phrase = @lemma_text[ $realoffset .. $last_index ];

            # ...and what the base text has there, found by walking the
            # collation forward from the lemma's first reading.
            my @base_phrase;
            my $rdg = $collation->reading( $lemma_start );
            for( my $taken = 0; $taken < $length; $taken++ ) {
                push( @base_phrase, $rdg );
                $rdg = $collation->next_reading( $rdg );
            }

            if( $debug ) {
                print STDERR sprintf(
                    "Trying to replace %s (%s) starting at %d "
                    . "with %s (%s) with drift %d\n",
                    $texts_of->( @base_phrase ),
                    $ids_of->( @base_phrase ),
                    $realoffset,
                    $texts_of->( @$items ),
                    $ids_of->( @$items ),
                    $drift,
                );
            }

            if( $lemma_text[$realoffset]->id ne $lemma_start ) {
                warn( sprintf(
                    "Should be replacing %s (%s) with %s (%s) "
                    . "but %s (%s) is there instead",
                    $texts_of->( @base_phrase ),
                    $ids_of->( @base_phrase ),
                    $texts_of->( @$items ),
                    $ids_of->( @$items ),
                    $texts_of->( @this_phrase ),
                    $ids_of->( @this_phrase ),
                ) );
                # next;
            }
        }

        splice( @lemma_text, $realoffset, $length, @$items );
        $drift += scalar( @$items ) - $length;
    }
    return @lemma_text;
}
910a0a6d	504
4ca00eca	505
# Helper function.  Given a witness sigil, if it is a post-correction
# (post correctionem) sigil, return the sigil of the base witness.  If not,
# return undef.
#
# A post-correction sigil is the base sigil followed by "p.c.", optionally
# preceded by whitespace, optionally parenthesised, and optionally with
# whitespace after "p.": e.g. "A p.c.", "Ab (p.c.)", "B (p. c.)".
sub _is_post_corr {
    my( $sigil ) = @_;
    # The base sigil may be any number of characters, so it is captured
    # lazily; everything from the optional whitespace onward is the marker.
    if( $sigil =~ /^(.*?)\s*\(?p\.\s*c\.\)?$/ ) {
        return $1;
    }
    # Explicit undef (not a bare return) so that callers using this in
    # list context still receive exactly one value.
    return undef;
}
	515
b49c4318	516
2ceca8c3	517	=back
	518
	519	=head1 LICENSE
	520
	521	This package is free software and is provided "as is" without express
	522	or implied warranty. You can redistribute it and/or modify it under
	523	the same terms as Perl itself.
	524
	525	=head1 AUTHOR
	526
	527	Tara L Andrews, aurum@cpan.org
	528
	529	=cut
	530
b49c4318	531	1;