[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, $opts );

Takes an initialized graph and a hashref of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

sub parse {
    my( $tradition, $opts ) = @_;

    my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
    load( $format_mod );
    # TODO Handle a string someday if we ever have a format other than KUL
    my @apparatus_entries = $format_mod->can('read')->( $opts );
    merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed

my %base_text_index;
my $edits_required = {};

# edits_required -> wit -> [ { start_idx, end_idx, items } ]

sub merge_base {
    my( $collation, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $collation );

    my %all_witnesses;
    my @unwitnessed_lemma_nodes;
    foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
        last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
        my $scrutinize = '';
        my $first_line_reading = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];
        
        my $lemma = $app->{rdg_0};
        my $seq = 1; 
        # Is this the Nth occurrence of this reading in the line?
        if( $lemma =~ s/(_)?(\d)$// ) {
            $seq = $2;
        }
        my @lemma_words = split( /\s+/, $lemma );
        
        # Now search for the lemma words within this line.
        my $lemma_start = $first_line_reading;
        my $lemma_end;
        my %seen;
        while( $lemma_start ne $too_far ) {
            # Loop detection
            if( $seen{ $lemma_start->name() } ) {
                warn "Detected loop at " . $lemma_start->name() . 
                    ", ref $line,$num";
                last;
            }
            $seen{ $lemma_start->name() } = 1;
            
            # Try to match the lemma.
            my $unmatch = 0;
            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
                $lemma_words[0] . "...\n"
                if "$line.$num" eq $scrutinize;
            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
                # Skip it if we need a match that is not the first.
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
                        my $next_reading = 
                            $collation->next_reading( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n", 
                                    cmp_str($next_reading), $w
                                if "$line.$num" eq $scrutinize;
                            if( $w ne cmp_str($next_reading) ) {
                                $unmatch = 1;
                                last;
                            } else {
                                $lemma_end = $next_reading;
                                $next_reading = 
                                    $collation->next_reading( $lemma_end );
                            }
                        }
                    } else {
                        $lemma_end = $lemma_start;
                    }
                } else {
                    $unmatch = 1;
                }
            }
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
            $lemma_start = $collation->next_reading( $lemma_start );
        }
        
        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
        }
        
        # Now we have found the lemma; we will record an 'edit', in
        # terms of a splice operation, for each subsequent reading.
        # We also note which witnesses take the given edit.

        my @lemma_set = $collation->reading_sequence( $lemma_start, 
                                                      $lemma_end );
        my @reading_sets = [ @lemma_set ];

        # For each reading that is not rdg_0, we create the variant
        # reading nodes, and store the range as an edit operation on
        # the base text.
        my $variant_objects;
        my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
            my @mss = grep { $app->{$_} eq $k } keys( %$app );

            # Keep track of lemma nodes that don't actually appear in
            # any MSS; we will want to remove them from the collation.
            push( @unwitnessed_lemma_nodes, @lemma_set )
                if !@mss && $k eq 'rdg_0';

            # Keep track of what witnesses we have seen.
            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
            # Keep track of which witnesses bear corrected readings here.
            foreach my $m ( @mss ) {
                my $base = _is_post_corr( $m );
                next unless $base;
                $pc_seen{$base} = 1;
            }
            next if $k eq 'rdg_0';

            # Parse the variant into reading tokens.
            # TODO don't hardcode the reading split operation
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            
            my @variant_readings;
            my $ctr = 0;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
                my $vwreading = $collation->add_reading( $vwname );
                $vwreading->text( $vw );
                push( @variant_readings, $vwreading );
            }

            $variant_objects->{$k} = { 'mss' => \@mss,
                                       'reading' => \@variant_readings,
            };
            push( @reading_sets, \@variant_readings );
        }

        # Now collate and collapse the identical readings within the
        # collated sets.  Modifies the reading sets that were passed.
        collate_variants( $collation, @reading_sets );

        # Record any stated relationships between the nodes and the lemma.
        set_relationships( $collation, $app, \@lemma_set, $variant_objects );

        # Now create the splice-edit objects that will be used
        # to reconstruct each witness.

        foreach my $rkey ( keys %$variant_objects ) {
            # Object is argument list for splice, so:
            # offset, length, replacements
            my $edit_object = [ $lemma_start->name,
                                scalar( @lemma_set ),
                                $variant_objects->{$rkey}->{reading} ];
            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
                # Is this a p.c. entry?
                my $base = _is_post_corr( $ms );
                if( $base ) { # this is a post-corr witness
                    my $pc_key = $base . "_post";
                    add_hash_entry( $edits_required, $pc_key, $edit_object );
                } else { # this is an ante-corr witness
                    my $pc_key = $ms . "_post";
                    add_hash_entry( $edits_required, $ms, $edit_object );
                    unless( $pc_seen{$ms} ) {
                        # If this witness carries no correction, add this 
                        # same object to its post-corrected state.
                        add_hash_entry( $edits_required, $pc_key, 
                                         $edit_object );
                    }
                }
            }
        }
    } # Finished going through the apparatus entries

    # Now make the witness objects, and create their text sequences
    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
        print STDERR "Creating witness $w\n";
        my $witness_obj = $collation->tradition->add_witness( sigil => $w );
        my $debug; #  = $w eq 'Vb11';
        my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
        my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
            if exists( $edits_required->{$w."_post"} );

        my @repeated = check_for_repeated( @ante_corr_seq );
        warn "Repeated elements @repeated in $w a.c."
            if @repeated;
        @repeated = check_for_repeated( @post_corr_seq );
        warn "Repeated elements @repeated in $w p.c."
            if @repeated;

        # Now save these paths in my witness object
        if( @post_corr_seq ) {
            $witness_obj->path( \@post_corr_seq );
            $witness_obj->uncorrected_path( \@ante_corr_seq );
        } else {
            $witness_obj->path( \@ante_corr_seq );
        }
    }

    # Now remove our 'base text' edges, which is to say, the only
    # ones we have created so far.  Also remove any unwitnessed
    # lemma nodes (TODO unless we are treating base as witness)
    foreach ( $collation->paths() ) {
        $collation->del_path( $_ );
    }
    foreach( @unwitnessed_lemma_nodes ) {
        $collation->del_reading( $_ );
        # TODO do we need to delete any relationship paths here?
    }

    ### HACKY HACKY Do some one-off path corrections here.
    require( 'data/boodts/s158.HACK' );
    KUL::HACK::pre_path_hack( $collation );

    # Now walk paths and calculate positional rank.
    $collation->make_witness_paths();
    KUL::HACK::post_path_hack( $collation );
    # Have to check relationship validity at this point, because before that
    # we had no paths.
#     foreach my $rel ( $collation->relationships ) {
#         next unless $rel->equal_rank;
#         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
#             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
#                             $rel->type, $rel->from->name, $rel->to->name );
#         }
#     }
    $collation->calculate_ranks();
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start();
    $base_text_index{$last_reading->name} = 0;
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    my $i = 1;
    while(<BASE>) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp;
        my @words = split;
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( $readingref );
            $reading->text( $w );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            my $path = $collation->add_path( $last_reading, $reading, 
                                             $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close BASE;
    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->name} = $i;

    return( @$lineref_array );
}

sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
        
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
        
        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r( @$var ) {
                if( exists $labels{$r->label} &&
                    $r->name ne $labels{$r->label}->name ) {
                    if( $type eq 'repetition' ) {
                        # Repetition
                        $collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
                    } else {
                        # Transposition
                        $r->set_identical( $labels{$r->label} );
                    }
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling/lexical: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            $type = 'grammatical' if $type =~ /gr/i;
            $type = 'spelling' if $type =~ /sp/i;
            $type = 'repetition' if $type =~ /rep/i;
            # $type = 'lexical' if $type =~ /lex/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                foreach my $i ( 0 .. $#{$lemma} ) {
                    $collation->add_relationship( $var->[$i], $lemma->[$i],
                        \%rel_options );
                } 
            } else {
                # An uneven many-to-many mapping.  Skip for now.
                # We really want to make a segment out of whatever we have.
                # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
                # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
                # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
                if( @$lemma == 1 && @$var == 1 ) {
                    $collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
                }
            }
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
        

sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;
    my @lemma_text = $collation->reading_sequence( $collation->start,
                                           $collation->reading( '#END#' ) );
    my $drift = 0;
    foreach my $correction ( @$edit_sequence ) {
        my( $lemma_start, $length, $items ) = @$correction;
        my $offset = $base_text_index{$lemma_start};
        my $realoffset = $offset + $drift;
        if( $debug ||
            $lemma_text[$realoffset]->name ne $lemma_start ) {
            my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
            my @base_phrase;
            my $i = $realoffset;
            my $l = $collation->reading( $lemma_start );
            while( $i < $realoffset+$length ) {
                push( @base_phrase, $l );
                $l = $collation->next_reading( $l );
                $i++;
            }
            
            print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
                                  "with %s (%s) with drift %d\n",
                                  join( ' ', map {$_->label} @base_phrase ),
                                  join( ' ', map {$_->name} @base_phrase ),
                                  $realoffset,
                                  join( ' ', map {$_->label} @$items ),
                                  join( ' ', map {$_->name} @$items ),
                                  $drift,
                                  ) if $debug;
                                  
            if( $lemma_text[$realoffset]->name ne $lemma_start ) {
                warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
                               "but %s (%s) is there instead", 
                               join( ' ', map {$_->label} @base_phrase ),
                               join( ' ', map {$_->name} @base_phrase ),
                               join( ' ', map {$_->label} @$items ),
                               join( ' ', map {$_->name} @$items ),
                               join( ' ', map {$_->label} @this_phrase ),
                               join( ' ', map {$_->name} @this_phrase ),
                      ) );
                # next;
            }
        }
        splice( @lemma_text, $realoffset, $length, @$items );
        $drift += @$items - $length;
    }
    return @lemma_text;
}
        

# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness.  If not, return a false value.
sub _is_post_corr {
    my( $sigil ) = @_;
    if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
        return $1;
    }
    return undef;
}


=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
910a0a6d	6	use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
b49c4318	7
2ceca8c3	8	=head1 NAME
	9
	10	Text::Tradition::Parser::BaseText
	11
	12	=head1 SYNOPSIS
	13
	14	use Text::Tradition::Parser::BaseText qw( merge_base );
	15	merge_base( $graph, 'reference.txt', @apparatus_entries )
	16
	17	=head1 DESCRIPTION
	18
	19	For an overview of the package, see the documentation for the
408449b7	20	Text::Tradition module.
2ceca8c3	21
	22	This module is meant for use with certain of the other Parser classes
	23	- whenever a list of variants is given with reference to a base text,
	24	these must be joined into a single collation. The parser should
	25	therefore make a list of variants and their locations, and BaseText
	26	will join those listed variants onto the reference text.
	27
	28	=head1 SUBROUTINES
	29
	30	=over
	31
52ce987f	32	=item B<parse>
52ce987f	33
408449b7	34	parse( $graph, $opts );
52ce987f	35
408449b7	36	Takes an initialized graph and a hashref of options, which must include:
52ce987f	37	- 'base' - the base text referenced by the variants
	38	- 'format' - the format of the variant list
	39	- 'data' - the variants, in the given format.
	40
	41	=cut
	42
	43	sub parse {
dfc37e38	44	my( $tradition, $opts ) = @_;
52ce987f	45
dfc37e38	46	my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f	47	load( $format_mod );
408449b7	48	# TODO Handle a string someday if we ever have a format other than KUL
408449b7	49	my @apparatus_entries = $format_mod->can('read')->( $opts );
dfc37e38	50	merge_base( $tradition->collation, $opts->{'base'}, @apparatus_entries );
52ce987f	51	}
52ce987f	52
2ceca8c3	53	=item B<merge_base>
	54
	55	merge_base( $graph, 'reference.txt', @apparatus_entries )
	56
	57	Takes three arguments: a newly-initialized Text::Tradition::Graph
	58	object, a text file containing the reference text, and a list of
	59	variants (apparatus entries). Adds the base text to the graph, and
	60	joins the variants to that.
	61
	62	The list of variants is an array of hash references; each hash takes
	63	the form
	64	{ '_id' => line reference,
	65	'rdg_0' => lemma reading,
	66	'rdg_1' => first variant,
	67	... # and so on until all distinct readings are listed
	68	'WitnessA' => 'rdg_0',
	69	'WitnessB' => 'rdg_1',
	70	... # and so on until all witnesses are listed with their readings
	71	}
	72
	73	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	74	begin with an underscore is assumed to be a witness name. Any 'meta'
	75	information to be passed must be passed in a key with a leading
	76	underscore in its name.
	77
	78	=cut
	79
b15511bf	80	my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca	81
4ca00eca	82	my %base_text_index;
6a222840	83	my $edits_required = {};
4ca00eca	84
4ca00eca	85	# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666	86
b49c4318	87	sub merge_base {
e2902068	88	my( $collation, $base_file, @app_entries ) = @_;
e2902068	89	my @base_line_starts = read_base( $base_file, $collation );
b49c4318	90
52ce987f	91	my %all_witnesses;
6a222840	92	my @unwitnessed_lemma_nodes;
b49c4318	93	foreach my $app ( @app_entries ) {
910a0a6d	94	my( $line, $num ) = split( /\./, $app->{_id} );
	95	# DEBUG with a short graph
	96	last if $SHORTEND && $line > $SHORTEND;
	97	# DEBUG for problematic entries
	98	my $scrutinize = '';
	99	my $first_line_reading = $base_line_starts[ $line ];
	100	my $too_far = $base_line_starts[ $line+1 ];
	101
	102	my $lemma = $app->{rdg_0};
	103	my $seq = 1;
	104	# Is this the Nth occurrence of this reading in the line?
	105	if( $lemma =~ s/(_)?(\d)$// ) {
	106	$seq = $2;
	107	}
	108	my @lemma_words = split( /\s+/, $lemma );
	109
	110	# Now search for the lemma words within this line.
	111	my $lemma_start = $first_line_reading;
	112	my $lemma_end;
	113	my %seen;
	114	while( $lemma_start ne $too_far ) {
	115	# Loop detection
	116	if( $seen{ $lemma_start->name() } ) {
	117	warn "Detected loop at " . $lemma_start->name() .
	118	", ref $line,$num";
	119	last;
	120	}
	121	$seen{ $lemma_start->name() } = 1;
	122
	123	# Try to match the lemma.
	124	my $unmatch = 0;
	125	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	126	$lemma_words[0] . "...\n"
	127	if "$line.$num" eq $scrutinize;
	128	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	129	# Skip it if we need a match that is not the first.
	130	if( --$seq < 1 ) {
	131	# Now we have to compare the rest of the words here.
	132	if( scalar( @lemma_words ) > 1 ) {
	133	my $next_reading =
	134	$collation->next_reading( $lemma_start );
	135	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	136	printf STDERR "Now matching %s against %s\n",
	137	cmp_str($next_reading), $w
	138	if "$line.$num" eq $scrutinize;
	139	if( $w ne cmp_str($next_reading) ) {
	140	$unmatch = 1;
	141	last;
	142	} else {
	143	$lemma_end = $next_reading;
	144	$next_reading =
	145	$collation->next_reading( $lemma_end );
	146	}
	147	}
	148	} else {
	149	$lemma_end = $lemma_start;
	150	}
	151	} else {
	152	$unmatch = 1;
	153	}
	154	}
	155	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	156	$lemma_end = undef;
	157	$lemma_start = $collation->next_reading( $lemma_start );
158	}
159
160	unless( $lemma_end ) {
161	warn "No match found for @lemma_words at $line.$num";
162	next;
163	}
164
165	# Now we have found the lemma; we will record an 'edit', in
166	# terms of a splice operation, for each subsequent reading.
167	# We also note which witnesses take the given edit.
168
169	my @lemma_set = $collation->reading_sequence( $lemma_start,
170	$lemma_end );
171	my @reading_sets = [ @lemma_set ];
172
173	# For each reading that is not rdg_0, we create the variant
174	# reading nodes, and store the range as an edit operation on
175	# the base text.
176	my $variant_objects;
177	my %pc_seen; # Keep track of mss with explicit post-corr data
178	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
179	my @mss = grep { $app->{$_} eq $k } keys( %$app );
180
181	# Keep track of lemma nodes that don't actually appear in
182	# any MSS; we will want to remove them from the collation.
183	push( @unwitnessed_lemma_nodes, @lemma_set )
184	if !@mss && $k eq 'rdg_0';
185
186	# Keep track of what witnesses we have seen.
187	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
188	# Keep track of which witnesses bear corrected readings here.
189	foreach my $m ( @mss ) {
190	my $base = _is_post_corr( $m );
191	next unless $base;
192	$pc_seen{$base} = 1;
193	}
194	next if $k eq 'rdg_0';
195
196	# Parse the variant into reading tokens.
197	# TODO don't hardcode the reading split operation
198	my @variant = split( /\s+/, $app->{$k} );
199	@variant = () if $app->{$k} eq '/'; # This is an omission.
200
201	my @variant_readings;
202	my $ctr = 0;
203	foreach my $vw ( @variant ) {
204	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
205	my $vwreading = $collation->add_reading( $vwname );
206	$vwreading->text( $vw );
207	push( @variant_readings, $vwreading );
208	}
209
210	$variant_objects->{$k} = { 'mss' => \@mss,
211	'reading' => \@variant_readings,
212	};
213	push( @reading_sets, \@variant_readings );
214	}
215
216	# Now collate and collapse the identical readings within the
217	# collated sets. Modifies the reading sets that were passed.
218	collate_variants( $collation, @reading_sets );
219
220	# Record any stated relationships between the nodes and the lemma.
221	set_relationships( $collation, $app, \@lemma_set, $variant_objects );
222
223	# Now create the splice-edit objects that will be used
224	# to reconstruct each witness.
225
226	foreach my $rkey ( keys %$variant_objects ) {
227	# Object is argument list for splice, so:
228	# offset, length, replacements
229	my $edit_object = [ $lemma_start->name,
230	scalar( @lemma_set ),
231	$variant_objects->{$rkey}->{reading} ];
232	foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
233	# Is this a p.c. entry?
234	my $base = _is_post_corr( $ms );
235	if( $base ) { # this is a post-corr witness
236	my $pc_key = $base . "_post";
237	add_hash_entry( $edits_required, $pc_key, $edit_object );
238	} else { # this is an ante-corr witness
239	my $pc_key = $ms . "_post";
240	add_hash_entry( $edits_required, $ms, $edit_object );
241	unless( $pc_seen{$ms} ) {
242	# If this witness carries no correction, add this
243	# same object to its post-corrected state.
244	add_hash_entry( $edits_required, $pc_key,
245	$edit_object );
246	}
247	}
248	}
249	}
4ca00eca	250	} # Finished going through the apparatus entries
	251
	252	# Now make the witness objects, and create their text sequences
6a222840	253	foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d	254	print STDERR "Creating witness $w\n";
	255	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
	256	my $debug; # = $w eq 'Vb11';
	257	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
	258	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
	259	if exists( $edits_required->{$w."_post"} );
	260
	261	my @repeated = check_for_repeated( @ante_corr_seq );
	262	warn "Repeated elements @repeated in $w a.c."
	263	if @repeated;
	264	@repeated = check_for_repeated( @post_corr_seq );
	265	warn "Repeated elements @repeated in $w p.c."
	266	if @repeated;
	267
	268	# Now save these paths in my witness object
	269	if( @post_corr_seq ) {
	270	$witness_obj->path( \@post_corr_seq );
	271	$witness_obj->uncorrected_path( \@ante_corr_seq );
	272	} else {
	273	$witness_obj->path( \@ante_corr_seq );
	274	}
b49c4318	275	}
e2902068	276
6a222840	277	# Now remove our 'base text' edges, which is to say, the only
1ed3973e	278	# ones we have created so far. Also remove any unwitnessed
1ed3973e	279	# lemma nodes (TODO unless we are treating base as witness)
6a222840	280	foreach ( $collation->paths() ) {
910a0a6d	281	$collation->del_path( $_ );
6a222840	282	}
6a222840	283	foreach( @unwitnessed_lemma_nodes ) {
910a0a6d	284	$collation->del_reading( $_ );
910a0a6d	285	# TODO do we need to delete any relationship paths here?
6a222840	286	}
4ca00eca	287
b15511bf	288	### HACKY HACKY Do some one-off path corrections here.
408449b7	289	require( 'data/boodts/s158.HACK' );
408449b7	290	KUL::HACK::pre_path_hack( $collation );
b15511bf	291
910a0a6d	292	# Now walk paths and calculate positional rank.
7e450e44	293	$collation->make_witness_paths();
408449b7	294	KUL::HACK::post_path_hack( $collation );
910a0a6d	295	# Have to check relationship validity at this point, because before that
	296	# we had no paths.
	297	# foreach my $rel ( $collation->relationships ) {
	298	# next unless $rel->equal_rank;
	299	# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
	300	# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
	301	# $rel->type, $rel->from->name, $rel->to->name );
	302	# }
	303	# }
	304	$collation->calculate_ranks();
15d2d3df	305	}
15d2d3df	306
2ceca8c3	307	=item B<read_base>
2ceca8c3	308
e2902068	309	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	310
e2902068	311	Takes a text file and a (presumed empty) collation object, adds the
	312	words as simple linear readings to the collation, and returns a
	313	list of readings that represent the beginning of lines. This collation
	314	is now the starting point for application of apparatus entries in
	315	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	316
2ceca8c3	317	=cut
b49c4318	318
b49c4318	319	sub read_base {
e2902068	320	my( $base_file, $collation ) = @_;
b49c4318	321
e2902068	322	# This array gives the first reading for each line. We put the
b49c4318	323	# common starting point in line zero.
e2902068	324	my $last_reading = $collation->start();
6a222840	325	$base_text_index{$last_reading->name} = 0;
e2902068	326	my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318	327
b49c4318	328	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
6a222840	329	my $i = 1;
b49c4318	330	while(<BASE>) {
910a0a6d	331	# Make the readings, and connect them up for the base, but
	332	# also save the first reading of each line in an array for the
	333	# purpose.
	334	# TODO use configurable reading separator
	335	chomp;
	336	my @words = split;
	337	my $started = 0;
	338	my $wordref = 0;
	339	my $lineref = scalar @$lineref_array;
	340	last if $SHORTEND && $lineref > $SHORTEND;
	341	foreach my $w ( @words ) {
	342	my $readingref = join( ',', $lineref, ++$wordref );
	343	my $reading = $collation->add_reading( $readingref );
	344	$reading->text( $w );
	345	unless( $started ) {
	346	push( @$lineref_array, $reading );
	347	$started = 1;
	348	}
	349	# Add edge paths in the graph, for easier tracking when
	350	# we start applying corrections. These paths will be
	351	# removed when we're done.
	352	my $path = $collation->add_path( $last_reading, $reading,
	353	$collation->baselabel );
	354	$last_reading = $reading;
	355
	356	# Note an array index for the reading, for later correction splices.
	357	$base_text_index{$readingref} = $i++;
	358	}
b49c4318	359	}
	360	close BASE;
	361	# Ending point for all texts
910a0a6d	362	$collation->add_path( $last_reading, $collation->end, $collation->baselabel );
	363	push( @$lineref_array, $collation->end );
	364	$base_text_index{$collation->end->name} = $i;
b49c4318	365
	366	return( @$lineref_array );
	367	}
	368
15d2d3df	369	sub set_relationships {
3265b0ce	370	my( $collation, $app, $lemma, $variants ) = @_;
15d2d3df	371	foreach my $rkey ( keys %$variants ) {
910a0a6d	372	my $var = $variants->{$rkey}->{'reading'};
	373	my $type = $app->{sprintf( "_%s_type", $rkey )};
	374	my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
	375	my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};
	376
	377	my %rel_options = ();
	378	$rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
	379	$rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
	380
	381	if( $type =~ /^(inv\|tr\|rep)$/i ) {
	382	# Transposition or repetition: look for nodes with the
	383	# same label but different IDs and mark them.
	384	$type = 'repetition' if $type =~ /^rep/i;
	385	$rel_options{'type'} = $type;
	386	$rel_options{'equal_rank'} = undef;
	387	my %labels;
	388	foreach my $r ( @$lemma ) {
	389	$labels{cmp_str( $r )} = $r;
	390	}
	391	foreach my $r( @$var ) {
	392	if( exists $labels{$r->label} &&
	393	$r->name ne $labels{$r->label}->name ) {
	394	if( $type eq 'repetition' ) {
	395	# Repetition
	396	$collation->add_relationship( $r, $labels{$r->label}, \%rel_options );
	397	} else {
	398	# Transposition
	399	$r->set_identical( $labels{$r->label} );
	400	}
	401	}
	402	}
	403	} elsif( $type =~ /^(gr\|sp(el)?)$/i ) {
	404
	405	# Grammar/spelling/lexical: this can be a one-to-one or
	406	# one-to-many mapping. We should think about merging
	407	# readings if it is one-to-many.
	408
	409	$type = 'grammatical' if $type =~ /gr/i;
	410	$type = 'spelling' if $type =~ /sp/i;
	411	$type = 'repetition' if $type =~ /rep/i;
	412	# $type = 'lexical' if $type =~ /lex/i;
	413	$rel_options{'type'} = $type;
	414	$rel_options{'equal_rank'} = 1;
	415	if( @$lemma == @$var ) {
	416	foreach my $i ( 0 .. $#{$lemma} ) {
	417	$collation->add_relationship( $var->[$i], $lemma->[$i],
	418	\%rel_options );
	419	}
	420	} else {
	421	# An uneven many-to-many mapping. Skip for now.
	422	# We really want to make a segment out of whatever we have.
	423	# my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
	424	# my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
	425	# $collation->add_relationship( $varseg, $lemseg, \%rel_options );
	426	if( @$lemma == 1 && @$var == 1 ) {
	427	$collation->add_relationship( $lemma->[0], $var->[0], \%rel_options );
	428	}
	429	}
	430	} elsif( $type !~ /^(add\|om\|lex)$/i ) {
	431	warn "Unrecognized type $type";
	432	}
15d2d3df	433	}
15d2d3df	434	}
910a0a6d	435
15d2d3df	436
15d2d3df	437
4ca00eca	438	sub apply_edits {
b15511bf	439	my( $collation, $edit_sequence, $debug ) = @_;
c78feb69	440	my @lemma_text = $collation->reading_sequence( $collation->start,
910a0a6d	441	$collation->reading( '#END#' ) );
4ca00eca	442	my $drift = 0;
b15511bf	443	foreach my $correction ( @$edit_sequence ) {
910a0a6d	444	my( $lemma_start, $length, $items ) = @$correction;
	445	my $offset = $base_text_index{$lemma_start};
	446	my $realoffset = $offset + $drift;
	447	if( $debug \|\|
	448	$lemma_text[$realoffset]->name ne $lemma_start ) {
	449	my @this_phrase = @lemma_text[$realoffset..$realoffset+$length-1];
	450	my @base_phrase;
	451	my $i = $realoffset;
	452	my $l = $collation->reading( $lemma_start );
	453	while( $i < $realoffset+$length ) {
	454	push( @base_phrase, $l );
	455	$l = $collation->next_reading( $l );
	456	$i++;
	457	}
	458
	459	print STDERR sprintf( "Trying to replace %s (%s) starting at %d " .
	460	"with %s (%s) with drift %d\n",
	461	join( ' ', map {$_->label} @base_phrase ),
	462	join( ' ', map {$_->name} @base_phrase ),
	463	$realoffset,
	464	join( ' ', map {$_->label} @$items ),
	465	join( ' ', map {$_->name} @$items ),
	466	$drift,
	467	) if $debug;
	468
	469	if( $lemma_text[$realoffset]->name ne $lemma_start ) {
	470	warn( sprintf( "Should be replacing %s (%s) with %s (%s) " .
	471	"but %s (%s) is there instead",
	472	join( ' ', map {$_->label} @base_phrase ),
	473	join( ' ', map {$_->name} @base_phrase ),
	474	join( ' ', map {$_->label} @$items ),
	475	join( ' ', map {$_->name} @$items ),
	476	join( ' ', map {$_->label} @this_phrase ),
	477	join( ' ', map {$_->name} @this_phrase ),
	478	) );
	479	# next;
	480	}
	481	}
	482	splice( @lemma_text, $realoffset, $length, @$items );
	483	$drift += @$items - $length;
b49c4318	484	}
b15511bf	485	return @lemma_text;
b49c4318	486	}
910a0a6d	487
4ca00eca	488
e2902068	489	# Helper function. Given a witness sigil, if it is a post-correctione
	490	# sigil,return the base witness. If not, return a false value.
	491	sub _is_post_corr {
	492	my( $sigil ) = @_;
930ff666	493	if( $sigil =~ /^(.?)(\s\(?p\.\s*c\.\)?)$/ ) {
910a0a6d	494	return $1;
e2902068	495	}
	496	return undef;
	497	}
	498
b49c4318	499
2ceca8c3	500	=back
	501
	502	=head1 LICENSE
	503
	504	This package is free software and is provided "as is" without express
	505	or implied warranty. You can redistribute it and/or modify it under
	506	the same terms as Perl itself.
	507
	508	=head1 AUTHOR
	509
	510	Tara L Andrews, aurum@cpan.org
	511
	512	=cut
	513
b49c4318	514	1;