[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;
use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText;
Text::Tradition::Parser::BaseText::merge_base(
    $collation, { 'base' => 'reference.txt', 'input' => 'KUL' }, @apparatus_entries );

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $tradition, $opts );

Takes an initialized tradition object and a hashref of options, which must include:
- 'base' - the base text referenced by the variants
- 'input' - the format of the variant list, i.e. the name of the Text::Tradition::Parser::* module that reads it
- 'data' - the variants, in the given format.

=cut

# Entry point for base-text parsing.  Loads the format-specific parser
# module named by $opts->{'input'}, asks it for the list of apparatus
# entries, and merges those onto the base text of the tradition's
# collation.
#   $tradition - object whose ->collation is handed to merge_base
#   $opts      - hashref of options; 'input' is read here, and the whole
#                hashref is passed on to the format module and merge_base
# Returns whatever merge_base returns.  Dies if no input format is given,
# if the format module cannot be loaded, or if it has no read() function.
sub parse {
    my( $tradition, $opts ) = @_;

    die "No input format specified for the base text parser"
        unless defined $opts->{'input'};

    my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
    load( $format_mod );

    # Fail with a readable message rather than "Can't use an undefined
    # value as a subroutine reference" if the module lacks a reader.
    my $reader = $format_mod->can('read')
        or die "$format_mod does not provide a read() function";

    # TODO Handle a string someday if we ever have a format other than KUL
    my @apparatus_entries = $reader->( $opts );
    return merge_base( $tradition->collation, $opts, @apparatus_entries );
}

=item B<merge_base>

merge_base( $collation, $opts, @apparatus_entries )

Takes a newly-initialized collation object, the hashref of parser
options (whose 'base' entry names the text file containing the reference
text), and a list of variants (apparatus entries).  Adds the base text
to the collation, and joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed

# Maps a reading ID to its position in the base text sequence.  Filled in
# by read_base and consulted by apply_edits to find splice offsets.
my %base_text_index;
# Edit operations collected by merge_base, keyed by witness sigil; the
# post-correction state of a witness is kept under the sigil plus "_post".
my $edits_required = {};

# edits_required -> wit -> [ [ start reading ID, length, [ replacement readings ] ], ... ]
# (assumes add_hash_entry appends each edit to a list under the key - TODO confirm)

# Join a list of apparatus entries onto the base text.
#   $collation   - collation object that receives readings, paths and
#                  relationships
#   $opts        - options hashref; 'base' (the base text file) and
#                  'input' (the apparatus format) are read here
#   @app_entries - apparatus entries, one hashref each, in the form
#                  described in the POD above
# For every entry the lemma is located in the referenced base line, the
# variant readings are created and collated against it, and a splice-style
# edit is recorded per witness.  The edits are then applied to the base
# text to build each witness path, the temporary base-text paths are
# removed, and ranks are calculated.
sub merge_base {
    my( $collation, $opts, @app_entries ) = @_;

    # The reading index and the edit table live at file scope.  Clear
    # them so that a second call in the same process does not inherit
    # the readings and witness edits of an earlier collation.
    %base_text_index = ();
    $edits_required = {};

    my @base_line_starts = read_base( $opts->{'base'}, $collation );

    my %all_witnesses;
    foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
        last if $SHORTEND && $line > $SHORTEND;
        # DEBUG for problematic entries
        my $scrutinize = '';
        my $first_line_reading = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];
        # An entry that points at a line the base text does not have
        # cannot be matched; skip it the same way an unmatched lemma is
        # skipped, instead of crashing on an undefined reading below.
        unless( defined $first_line_reading && defined $too_far ) {
            warn "No base text line found for reference $line.$num";
            next;
        }
        
        my $lemma = $app->{rdg_0};
        my $seq = 1; 
        # Is this the Nth occurrence of this reading in the line?
        if( $lemma =~ s/(_)?(\d)$// ) {
            $seq = $2;
        }
        my @lemma_words = split( /\s+/, $lemma );
        
        # Now search for the lemma words within this line.
        my $lemma_start = $first_line_reading;
        my $lemma_end;
        my %seen;
        while( defined( $lemma_start ) && $lemma_start ne $too_far ) {
            # Loop detection
            if( $seen{ $lemma_start->id() } ) {
                warn "Detected loop at " . $lemma_start->id() . 
                    ", ref $line,$num";
                last;
            }
            $seen{ $lemma_start->id() } = 1;
            
            # Try to match the lemma.
            my $unmatch = 0;
            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
                $lemma_words[0] . "...\n"
                if "$line.$num" eq $scrutinize;
            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
                # Skip it if we need a match that is not the first.
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
                        my $next_reading = 
                            $collation->next_reading( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n", 
                                    cmp_str($next_reading), $w
                                if "$line.$num" eq $scrutinize;
                            if( $w ne cmp_str($next_reading) ) {
                                $unmatch = 1;
                                last;
                            } else {
                                $lemma_end = $next_reading;
                                $next_reading = 
                                    $collation->next_reading( $lemma_end );
                            }
                        }
                    } else {
                        $lemma_end = $lemma_start;
                    }
                } else {
                    $unmatch = 1;
                }
            }
            # Stop as soon as the whole lemma matched; otherwise discard
            # any partial match and try from the next base reading.
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
            $lemma_start = $collation->next_reading( $lemma_start );
        }
        
        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
        }
        
        # Now we have found the lemma; we will record an 'edit', in
        # terms of a splice operation, for each subsequent reading.
        # We also note which witnesses take the given edit.

        my @lemma_set = $collation->reading_sequence( $lemma_start, 
                                                      $lemma_end );
        my @reading_sets = [ @lemma_set ];
        
        # For each reading that is not rdg_0, we create the variant
        # reading nodes, and store the range as an edit operation on
        # the base text.
        my $variant_objects = {};
        my %pc_seen; # Keep track of mss with explicit post-corr data
        foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
            my @mss = grep { $app->{$_} eq $k } keys( %$app );

            # Keep track of what witnesses we have seen.
            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
            # Keep track of which witnesses bear corrected readings here.
            foreach my $m ( @mss ) {
                my $base = _is_post_corr( $m );
                next unless $base;
                $pc_seen{$base} = 1;
            }
            next if $k eq 'rdg_0';

            # Parse the variant into reading tokens.
            # TODO don't hardcode the reading split operation
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            
            my @variant_readings;
            my $ctr = 0;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr"; $ctr++;
                my $vwreading = $collation->add_reading( {
                    'id' => $vwname,
                    'text' => $vw } );
                push( @variant_readings, $vwreading );
            }

            $variant_objects->{$k} = { 'mss' => \@mss,
                                       'reading' => \@variant_readings,
            };
            push( @reading_sets, \@variant_readings );
        }

        # Now collate and collapse the identical readings within the
        # collated sets.  Modifies the reading sets that were passed.
        collate_variants( $collation, @reading_sets );

        # Record any stated relationships between the nodes and the lemma.
        set_relationships( $collation, $app, \@lemma_set, $variant_objects );

        # Now create the splice-edit objects that will be used
        # to reconstruct each witness.

        foreach my $rkey ( keys %$variant_objects ) {
            # Object is argument list for splice, so:
            # offset, length, replacements
            my $edit_object = [ $lemma_start->id,
                                scalar( @lemma_set ),
                                $variant_objects->{$rkey}->{reading} ];
            foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
                # Is this a p.c. entry?
                my $base = _is_post_corr( $ms );
                if( $base ) { # this is a post-corr witness
                    my $pc_key = $base . "_post";
                    add_hash_entry( $edits_required, $pc_key, $edit_object );
                } else { # this is an ante-corr witness
                    my $pc_key = $ms . "_post";
                    add_hash_entry( $edits_required, $ms, $edit_object );
                    unless( $pc_seen{$ms} ) {
                        # If this witness carries no correction, add this 
                        # same object to its post-corrected state.
                        add_hash_entry( $edits_required, $pc_key, 
                                         $edit_object );
                    }
                }
            }
        }
    } # Finished going through the apparatus entries

    # Now make the witness objects, and create their text sequences
    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
        print STDERR "Creating witness $w\n";
        my $witness_obj = $collation->tradition->add_witness( sigil => $w );
        my $debug; #  = $w eq 'Vb11';
        my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
        # Declare first, assign conditionally: "my ... if COND" has
        # undefined behaviour and can leak the previous iteration's value.
        my @post_corr_seq;
        @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
            if exists( $edits_required->{$w."_post"} );

        my @repeated = check_for_repeated( @ante_corr_seq );
        warn "Repeated elements @repeated in $w a.c."
            if @repeated;
        @repeated = check_for_repeated( @post_corr_seq );
        warn "Repeated elements @repeated in $w p.c."
            if @repeated;

        # Now save these paths in my witness object
        if( @post_corr_seq ) {
            $witness_obj->path( \@post_corr_seq );
            $witness_obj->uncorrected_path( \@ante_corr_seq );
        } else {
            $witness_obj->path( \@ante_corr_seq );
        }
    }

    # Now remove our 'base text' edges, which is to say, the only
    # ones we have created so far.  Also remove any unwitnessed
    # lemma nodes (TODO unless we are treating base as witness)
    foreach ( $collation->paths() ) {
        $collation->del_path( $_, $collation->baselabel );
    }

    ### HACKY HACKY Do some one-off path corrections here.
    if( $opts->{'input'} eq 'KUL' ) {
        require 'data/boodts/s158.HACK';
        KUL::HACK::pre_path_hack( $collation );
    }

    # Now walk paths and calculate positional rank.
    $collation->make_witness_paths();
    # Now delete any orphaned readings.
    foreach my $r ( $collation->sequence->isolated_vertices ) {
        print STDERR "Deleting unconnected reading $r / " . 
            $collation->reading( $r )->text . "\n";
        $collation->del_reading( $r );
    }

    KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
    # Have to check relationship validity at this point, because before that
    # we had no paths.
#     foreach my $rel ( $collation->relationships ) {
#         next unless $rel->equal_rank;
#         unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
#             warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
#                             $rel->type, $rel->from->id, $rel->to->id );
#         }
#     }
    $collation->calculate_ranks();
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

# Read the base text file into the collation as a simple linear chain of
# readings joined by base-label paths.
#   $base_file - path of the text file holding the reference text
#   $collation - collation object (presumed empty) that receives the
#                readings and paths
# Returns a list whose element N is the first reading of line N of the
# base text that contains words; element 0 is the collation start, and
# the collation end is appended after the last line.  Also records each
# reading's position in the file-level %base_text_index.
# Dies if the file cannot be opened.
sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start;
    $base_text_index{$last_reading->id} = 0;
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode, and the handle cannot clash with a global.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";
    my $i = 1;
    # Read into a lexical so the caller's $_ is left alone.
    while( my $base_line = <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp $base_line;
        my @words = split( ' ', $base_line );
        my $started = 0;
        my $wordref = 0;
        # Lines without words add nothing to the array, so they do not
        # consume a line number.
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( { id => $readingref, text => $w } );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading, 
                                  $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close $base_fh;
    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->id} = $i;

    return( @$lineref_array );
}

# Record the relationships that an apparatus entry declares between its
# lemma and its variant readings.
#   $collation - collation object; add_relationship is called on it
#   $app       - the apparatus entry; the keys "_<rdg>_type",
#                "_<rdg>_non_corr" and "_<rdg>_non_indep" are read from it
#   $lemma     - arrayref of the lemma readings
#   $variants  - hashref keyed by reading key; each value holds the
#                variant readings under 'reading'
# Types inv/tr/rep link variant readings to lemma readings with the same
# label; types gr/sp/spel pair the readings off one-to-one when both
# sides have the same length.  Types add/om/lex are accepted and ignored;
# anything else produces an "Unrecognized type" warning.
sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};

        # A reading with no declared type is reported once as an
        # unrecognized (empty) type, rather than also raising an
        # uninitialized-value warning on every pattern match below.
        $type = '' unless defined $type;
        
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep if $nonindep && $nonindep =~ /^\d$/;
        
        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            # NOTE(review): the table is keyed by cmp_str() of the lemma
            # reading but looked up by the variant's ->text; if cmp_str
            # normalizes the text these may not line up - confirm.
            foreach my $r( @$var ) {
                if( exists $labels{$r->text} &&
                    $r->id ne $labels{$r->text}->id ) {
                    if( $type eq 'repetition' ) {
                        # Repetition
                        $collation->add_relationship( $r, $labels{$r->text}, \%rel_options );
                    } else {
                        # Transposition
                        $r->set_identical( $labels{$r->text} );
                    }
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling/lexical: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            $type = 'grammatical' if $type =~ /gr/i;
            $type = 'spelling' if $type =~ /sp/i;
            # $type = 'lexical' if $type =~ /lex/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                foreach my $i ( 0 .. $#{$lemma} ) {
                    $collation->add_relationship( $var->[$i], $lemma->[$i],
                        \%rel_options );
                } 
            }
            # Otherwise this is an uneven many-to-many mapping.  Skip for now.
            # We really want to make a segment out of whatever we have.
            # my $lemseg = @$lemma > 1 ? $collation->add_segment( @$lemma ) : $lemma->[0];
            # my $varseg = @$var > 1 ? $collation->add_segment( @$var ) : $var->[0];
            # $collation->add_relationship( $varseg, $lemseg, \%rel_options );
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
        

# Build one witness's reading sequence by applying its recorded edits to
# the base text.
#   $collation     - collation object; supplies the base sequence from
#                    start to end, and reading lookups for diagnostics
#   $edit_sequence - arrayref of edits, each
#                    [ start reading ID, length, arrayref of replacements ]
#   $debug         - when true, each edit is reported on STDERR
# Every edit is spliced in at the indexed position of its start reading,
# shifted by the accumulated length change of the edits applied before
# it.  A warning is issued when the reading found at that position is not
# the expected start reading.  Returns the resulting list of readings.
sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;
    my @witness_text = $collation->reading_sequence( 
        $collation->start, $collation->end );
    my $drift = 0;
    foreach my $edit ( @$edit_sequence ) {
        my( $start_id, $span, $replacements ) = @$edit;
        my $base_position = $base_text_index{$start_id};
        my $position = $base_position + $drift;
        if( $debug ||
            $witness_text[$position]->id ne $start_id ) {
            # What is currently at the splice position...
            my @found = @witness_text[ $position .. $position + $span - 1 ];
            # ...and what the base text holds from the start reading on.
            my @expected;
            my $cursor = $collation->reading( $start_id );
            for my $step ( 1 .. $span ) {
                push( @expected, $cursor );
                $cursor = $collation->next_reading( $cursor );
            }

            my $expected_text = join( ' ', map { $_->text } @expected );
            my $expected_ids  = join( ' ', map { $_->id } @expected );
            my $new_text      = join( ' ', map { $_->text } @$replacements );
            my $new_ids       = join( ' ', map { $_->id } @$replacements );

            if( $debug ) {
                printf STDERR "Trying to replace %s (%s) starting at %d with %s (%s) with drift %d\n",
                    $expected_text, $expected_ids, $position,
                    $new_text, $new_ids, $drift;
            }

            if( $witness_text[$position]->id ne $start_id ) {
                my $found_text = join( ' ', map { $_->text } @found );
                my $found_ids  = join( ' ', map { $_->id } @found );
                warn( sprintf( "Should be replacing %s (%s) with %s (%s) but %s (%s) is there instead",
                               $expected_text, $expected_ids,
                               $new_text, $new_ids,
                               $found_text, $found_ids ) );
                # next;
            }
        }
        splice( @witness_text, $position, $span, @$replacements );
        $drift += @$replacements - $span;
    }
    return @witness_text;
}
        

# Helper function.  Given a witness sigil, return the base witness sigil
# if the sigil carries a post-correction marker ("p.c.", optionally in
# parentheses and with optional whitespace); otherwise return undef.
sub _is_post_corr {
    my $sigil = shift;
    # In list context the match yields its captures, or nothing at all
    # when it fails, which leaves the base sigil undefined.
    my( $base_sigil ) = $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/;
    return $base_sigil;
}


=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
910a0a6d	6	use Text::Tradition::Parser::Util qw( collate_variants cmp_str check_for_repeated add_hash_entry );
b49c4318	7
2ceca8c3	8	=head1 NAME
	9
	10	Text::Tradition::Parser::BaseText
	11
	12	=head1 SYNOPSIS
	13
	14	use Text::Tradition::Parser::BaseText qw( merge_base );
	15	merge_base( $graph, 'reference.txt', @apparatus_entries )
	16
	17	=head1 DESCRIPTION
	18
	19	For an overview of the package, see the documentation for the
408449b7	20	Text::Tradition module.
2ceca8c3	21
	22	This module is meant for use with certain of the other Parser classes
	23	- whenever a list of variants is given with reference to a base text,
	24	these must be joined into a single collation. The parser should
	25	therefore make a list of variants and their locations, and BaseText
	26	will join those listed variants onto the reference text.
	27
	28	=head1 SUBROUTINES
	29
	30	=over
	31
52ce987f	32	=item B<parse>
52ce987f	33
408449b7	34	parse( $graph, $opts );
52ce987f	35
408449b7	36	Takes an initialized graph and a hashref of options, which must include:
52ce987f	37	- 'base' - the base text referenced by the variants
	38	- 'format' - the format of the variant list
	39	- 'data' - the variants, in the given format.
	40
	41	=cut
	42
	43	sub parse {
dfc37e38	44	my( $tradition, $opts ) = @_;
52ce987f	45
dfc37e38	46	my $format_mod = 'Text::Tradition::Parser::' . $opts->{'input'};
52ce987f	47	load( $format_mod );
408449b7	48	# TODO Handle a string someday if we ever have a format other than KUL
408449b7	49	my @apparatus_entries = $format_mod->can('read')->( $opts );
b0b4421a	50	merge_base( $tradition->collation, $opts, @apparatus_entries );
52ce987f	51	}
52ce987f	52
2ceca8c3	53	=item B<merge_base>
	54
	55	merge_base( $graph, 'reference.txt', @apparatus_entries )
	56
	57	Takes three arguments: a newly-initialized Text::Tradition::Graph
	58	object, a text file containing the reference text, and a list of
	59	variants (apparatus entries). Adds the base text to the graph, and
	60	joins the variants to that.
	61
	62	The list of variants is an array of hash references; each hash takes
	63	the form
	64	{ '_id' => line reference,
	65	'rdg_0' => lemma reading,
	66	'rdg_1' => first variant,
	67	... # and so on until all distinct readings are listed
	68	'WitnessA' => 'rdg_0',
	69	'WitnessB' => 'rdg_1',
	70	... # and so on until all witnesses are listed with their readings
	71	}
	72
	73	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	74	begin with an underscore is assumed to be a witness name. Any 'meta'
	75	information to be passed must be passed in a key with a leading
	76	underscore in its name.
	77
	78	=cut
	79
b15511bf	80	my $SHORTEND = ''; # Debug var - set this to limit the number of lines parsed
4ca00eca	81
4ca00eca	82	my %base_text_index;
6a222840	83	my $edits_required = {};
4ca00eca	84
4ca00eca	85	# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666	86
b49c4318	87	sub merge_base {
b0b4421a	88	my( $collation, $opts, @app_entries ) = @_;
b0b4421a	89	my @base_line_starts = read_base( $opts->{'base'}, $collation );
b49c4318	90
52ce987f	91	my %all_witnesses;
b49c4318	92	foreach my $app ( @app_entries ) {
910a0a6d	93	my( $line, $num ) = split( /\./, $app->{_id} );
	94	# DEBUG with a short graph
	95	last if $SHORTEND && $line > $SHORTEND;
	96	# DEBUG for problematic entries
	97	my $scrutinize = '';
	98	my $first_line_reading = $base_line_starts[ $line ];
	99	my $too_far = $base_line_starts[ $line+1 ];
	100
	101	my $lemma = $app->{rdg_0};
	102	my $seq = 1;
	103	# Is this the Nth occurrence of this reading in the line?
	104	if( $lemma =~ s/(_)?(\d)$// ) {
	105	$seq = $2;
	106	}
	107	my @lemma_words = split( /\s+/, $lemma );
	108
	109	# Now search for the lemma words within this line.
	110	my $lemma_start = $first_line_reading;
	111	my $lemma_end;
	112	my %seen;
	113	while( $lemma_start ne $too_far ) {
	114	# Loop detection
49d4f2ac	115	if( $seen{ $lemma_start->id() } ) {
49d4f2ac	116	warn "Detected loop at " . $lemma_start->id() .
910a0a6d	117	", ref $line,$num";
	118	last;
	119	}
49d4f2ac	120	$seen{ $lemma_start->id() } = 1;
910a0a6d	121
	122	# Try to match the lemma.
	123	my $unmatch = 0;
	124	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	125	$lemma_words[0] . "...\n"
	126	if "$line.$num" eq $scrutinize;
	127	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	128	# Skip it if we need a match that is not the first.
	129	if( --$seq < 1 ) {
	130	# Now we have to compare the rest of the words here.
	131	if( scalar( @lemma_words ) > 1 ) {
	132	my $next_reading =
	133	$collation->next_reading( $lemma_start );
	134	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	135	printf STDERR "Now matching %s against %s\n",
	136	cmp_str($next_reading), $w
	137	if "$line.$num" eq $scrutinize;
	138	if( $w ne cmp_str($next_reading) ) {
	139	$unmatch = 1;
	140	last;
	141	} else {
	142	$lemma_end = $next_reading;
	143	$next_reading =
	144	$collation->next_reading( $lemma_end );
	145	}
	146	}
	147	} else {
	148	$lemma_end = $lemma_start;
	149	}
	150	} else {
	151	$unmatch = 1;
	152	}
	153	}
	154	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	155	$lemma_end = undef;
	156	$lemma_start = $collation->next_reading( $lemma_start );
	157	}
	158
	159	unless( $lemma_end ) {
	160	warn "No match found for @lemma_words at $line.$num";
	161	next;
	162	}
	163
	164	# Now we have found the lemma; we will record an 'edit', in
	165	# terms of a splice operation, for each subsequent reading.
	166	# We also note which witnesses take the given edit.
	167
	168	my @lemma_set = $collation->reading_sequence( $lemma_start,
	169	$lemma_end );
	170	my @reading_sets = [ @lemma_set ];
b0b4421a	171
910a0a6d	172	# For each reading that is not rdg_0, we create the variant
	173	# reading nodes, and store the range as an edit operation on
	174	# the base text.
	175	my $variant_objects;
	176	my %pc_seen; # Keep track of mss with explicit post-corr data
	177	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	178	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	179
910a0a6d	180	# Keep track of what witnesses we have seen.
	181	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	182	# Keep track of which witnesses bear corrected readings here.
	183	foreach my $m ( @mss ) {
	184	my $base = _is_post_corr( $m );
	185	next unless $base;
	186	$pc_seen{$base} = 1;
	187	}
	188	next if $k eq 'rdg_0';
	189
	190	# Parse the variant into reading tokens.
	191	# TODO don't hardcode the reading split operation
	192	my @variant = split( /\s+/, $app->{$k} );
	193	@variant = () if $app->{$k} eq '/'; # This is an omission.
	194
	195	my @variant_readings;
	196	my $ctr = 0;
	197	foreach my $vw ( @variant ) {
	198	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
49d4f2ac	199	my $vwreading = $collation->add_reading( {
	200	'id' => $vwname,
	201	'text' => $vw } );
910a0a6d	202	push( @variant_readings, $vwreading );
	203	}
	204
	205	$variant_objects->{$k} = { 'mss' => \@mss,
	206	'reading' => \@variant_readings,
	207	};
	208	push( @reading_sets, \@variant_readings );
	209	}
	210
	211	# Now collate and collapse the identical readings within the
	212	# collated sets. Modifies the reading sets that were passed.
	213	collate_variants( $collation, @reading_sets );
	214
	215	# Record any stated relationships between the nodes and the lemma.
	216	set_relationships( $collation, $app, \@lemma_set, $variant_objects );
	217
	218	# Now create the splice-edit objects that will be used
	219	# to reconstruct each witness.
	220
	221	foreach my $rkey ( keys %$variant_objects ) {
	222	# Object is argument list for splice, so:
	223	# offset, length, replacements
49d4f2ac	224	my $edit_object = [ $lemma_start->id,
910a0a6d	225	scalar( @lemma_set ),
	226	$variant_objects->{$rkey}->{reading} ];
	227	foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
	228	# Is this a p.c. entry?
	229	my $base = _is_post_corr( $ms );
	230	if( $base ) { # this is a post-corr witness
	231	my $pc_key = $base . "_post";
	232	add_hash_entry( $edits_required, $pc_key, $edit_object );
	233	} else { # this is an ante-corr witness
	234	my $pc_key = $ms . "_post";
	235	add_hash_entry( $edits_required, $ms, $edit_object );
	236	unless( $pc_seen{$ms} ) {
	237	# If this witness carries no correction, add this
	238	# same object to its post-corrected state.
	239	add_hash_entry( $edits_required, $pc_key,
	240	$edit_object );
	241	}
	242	}
	243	}
	244	}
4ca00eca	245	} # Finished going through the apparatus entries
	246
	247	# Now make the witness objects, and create their text sequences
6a222840	248	foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
910a0a6d	249	print STDERR "Creating witness $w\n";
	250	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
	251	my $debug; # = $w eq 'Vb11';
	252	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w}, $debug );
	253	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"}, $debug )
	254	if exists( $edits_required->{$w."_post"} );
	255
	256	my @repeated = check_for_repeated( @ante_corr_seq );
	257	warn "Repeated elements @repeated in $w a.c."
	258	if @repeated;
	259	@repeated = check_for_repeated( @post_corr_seq );
	260	warn "Repeated elements @repeated in $w p.c."
	261	if @repeated;
	262
	263	# Now save these paths in my witness object
	264	if( @post_corr_seq ) {
	265	$witness_obj->path( \@post_corr_seq );
	266	$witness_obj->uncorrected_path( \@ante_corr_seq );
	267	} else {
	268	$witness_obj->path( \@ante_corr_seq );
	269	}
b49c4318	270	}
e2902068	271
6a222840	272	# Now remove our 'base text' edges, which is to say, the only
1ed3973e	273	# ones we have created so far. Also remove any unwitnessed
1ed3973e	274	# lemma nodes (TODO unless we are treating base as witness)
6a222840	275	foreach ( $collation->paths() ) {
49d4f2ac	276	$collation->del_path( $_, $collation->baselabel );
6a222840	277	}
4ca00eca	278
b15511bf	279	### HACKY HACKY Do some one-off path corrections here.
b0b4421a	280	if( $opts->{'input'} eq 'KUL' ) {
	281	require 'data/boodts/s158.HACK';
	282	KUL::HACK::pre_path_hack( $collation );
	283	}
	284
910a0a6d	285	# Now walk paths and calculate positional rank.
7e450e44	286	$collation->make_witness_paths();
49d4f2ac	287	# Now delete any orphaned readings.
	288	foreach my $r ( $collation->sequence->isolated_vertices ) {
	289	print STDERR "Deleting unconnected reading $r / " .
	290	$collation->reading( $r )->text . "\n";
	291	$collation->del_reading( $r );
	292	}
	293
b0b4421a	294	KUL::HACK::post_path_hack( $collation ) if $opts->{'input'} eq 'KUL';
910a0a6d	295	# Have to check relationship validity at this point, because before that
	296	# we had no paths.
	297	# foreach my $rel ( $collation->relationships ) {
	298	# next unless $rel->equal_rank;
	299	# unless( Text::Tradition::Collation::relationship_valid( $rel->from, $rel->to ) ) {
	300	# warn sprintf( "Relationship type %s between %s and %s is invalid, deleting",
49d4f2ac	301	# $rel->type, $rel->from->id, $rel->to->id );
910a0a6d	302	# }
910a0a6d	303	# }
861c3e27	304	$collation->calculate_ranks();
15d2d3df	305	}
15d2d3df	306
2ceca8c3	307	=item B<read_base>
2ceca8c3	308
e2902068	309	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	310
e2902068	311	Takes a text file and a (presumed empty) collation object, adds the
	312	words as simple linear readings to the collation, and returns a
	313	list of readings that represent the beginning of lines. This collation
	314	is now the starting point for application of apparatus entries in
	315	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	316
2ceca8c3	317	=cut
b49c4318	318
# Read the base text from $base_file and add it to $collation as a simple
# linear chain of readings, one reading per whitespace-separated word.
#
# Arguments:
#   $base_file  - path of the plain-text base (reference) file
#   $collation  - collation object that receives the readings and paths
#
# Side effects:
#   - Each word becomes a reading whose ID is "<line>,<word>", both counted
#     from 1; lines that contain no words do not consume a line number.
#   - Consecutive readings (start -> words -> end) are joined by paths
#     labelled $collation->baselabel.
#   - %base_text_index (declared outside this sub) maps every reading ID to
#     its position in the base sequence: 0 for the start reading, 1..n for
#     the words, n+1 for the end reading.
#   - If $SHORTEND (declared outside this sub) is true, reading stops as
#     soon as the next line number would exceed it.
#
# Returns a list: element 0 is the start reading, element N is the first
# reading of line N, and the final element is the end reading.
# Dies if the file cannot be opened.
sub read_base {
    my( $base_file, $collation ) = @_;

    # Slot zero of the line array holds the common starting point, so that
    # real lines are numbered from 1.
    my $last_reading = $collation->start;
    $base_text_index{$last_reading->id} = 0;
    my $lineref_array = [ $last_reading ];

    # Three-argument open with a lexical handle: the filename can never be
    # interpreted as a mode or a pipe, and the handle is scoped to this sub.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";
    my $i = 1;
    while( my $line = <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in an array for the
        # purpose.
        # TODO use configurable reading separator
        chomp $line;
        my @words = split( ' ', $line );
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
        last if $SHORTEND && $lineref > $SHORTEND;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading(
                { id => $readingref, text => $w } );
            unless( $started ) {
                push( @$lineref_array, $reading );
                $started = 1;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading,
                                  $collation->baselabel );
            $last_reading = $reading;

            # Note an array index for the reading, for later correction
            # splices.
            $base_text_index{$readingref} = $i++;
        }
    }
    close $base_fh;

    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end,
                          $collation->baselabel );
    push( @$lineref_array, $collation->end );
    $base_text_index{$collation->end->id} = $i;

    return( @$lineref_array );
}
	367
# Record the relationships between the lemma readings and each variant of
# a single apparatus entry.
#
# Arguments:
#   $collation - collation object; add_relationship is called on it
#   $app       - apparatus entry hashref.  For each variant key K the keys
#                "_K_type", "_K_non_corr" and "_K_non_indep" are consulted.
#   $lemma     - arrayref of the lemma readings
#   $variants  - hashref of K => { reading => arrayref of variant readings }
#
# Behaviour by type (matched case-insensitively):
#   inv / tr    - variant readings whose normalised text equals that of a
#                 different lemma reading are marked identical to it
#   rep         - as above, but a 'repetition' relationship is added instead
#   gr / sp(el) - when lemma and variant have the same number of readings,
#                 a 'grammatical' / 'spelling' relationship is added
#                 pairwise; uneven mappings are skipped for now
#   add/om/lex  - nothing is recorded
#   other       - a warning is issued
sub set_relationships {
    my( $collation, $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
        my $var = $variants->{$rkey}->{'reading'};
        my $type = $app->{sprintf( "_%s_type", $rkey )};
        my $noncorr = $app->{sprintf( "_%s_non_corr", $rkey )};
        my $nonindep = $app->{sprintf( "_%s_non_indep", $rkey )};

        # Only single-digit, true values are passed on as options.
        my %rel_options = ();
        $rel_options{'non_correctable'} = $noncorr
            if $noncorr && $noncorr =~ /^\d$/;
        $rel_options{'non_indep'} = $nonindep
            if $nonindep && $nonindep =~ /^\d$/;

        if( $type =~ /^(inv|tr|rep)$/i ) {
            # Transposition or repetition: look for nodes with the
            # same label but different IDs and mark them.
            $type = 'repetition' if $type =~ /^rep/i;
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = undef;
            my %labels;
            foreach my $r ( @$lemma ) {
                $labels{cmp_str( $r )} = $r;
            }
            foreach my $r ( @$var ) {
                # Look the variant up under the same cmp_str key that was
                # used to store the lemma readings; the raw text would not
                # match a key that cmp_str had altered.
                my $key = cmp_str( $r );
                next unless exists $labels{$key};
                my $match = $labels{$key};
                next if $r->id eq $match->id;
                if( $type eq 'repetition' ) {
                    # Repetition
                    $collation->add_relationship( $r, $match, \%rel_options );
                } else {
                    # Transposition
                    $r->set_identical( $match );
                }
            }
        } elsif( $type =~ /^(gr|sp(el)?)$/i ) {

            # Grammar/spelling: this can be a one-to-one or
            # one-to-many mapping.  We should think about merging
            # readings if it is one-to-many.

            $type = $type =~ /^gr/i ? 'grammatical' : 'spelling';
            $rel_options{'type'} = $type;
            $rel_options{'equal_rank'} = 1;
            if( @$lemma == @$var ) {
                foreach my $i ( 0 .. $#{$lemma} ) {
                    $collation->add_relationship( $var->[$i], $lemma->[$i],
                        \%rel_options );
                }
            }
            # Otherwise this is an uneven many-to-many mapping.  Skip for
            # now.  TODO: make a segment out of each side and relate the
            # two segments to each other.
        } elsif( $type !~ /^(add|om|lex)$/i ) {
            warn "Unrecognized type $type";
        }
    }
}
910a0a6d	434
15d2d3df	435
15d2d3df	436
# Apply a sequence of corrections to the base reading sequence.
#
# Arguments:
#   $collation     - collation object supplying the current reading sequence
#   $edit_sequence - arrayref of corrections, each of the form
#                    [ $lemma_start_id, $length, \@replacement_readings ]
#   $debug         - when true, describe every replacement on STDERR
#
# The position of each correction is taken from %base_text_index (declared
# outside this sub) and adjusted by the accumulated drift, i.e. the net
# change in length caused by the corrections already applied.  If the
# reading found at that position is not the expected lemma start a warning
# is issued, but the splice is still carried out.
#
# Returns the resulting list of readings.
sub apply_edits {
    my( $collation, $edit_sequence, $debug ) = @_;

    my @lemma_text = $collation->reading_sequence(
        $collation->start, $collation->end );

    # Small formatters for the diagnostic messages below.
    my $texts_of = sub { return join( ' ', map { $_->text } @_ ) };
    my $ids_of   = sub { return join( ' ', map { $_->id } @_ ) };

    my $drift = 0;
    foreach my $correction ( @$edit_sequence ) {
        my( $lemma_start, $length, $items ) = @$correction;
        my $offset = $base_text_index{$lemma_start};
        my $realoffset = $offset + $drift;

        if( $debug || $lemma_text[$realoffset]->id ne $lemma_start ) {
            # What is actually sitting at the target position...
            my $last_index = $realoffset + $length - 1;
            my @this_phrase = @lemma_text[ $realoffset .. $last_index ];

            # ...and what the base text says should be there.
            my @base_phrase;
            my $base_reading = $collation->reading( $lemma_start );
            foreach my $step ( 1 .. $length ) {
                push( @base_phrase, $base_reading );
                $base_reading = $collation->next_reading( $base_reading );
            }

            if( $debug ) {
                printf STDERR "Trying to replace %s (%s) starting at %d " .
                    "with %s (%s) with drift %d\n",
                    $texts_of->( @base_phrase ),
                    $ids_of->( @base_phrase ),
                    $realoffset,
                    $texts_of->( @$items ),
                    $ids_of->( @$items ),
                    $drift;
            }

            if( $lemma_text[$realoffset]->id ne $lemma_start ) {
                # Complain, but carry on with the splice regardless.
                my $complaint = sprintf(
                    "Should be replacing %s (%s) with %s (%s) " .
                    "but %s (%s) is there instead",
                    $texts_of->( @base_phrase ),
                    $ids_of->( @base_phrase ),
                    $texts_of->( @$items ),
                    $ids_of->( @$items ),
                    $texts_of->( @this_phrase ),
                    $ids_of->( @this_phrase ),
                );
                warn( $complaint );
            }
        }

        splice( @lemma_text, $realoffset, $length, @$items );
        $drift += @$items - $length;
    }
    return @lemma_text;
}
910a0a6d	486
4ca00eca	487
# Helper function.  Given a witness sigil, if it is a post-correctionem
# sigil (the base sigil followed by whitespace and "p.c.", optionally
# parenthesised and optionally with whitespace between "p." and "c."),
# return the base witness sigil.  If not, return undef.
sub _is_post_corr {
    my( $sigil ) = @_;
    # The base sigil may be any length (e.g. "Ba96"), so capture lazily up
    # to the trailing "p.c." marker rather than a single character.
    if( $sigil =~ /^(.*?)(\s\(?p\.\s*c\.\)?)$/ ) {
        return $1;
    }
    return undef;
}
	497
b49c4318	498
2ceca8c3	499	=back
	500
	501	=head1 LICENSE
	502
	503	This package is free software and is provided "as is" without express
	504	or implied warranty. You can redistribute it and/or modify it under
	505	the same terms as Perl itself.
	506
	507	=head1 AUTHOR
	508
	509	Tara L Andrews, aurum@cpan.org
	510
	511	=cut
	512
b49c4318	513	1;