[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;
use Algorithm::Diff;

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, %opts );

Takes an initialized graph and a set of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

sub parse {
    my( $tradition, %opts ) = @_;

    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
    load( $format_mod );
    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
    merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

my $SHORTEND; # Debug var - set this to limit the number of lines parsed

my %base_text_index;
my $edits_required = {};

# edits_required -> wit -> [ { start_idx, end_idx, items } ]

sub merge_base {
    my( $collation, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $collation );

    my %all_witnesses;
    my @unwitnessed_lemma_nodes;
    foreach my $app ( @app_entries ) {
	my( $line, $num ) = split( /\./, $app->{_id} );
	# DEBUG with a short graph
	last if $SHORTEND && $line > $SHORTEND;
	# DEBUG for problematic entries
	my $scrutinize = '';
	my $first_line_reading = $base_line_starts[ $line ];
	my $too_far = $base_line_starts[ $line+1 ];
	
	my $lemma = $app->{rdg_0};
	my $seq = 1; 
	# Is this the Nth occurrence of this reading in the line?
	if( $lemma =~ s/(_)?(\d)$// ) {
	    $seq = $2;
	}
	my @lemma_words = split( /\s+/, $lemma );
	
	# Now search for the lemma words within this line.
	my $lemma_start = $first_line_reading;
	my $lemma_end;
	my %seen;
	while( $lemma_start ne $too_far ) {
	    # Loop detection
	    if( $seen{ $lemma_start->name() } ) {
		warn "Detected loop at " . $lemma_start->name() . 
		    ", ref $line,$num";
		last;
	    }
	    $seen{ $lemma_start->name() } = 1;
	    
	    # Try to match the lemma.
	    my $unmatch = 0;
	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
		$lemma_words[0] . "...\n"
		if "$line.$num" eq $scrutinize;
	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
		# Skip it if we need a match that is not the first.
		if( --$seq < 1 ) {
		    # Now we have to compare the rest of the words here.
		    if( scalar( @lemma_words ) > 1 ) {
			my $next_reading = 
			    $collation->next_reading( $lemma_start );
			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
			    printf STDERR "Now matching %s against %s\n", 
				    cmp_str($next_reading), $w
				if "$line.$num" eq $scrutinize;
			    if( $w ne cmp_str($next_reading) ) {
				$unmatch = 1;
				last;
			    } else {
				$lemma_end = $next_reading;
				$next_reading = 
				    $collation->next_reading( $lemma_end );
			    }
			}
		    } else {
			$lemma_end = $lemma_start;
		    }
		} else {
		    $unmatch = 1;
		}
	    }
	    last unless ( $unmatch || !defined( $lemma_end ) );
	    $lemma_end = undef;
	    $lemma_start = $collation->next_reading( $lemma_start );
	}
	
	unless( $lemma_end ) {
	    warn "No match found for @lemma_words at $line.$num";
	    next;
	}
	
	# Now we have found the lemma; we will record an 'edit', in
	# terms of a splice operation, for each subsequent reading.
	# We also note which witnesses take the given edit.

	my @lemma_set = $collation->reading_sequence( $lemma_start, 
						      $lemma_end );
	my @reading_sets = [ @lemma_set ];

	# For each reading that is not rdg_0, we create the variant
	# reading nodes, and store the range as an edit operation on
	# the base text.
	my $variant_objects;
	my %pc_seen; # Keep track of mss with explicit post-corr data
	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
	    push( @unwitnessed_lemma_nodes, @lemma_set )
		if !@mss && $k eq 'rdg_0';

	    # Keep track of what witnesses we have seen.
	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	    # Keep track of which witnesses bear corrected readings here.
	    foreach my $m ( @mss ) {
		my $base = _is_post_corr( $m );
		next unless $base;
		$pc_seen{$base} = 1;
	    }
	    next if $k eq 'rdg_0';

	    # TODO don't hardcode the reading split operation
	    my @variant = split( /\s+/, $app->{$k} );
	    @variant = () if $app->{$k} eq '/'; # This is an omission.
	    
	    # Make the variant into a set of readings.
	    my @variant_readings;
	    my $ctr = 0;
	    foreach my $vw ( @variant ) {
		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
		my $vwreading = $collation->add_reading( $vwname );
		$vwreading->text( $vw );
		push( @variant_readings, $vwreading );
	    }

	    $variant_objects->{$k} = { 'mss' => \@mss,
				       'reading' => \@variant_readings,
	    };
	    push( @reading_sets, \@variant_readings );
	}

	# Now collate and collapse the identical readings within the
	# collated sets.  Modifies the reading sets that were passed.
	collate_variants( $collation, @reading_sets );

	# TODO Here would be a very good place to set up relationships
	# between the nodes and the lemma.
	set_relationships( $app, \@lemma_set, $variant_objects );

	# Now create the splice-edit objects that will be used
	# to reconstruct each witness.

	foreach my $rkey ( keys %$variant_objects ) {
	    # Object is argument list for splice, so:
	    # offset, length, replacements
	    my $edit_object = [ $base_text_index{$lemma_start->name},
				scalar( @lemma_set ),
				$variant_objects->{$rkey}->{reading} ];
	    foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
		# Is this a p.c. entry?
		my $base = _is_post_corr( $ms );
		if( $base ) { # this is a post-corr witness
		    my $pc_key = $base . "_post";
		    _add_hash_entry( $edits_required, $pc_key, $edit_object );
		} else { # this is an ante-corr witness
		    my $pc_key = $ms . "_post";
		    _add_hash_entry( $edits_required, $ms, $edit_object );
		    unless( $pc_seen{$ms} ) {
			# If this witness carries no correction, add this 
			# same object to its post-corrected state.
			_add_hash_entry( $edits_required, $pc_key, 
					 $edit_object );
		    }
		}
	    }
	}
    } # Finished going through the apparatus entries

    # Now make the witness objects, and create their text sequences
    foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
	print STDERR "Creating witness $w\n";
	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
	    if exists( $edits_required->{$w."_post"} );

	my @repeated = _check_for_repeated( @ante_corr_seq );
	warn "Repeated elements @repeated in $w a.c."
	    if @repeated;
	@repeated = _check_for_repeated( @post_corr_seq );
	warn "Repeated elements @repeated in $w p.c."
	    if @repeated;

	# Now save these paths in my witness object
	if( @post_corr_seq ) {
	    $witness_obj->path( \@post_corr_seq );
	    $witness_obj->uncorrected_path( \@ante_corr_seq );
	} else {
	    $witness_obj->path( \@ante_corr_seq );
	}
    }

    # Now remove our 'base text' edges, which is to say, the only
    # ones we have created so far.  Also remove any nodes that didn't
    # appear in any witnesses.
    foreach ( $collation->paths() ) {
	$collation->del_path( $_ );
    }
    foreach( @unwitnessed_lemma_nodes ) {
	$collation->del_reading( $_ );
    }

    # Now walk paths and calculate positions.
    my @common_readings = 
	$collation->make_witness_paths();
    $collation->calculate_positions( @common_readings );
}

sub _check_for_repeated {
    my @seq = @_;
    my %unique;
    my @repeated;
    foreach ( @seq ) {
	if( exists $unique{$_->name} ) {
	    push( @repeated, $_->name );
	} else {
	    $unique{$_->name} = 1;
	}
    }
    return @repeated;
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start();
    $base_text_index{$last_reading->name} = 0;
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    my $i = 1;
    while(<BASE>) {
	# Make the readings, and connect them up for the base, but
	# also save the first reading of each line in an array for the
	# purpose.
	# TODO use configurable reading separator
	chomp;
	my @words = split;
	my $started = 0;
	my $wordref = 0;
	my $lineref = scalar @$lineref_array;
	last if $SHORTEND && $lineref > $SHORTEND;
	foreach my $w ( @words ) {
	    my $readingref = join( ',', $lineref, ++$wordref );
	    my $reading = $collation->add_reading( $readingref );
	    $reading->text( $w );
	    unless( $started ) {
		push( @$lineref_array, $reading );
		$started = 1;
	    }
	    # Add edge paths in the graph, for easier tracking when
	    # we start applying corrections.  These paths will be
	    # removed when we're done.
	    my $path = $collation->add_path( $last_reading, $reading, 
					     $collation->baselabel );
	    $last_reading = $reading;

	    # Note an array index for the reading, for later correction splices.
	    $base_text_index{$readingref} = $i++;
	}
    }
    close BASE;
    # Ending point for all texts
    my $endpoint = $collation->add_reading( '#END#' );
    $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
    push( @$lineref_array, $endpoint );
    $base_text_index{$endpoint->name} = $i;

    return( @$lineref_array );
}

=item B<collate_variants>

collate_variants( $collation, @reading_ranges )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those readings that are identical.  The
collation is a Text::Tradition::Collation object; the elements of
@readings are Text::Tradition::Collation::Reading objects that appear
on the collation graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

sub collate_variants {
    my( $collation, @reading_sets ) = @_;

    # Merge the nodes across the sets so that there is only one node
    # for any given reading.  Use diff to identify the 'same' nodes.

    my $lemma_set = shift @reading_sets;

    my @unique;
    push( @unique, @$lemma_set );

    while( @reading_sets ) {
	my $variant_set = shift @reading_sets;
	if( $collation->linear ) {
	    # Use diff to do this job
	    my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
					     {'keyGen' => \&_collation_hash} );
	    my @new_unique;
	    my %merged;
	    while( $diff->Next ) {
		if( $diff->Same ) {
		    # merge the nodes
		    my @l = $diff->Items( 1 );
		    my @v = $diff->Items( 2 );
		    foreach my $i ( 0 .. $#l ) {
			if( !$merged{$l[$i]->name} ) {
			    $collation->merge_readings( $l[$i], $v[$i] );
			    $merged{$l[$i]->name} = 1;
			} else {
			    print STDERR "Would have double merged " . $l[$i]->name . "\n";
			}
		    }
		    # splice the lemma nodes into the variant set
		    my( $offset ) = $diff->Get( 'min2' );
		    splice( @$variant_set, $offset, scalar( @l ), @l );
		    push( @new_unique, @l );
		} else {
		    # Keep the old unique readings
		    push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
		    # Add the new readings to the 'unique' list
		    push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
		}
	    }
	    @unique = @new_unique;
	} else {
	    # It becomes a much simpler job
	    $DB::single = 1;
	    my @distinct;
	    my %merged;
	    foreach my $idx ( 0 .. $#{$variant_set} ) {
		my $vw = $variant_set->[$idx];
		my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
		my $matched;
		if( @same ) {
		    foreach my $i ( 0 .. $#same ) {
			unless( $merged{$same[$i]->name} ) {
			    print STDERR sprintf( "Merging %s into %s\n", 
						  $vw->name,
						  $same[$i]->name );
			    $collation->merge_readings( $same[$i], $vw );
			    $merged{$same[$i]->name} = 1;
			    $matched = $i;
			    $variant_set->[$idx] = $same[$i];
			}
		    }
		}
		unless( @same && defined($matched) ) {
		    push( @distinct, $vw );
		}
	    }
	    push( @unique, @distinct );
	}
    }

    return;
}

    
sub _collation_hash {
    my $node = shift;
    return cmp_str( $node );
}

sub set_relationships {
    my( $app, $lemma, $variants ) = @_;
    foreach my $rkey ( keys %$variants ) {
	my $var = $variants->{$rkey}->{'reading'};
	my $typekey = sprintf( "_%s_type", $rkey );
	my $type = $app->{$typekey};
	
	# Transposition: look for nodes with the same label but different IDs
	# and mark them as transposed-identical.

	# Lexical / Grammatical / Spelling: look for non-identical nodes.
	# Need to work out how to handle many-to-many mapping.
    }
}
	

sub apply_edits {
    my( $collation, $edit_sequence ) = @_;
    my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
        keys %base_text_index;
    my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;

    my $drift = 0;
    foreach my $correction ( @$edit_sequence ) {
	my( $offset, $length, $items ) = @$correction;
	my $realoffset = $offset + $drift;
	splice( @lemma_text, $realoffset, $length, @$items );
	$drift += @$items - $length;
    }
    return @lemma_text;
}
	

# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness.  If not, return a false value.
sub _is_post_corr {
    my( $sigil ) = @_;
    if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
	return $1;
    }
    return undef;
}

sub _add_hash_entry {
    my( $hash, $key, $entry ) = @_;
    if( exists $hash->{$key} ) {
	push( @{$hash->{$key}}, $entry );
    } else {
	$hash->{$key} = [ $entry ];
    }
}


=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

sub cmp_str {
    my( $reading ) = @_;
    my $word = $reading->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
4ca00eca	6	use Algorithm::Diff;
b49c4318	7
2ceca8c3	8	=head1 NAME
	9
	10	Text::Tradition::Parser::BaseText
	11
	12	=head1 SYNOPSIS
	13
	14	use Text::Tradition::Parser::BaseText qw( merge_base );
	15	merge_base( $graph, 'reference.txt', @apparatus_entries )
	16
	17	=head1 DESCRIPTION
	18
	19	For an overview of the package, see the documentation for the
	20	Text::Tradition::Graph module.
	21
	22	This module is meant for use with certain of the other Parser classes
	23	- whenever a list of variants is given with reference to a base text,
	24	these must be joined into a single collation. The parser should
	25	therefore make a list of variants and their locations, and BaseText
	26	will join those listed variants onto the reference text.
	27
	28	=head1 SUBROUTINES
	29
	30	=over
	31
52ce987f	32	=item B<parse>
	33
	34	parse( $graph, %opts );
	35
	36	Takes an initialized graph and a set of options, which must include:
	37	- 'base' - the base text referenced by the variants
	38	- 'format' - the format of the variant list
	39	- 'data' - the variants, in the given format.
	40
	41	=cut
	42
	43	sub parse {
e2902068	44	my( $tradition, %opts ) = @_;
52ce987f	45
	46	my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
	47	load( $format_mod );
	48	my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
e2902068	49	merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
52ce987f	50	}
52ce987f	51
2ceca8c3	52	=item B<merge_base>
	53
	54	merge_base( $graph, 'reference.txt', @apparatus_entries )
	55
	56	Takes three arguments: a newly-initialized Text::Tradition::Graph
	57	object, a text file containing the reference text, and a list of
	58	variants (apparatus entries). Adds the base text to the graph, and
	59	joins the variants to that.
	60
	61	The list of variants is an array of hash references; each hash takes
	62	the form
	63	{ '_id' => line reference,
	64	'rdg_0' => lemma reading,
	65	'rdg_1' => first variant,
	66	... # and so on until all distinct readings are listed
	67	'WitnessA' => 'rdg_0',
	68	'WitnessB' => 'rdg_1',
	69	... # and so on until all witnesses are listed with their readings
	70	}
	71
	72	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	73	begin with an underscore is assumed to be a witness name. Any 'meta'
	74	information to be passed must be passed in a key with a leading
	75	underscore in its name.
	76
	77	=cut
	78
15d2d3df	79	my $SHORTEND; # Debug var - set this to limit the number of lines parsed
4ca00eca	80
4ca00eca	81	my %base_text_index;
6a222840	82	my $edits_required = {};
4ca00eca	83
4ca00eca	84	# edits_required -> wit -> [ { start_idx, end_idx, items } ]
930ff666	85
b49c4318	86	sub merge_base {
e2902068	87	my( $collation, $base_file, @app_entries ) = @_;
e2902068	88	my @base_line_starts = read_base( $base_file, $collation );
b49c4318	89
52ce987f	90	my %all_witnesses;
6a222840	91	my @unwitnessed_lemma_nodes;
b49c4318	92	foreach my $app ( @app_entries ) {
	93	my( $line, $num ) = split( /\./, $app->{_id} );
	94	# DEBUG with a short graph
15d2d3df	95	last if $SHORTEND && $line > $SHORTEND;
2ceca8c3	96	# DEBUG for problematic entries
4ca00eca	97	my $scrutinize = '';
e2902068	98	my $first_line_reading = $base_line_starts[ $line ];
b49c4318	99	my $too_far = $base_line_starts[ $line+1 ];
	100
	101	my $lemma = $app->{rdg_0};
	102	my $seq = 1;
	103	# Is this the Nth occurrence of this reading in the line?
	104	if( $lemma =~ s/(_)?(\d)$// ) {
	105	$seq = $2;
	106	}
	107	my @lemma_words = split( /\s+/, $lemma );
	108
	109	# Now search for the lemma words within this line.
e2902068	110	my $lemma_start = $first_line_reading;
b49c4318	111	my $lemma_end;
	112	my %seen;
	113	while( $lemma_start ne $too_far ) {
	114	# Loop detection
	115	if( $seen{ $lemma_start->name() } ) {
	116	warn "Detected loop at " . $lemma_start->name() .
	117	", ref $line,$num";
	118	last;
	119	}
	120	$seen{ $lemma_start->name() } = 1;
	121
	122	# Try to match the lemma.
	123	my $unmatch = 0;
	124	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	125	$lemma_words[0] . "...\n"
	126	if "$line.$num" eq $scrutinize;
	127	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	128	# Skip it if we need a match that is not the first.
	129	if( --$seq < 1 ) {
	130	# Now we have to compare the rest of the words here.
	131	if( scalar( @lemma_words ) > 1 ) {
e2902068	132	my $next_reading =
e2902068	133	$collation->next_reading( $lemma_start );
b49c4318	134	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
b49c4318	135	printf STDERR "Now matching %s against %s\n",
e2902068	136	cmp_str($next_reading), $w
b49c4318	137	if "$line.$num" eq $scrutinize;
e2902068	138	if( $w ne cmp_str($next_reading) ) {
b49c4318	139	$unmatch = 1;
	140	last;
	141	} else {
e2902068	142	$lemma_end = $next_reading;
	143	$next_reading =
	144	$collation->next_reading( $lemma_end );
b49c4318	145	}
	146	}
	147	} else {
	148	$lemma_end = $lemma_start;
	149	}
	150	} else {
	151	$unmatch = 1;
	152	}
	153	}
	154	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	155	$lemma_end = undef;
e2902068	156	$lemma_start = $collation->next_reading( $lemma_start );
b49c4318	157	}
	158
	159	unless( $lemma_end ) {
	160	warn "No match found for @lemma_words at $line.$num";
	161	next;
b49c4318	162	}
b49c4318	163
4ca00eca	164	# Now we have found the lemma; we will record an 'edit', in
	165	# terms of a splice operation, for each subsequent reading.
	166	# We also note which witnesses take the given edit.
	167
6a222840	168	my @lemma_set = $collation->reading_sequence( $lemma_start,
6a222840	169	$lemma_end );
4ca00eca	170	my @reading_sets = [ @lemma_set ];
	171
	172	# For each reading that is not rdg_0, we create the variant
	173	# reading nodes, and store the range as an edit operation on
	174	# the base text.
	175	my $variant_objects;
6a222840	176	my %pc_seen; # Keep track of mss with explicit post-corr data
b49c4318	177	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
b49c4318	178	my @mss = grep { $app->{$_} eq $k } keys( %$app );
6a222840	179	push( @unwitnessed_lemma_nodes, @lemma_set )
	180	if !@mss && $k eq 'rdg_0';
	181
e2902068	182	# Keep track of what witnesses we have seen.
52ce987f	183	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
4ca00eca	184	# Keep track of which witnesses bear corrected readings here.
	185	foreach my $m ( @mss ) {
	186	my $base = _is_post_corr( $m );
	187	next unless $base;
6a222840	188	$pc_seen{$base} = 1;
4ca00eca	189	}
	190	next if $k eq 'rdg_0';
	191
	192	# TODO don't hardcode the reading split operation
	193	my @variant = split( /\s+/, $app->{$k} );
	194	@variant = () if $app->{$k} eq '/'; # This is an omission.
b49c4318	195
e2902068	196	# Make the variant into a set of readings.
4ca00eca	197	my @variant_readings;
b49c4318	198	my $ctr = 0;
b49c4318	199	foreach my $vw ( @variant ) {
b49c4318	200	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
e2902068	201	my $vwreading = $collation->add_reading( $vwname );
e2902068	202	$vwreading->text( $vw );
4ca00eca	203	push( @variant_readings, $vwreading );
b49c4318	204	}
e49731d7	205
4ca00eca	206	$variant_objects->{$k} = { 'mss' => \@mss,
	207	'reading' => \@variant_readings,
	208	};
	209	push( @reading_sets, \@variant_readings );
	210	}
b49c4318	211
4ca00eca	212	# Now collate and collapse the identical readings within the
	213	# collated sets. Modifies the reading sets that were passed.
	214	collate_variants( $collation, @reading_sets );
	215
15d2d3df	216	# TODO Here would be a very good place to set up relationships
	217	# between the nodes and the lemma.
	218	set_relationships( $app, \@lemma_set, $variant_objects );
	219
4ca00eca	220	# Now create the splice-edit objects that will be used
	221	# to reconstruct each witness.
	222
	223	foreach my $rkey ( keys %$variant_objects ) {
	224	# Object is argument list for splice, so:
	225	# offset, length, replacements
	226	my $edit_object = [ $base_text_index{$lemma_start->name},
	227	scalar( @lemma_set ),
	228	$variant_objects->{$rkey}->{reading} ];
	229	foreach my $ms ( @{$variant_objects->{$rkey}->{mss}} ) {
	230	# Is this a p.c. entry?
	231	my $base = _is_post_corr( $ms );
	232	if( $base ) { # this is a post-corr witness
	233	my $pc_key = $base . "_post";
	234	_add_hash_entry( $edits_required, $pc_key, $edit_object );
	235	} else { # this is an ante-corr witness
	236	my $pc_key = $ms . "_post";
6a222840	237	_add_hash_entry( $edits_required, $ms, $edit_object );
	238	unless( $pc_seen{$ms} ) {
	239	# If this witness carries no correction, add this
	240	# same object to its post-corrected state.
	241	_add_hash_entry( $edits_required, $pc_key,
	242	$edit_object );
4ca00eca	243	}
4ca00eca	244	}
e2902068	245	}
4ca00eca	246	}
	247	} # Finished going through the apparatus entries
	248
	249	# Now make the witness objects, and create their text sequences
6a222840	250	foreach my $w ( grep { $_ !~ /_post$/ } keys %$edits_required ) {
15d2d3df	251	print STDERR "Creating witness $w\n";
4ca00eca	252	my $witness_obj = $collation->tradition->add_witness( sigil => $w );
6a222840	253	my @ante_corr_seq = apply_edits( $collation, $edits_required->{$w} );
6a222840	254	my @post_corr_seq = apply_edits( $collation, $edits_required->{$w."_post"} )
4ca00eca	255	if exists( $edits_required->{$w."_post"} );
4ca00eca	256
15d2d3df	257	my @repeated = _check_for_repeated( @ante_corr_seq );
	258	warn "Repeated elements @repeated in $w a.c."
	259	if @repeated;
	260	@repeated = _check_for_repeated( @post_corr_seq );
	261	warn "Repeated elements @repeated in $w p.c."
	262	if @repeated;
	263
6a222840	264	# Now save these paths in my witness object
4ca00eca	265	if( @post_corr_seq ) {
6a222840	266	$witness_obj->path( \@post_corr_seq );
15d2d3df	267	$witness_obj->uncorrected_path( \@ante_corr_seq );
e2902068	268	} else {
6a222840	269	$witness_obj->path( \@ante_corr_seq );
52ce987f	270	}
b49c4318	271	}
e2902068	272
6a222840	273	# Now remove our 'base text' edges, which is to say, the only
	274	# ones we have created so far. Also remove any nodes that didn't
	275	# appear in any witnesses.
	276	foreach ( $collation->paths() ) {
	277	$collation->del_path( $_ );
	278	}
	279	foreach( @unwitnessed_lemma_nodes ) {
	280	$collation->del_reading( $_ );
	281	}
4ca00eca	282
e2902068	283	# Now walk paths and calculate positions.
e2902068	284	my @common_readings =
6a222840	285	$collation->make_witness_paths();
e2902068	286	$collation->calculate_positions( @common_readings );
b49c4318	287	}
b49c4318	288
15d2d3df	289	sub _check_for_repeated {
	290	my @seq = @_;
	291	my %unique;
	292	my @repeated;
	293	foreach ( @seq ) {
	294	if( exists $unique{$_->name} ) {
	295	push( @repeated, $_->name );
	296	} else {
	297	$unique{$_->name} = 1;
	298	}
	299	}
	300	return @repeated;
	301	}
	302
2ceca8c3	303	=item B<read_base>
2ceca8c3	304
e2902068	305	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	306
e2902068	307	Takes a text file and a (presumed empty) collation object, adds the
	308	words as simple linear readings to the collation, and returns a
	309	list of readings that represent the beginning of lines. This collation
	310	is now the starting point for application of apparatus entries in
	311	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	312
2ceca8c3	313	=cut
b49c4318	314
b49c4318	315	sub read_base {
e2902068	316	my( $base_file, $collation ) = @_;
b49c4318	317
e2902068	318	# This array gives the first reading for each line. We put the
b49c4318	319	# common starting point in line zero.
e2902068	320	my $last_reading = $collation->start();
6a222840	321	$base_text_index{$last_reading->name} = 0;
e2902068	322	my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318	323
b49c4318	324	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
6a222840	325	my $i = 1;
b49c4318	326	while(<BASE>) {
e2902068	327	# Make the readings, and connect them up for the base, but
	328	# also save the first reading of each line in an array for the
	329	# purpose.
	330	# TODO use configurable reading separator
b49c4318	331	chomp;
	332	my @words = split;
	333	my $started = 0;
	334	my $wordref = 0;
	335	my $lineref = scalar @$lineref_array;
15d2d3df	336	last if $SHORTEND && $lineref > $SHORTEND;
b49c4318	337	foreach my $w ( @words ) {
e2902068	338	my $readingref = join( ',', $lineref, ++$wordref );
	339	my $reading = $collation->add_reading( $readingref );
	340	$reading->text( $w );
b49c4318	341	unless( $started ) {
e2902068	342	push( @$lineref_array, $reading );
b49c4318	343	$started = 1;
b49c4318	344	}
4ca00eca	345	# Add edge paths in the graph, for easier tracking when
	346	# we start applying corrections. These paths will be
	347	# removed when we're done.
	348	my $path = $collation->add_path( $last_reading, $reading,
	349	$collation->baselabel );
	350	$last_reading = $reading;
	351
	352	# Note an array index for the reading, for later correction splices.
	353	$base_text_index{$readingref} = $i++;
b49c4318	354	}
	355	}
	356	close BASE;
	357	# Ending point for all texts
e2902068	358	my $endpoint = $collation->add_reading( '#END#' );
930ff666	359	$collation->add_path( $last_reading, $endpoint, $collation->baselabel );
b49c4318	360	push( @$lineref_array, $endpoint );
6a222840	361	$base_text_index{$endpoint->name} = $i;
b49c4318	362
	363	return( @$lineref_array );
	364	}
	365
e49731d7	366	=item B<collate_variants>
2ceca8c3	367
4ca00eca	368	collate_variants( $collation, @reading_ranges )
2ceca8c3	369
e49731d7	370	Given a set of readings in the form
e49731d7	371	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
e2902068	372	walks through each to identify those readings that are identical. The
	373	collation is a Text::Tradition::Collation object; the elements of
	374	@readings are Text::Tradition::Collation::Reading objects that appear
	375	on the collation graph.
b49c4318	376
2ceca8c3	377	TODO: Handle collapsed and non-collapsed transpositions.
	378
	379	=cut
b49c4318	380
e49731d7	381	sub collate_variants {
4ca00eca	382	my( $collation, @reading_sets ) = @_;
4ca00eca	383
	384	# Merge the nodes across the sets so that there is only one node
	385	# for any given reading. Use diff to identify the 'same' nodes.
	386
	387	my $lemma_set = shift @reading_sets;
	388
	389	my @unique;
	390	push( @unique, @$lemma_set );
	391
	392	while( @reading_sets ) {
	393	my $variant_set = shift @reading_sets;
15d2d3df	394	if( $collation->linear ) {
6a222840	395	# Use diff to do this job
	396	my $diff = Algorithm::Diff->new( \@unique, $variant_set,
	397	{'keyGen' => \&_collation_hash} );
	398	my @new_unique;
15d2d3df	399	my %merged;
6a222840	400	while( $diff->Next ) {
	401	if( $diff->Same ) {
	402	# merge the nodes
	403	my @l = $diff->Items( 1 );
	404	my @v = $diff->Items( 2 );
	405	foreach my $i ( 0 .. $#l ) {
15d2d3df	406	if( !$merged{$l[$i]->name} ) {
	407	$collation->merge_readings( $l[$i], $v[$i] );
	408	$merged{$l[$i]->name} = 1;
	409	} else {
	410	print STDERR "Would have double merged " . $l[$i]->name . "\n";
	411	}
6a222840	412	}
	413	# splice the lemma nodes into the variant set
	414	my( $offset ) = $diff->Get( 'min2' );
	415	splice( @$variant_set, $offset, scalar( @l ), @l );
	416	push( @new_unique, @l );
	417	} else {
	418	# Keep the old unique readings
	419	push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
	420	# Add the new readings to the 'unique' list
	421	push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
e49731d7	422	}
b49c4318	423	}
6a222840	424	@unique = @new_unique;
	425	} else {
	426	# It becomes a much simpler job
	427	$DB::single = 1;
	428	my @distinct;
15d2d3df	429	my %merged;
6a222840	430	foreach my $idx ( 0 .. $#{$variant_set} ) {
	431	my $vw = $variant_set->[$idx];
	432	my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
15d2d3df	433	my $matched;
6a222840	434	if( @same ) {
15d2d3df	435	foreach my $i ( 0 .. $#same ) {
	436	unless( $merged{$same[$i]->name} ) {
	437	print STDERR sprintf( "Merging %s into %s\n",
	438	$vw->name,
	439	$same[$i]->name );
	440	$collation->merge_readings( $same[$i], $vw );
	441	$merged{$same[$i]->name} = 1;
	442	$matched = $i;
	443	$variant_set->[$idx] = $same[$i];
	444	}
	445	}
	446	}
	447	unless( @same && defined($matched) ) {
6a222840	448	push( @distinct, $vw );
	449	}
	450	}
	451	push( @unique, @distinct );
b49c4318	452	}
b49c4318	453	}
2ceca8c3	454
4ca00eca	455	return;
4ca00eca	456	}
2ceca8c3	457
4ca00eca	458
	459	sub _collation_hash {
	460	my $node = shift;
6a222840	461	return cmp_str( $node );
4ca00eca	462	}
2ceca8c3	463
15d2d3df	464	sub set_relationships {
	465	my( $app, $lemma, $variants ) = @_;
	466	foreach my $rkey ( keys %$variants ) {
	467	my $var = $variants->{$rkey}->{'reading'};
	468	my $typekey = sprintf( "_%s_type", $rkey );
	469	my $type = $app->{$typekey};
	470
	471	# Transposition: look for nodes with the same label but different IDs
	472	# and mark them as transposed-identical.
	473
	474	# Lexical / Grammatical / Spelling: look for non-identical nodes.
	475	# Need to work out how to handle many-to-many mapping.
	476	}
	477	}
	478
	479
	480
4ca00eca	481	sub apply_edits {
6a222840	482	my( $collation, $edit_sequence ) = @_;
	483	my @lemma_names = sort { $base_text_index{$a} <=> $base_text_index{$b} }
	484	keys %base_text_index;
	485	my @lemma_text = map { $collation->reading( $_ ) } @lemma_names;
2ceca8c3	486
4ca00eca	487	my $drift = 0;
	488	foreach my $correction ( @$edit_sequence ) {
	489	my( $offset, $length, $items ) = @$correction;
	490	my $realoffset = $offset + $drift;
	491	splice( @lemma_text, $realoffset, $length, @$items );
	492	$drift += @$items - $length;
b49c4318	493	}
6a222840	494	return @lemma_text;
b49c4318	495	}
6a222840	496
4ca00eca	497
e2902068	498	# Helper function. Given a witness sigil, if it is a post-correctione
	499	# sigil,return the base witness. If not, return a false value.
	500	sub _is_post_corr {
	501	my( $sigil ) = @_;
930ff666	502	if( $sigil =~ /^(.?)(\s\(?p\.\s*c\.\)?)$/ ) {
e2902068	503	return $1;
	504	}
	505	return undef;
	506	}
	507
6a222840	508	sub _add_hash_entry {
	509	my( $hash, $key, $entry ) = @_;
	510	if( exists $hash->{$key} ) {
	511	push( @{$hash->{$key}}, $entry );
	512	} else {
	513	$hash->{$key} = [ $entry ];
	514	}
	515	}
	516
	517
2ceca8c3	518	=item B<cmp_str>
	519
	520	Pretend you never saw this method. Really it needs to not be hardcoded.
	521
	522	=cut
	523
b49c4318	524	sub cmp_str {
e2902068	525	my( $reading ) = @_;
e2902068	526	my $word = $reading->label();
b49c4318	527	$word = lc( $word );
	528	$word =~ s/\W//g;
	529	$word =~ s/v/u/g;
	530	$word =~ s/j/i/g;
	531	$word =~ s/cha/ca/g;
	532	$word =~ s/quatuor/quattuor/g;
	533	$word =~ s/ioannes/iohannes/g;
	534	return $word;
	535	}
	536
2ceca8c3	537	=back
	538
	539	=head1 LICENSE
	540
	541	This package is free software and is provided "as is" without express
	542	or implied warranty. You can redistribute it and/or modify it under
	543	the same terms as Perl itself.
	544
	545	=head1 AUTHOR
	546
	547	Tara L Andrews, aurum@cpan.org
	548
	549	=cut
	550
b49c4318	551	1;