[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, %opts );

Takes an initialized graph and a set of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

sub parse {
    my( $tradition, %opts ) = @_;

    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
    load( $format_mod );
    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
    merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

    my $SHORT = 25;

sub merge_base {
    my( $collation, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $collation );

    my %all_witnesses;
    foreach my $app ( @app_entries ) {
	my( $line, $num ) = split( /\./, $app->{_id} );
	# DEBUG with a short graph
	last if $SHORT && $line > $SHORT;
	# DEBUG for problematic entries
	my $scrutinize = "";
	my $first_line_reading = $base_line_starts[ $line ];
	my $too_far = $base_line_starts[ $line+1 ];
	
	my $lemma = $app->{rdg_0};
	my $seq = 1; 
	# Is this the Nth occurrence of this reading in the line?
	if( $lemma =~ s/(_)?(\d)$// ) {
	    $seq = $2;
	}
	my @lemma_words = split( /\s+/, $lemma );
	
	# Now search for the lemma words within this line.
	my $lemma_start = $first_line_reading;
	my $lemma_end;
	my %seen;
	while( $lemma_start ne $too_far ) {
	    # Loop detection
	    if( $seen{ $lemma_start->name() } ) {
		warn "Detected loop at " . $lemma_start->name() . 
		    ", ref $line,$num";
		last;
	    }
	    $seen{ $lemma_start->name() } = 1;
	    
	    # Try to match the lemma.
	    my $unmatch = 0;
	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
		$lemma_words[0] . "...\n"
		if "$line.$num" eq $scrutinize;
	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
		# Skip it if we need a match that is not the first.
		if( --$seq < 1 ) {
		    # Now we have to compare the rest of the words here.
		    if( scalar( @lemma_words ) > 1 ) {
			my $next_reading = 
			    $collation->next_reading( $lemma_start );
			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
			    printf STDERR "Now matching %s against %s\n", 
				    cmp_str($next_reading), $w
				if "$line.$num" eq $scrutinize;
			    if( $w ne cmp_str($next_reading) ) {
				$unmatch = 1;
				last;
			    } else {
				$lemma_end = $next_reading;
				$next_reading = 
				    $collation->next_reading( $lemma_end );
			    }
			}
		    } else {
			$lemma_end = $lemma_start;
		    }
		} else {
		    $unmatch = 1;
		}
	    }
	    last unless ( $unmatch || !defined( $lemma_end ) );
	    $lemma_end = undef;
	    $lemma_start = $collation->next_reading( $lemma_start );
	}
	
	unless( $lemma_end ) {
	    warn "No match found for @lemma_words at $line.$num";
	    next;
	} else {
	    # These are no longer common readings; unmark them as such.
	    my @lemma_readings = $collation->reading_sequence( $lemma_start, 
						     $lemma_end );
	    map { $_->make_variant } @lemma_readings;
	}
	
	# Now we have our lemma readings; we add the variant readings
	# to the collation.
	
	# Keep track of the start and end point of each reading for later
	# reading collapse.
	my @readings = ( $lemma_start, $lemma_end );

	# For each reading that is not rdg_0, we make a chain of readings
	# and connect them to the anchor.  Edges are named after the mss
	# that are relevant.
	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	    if( $k eq 'rdg_0' ) { # that's the lemma
		# The lemma is already in the graph, but we need to look for
		# any explicit post-correctione readings and add the
		# relevant path.
		my @mss = grep { $app->{$_} eq $k } keys( %$app );
		# Keep track of what witnesses we have seen.
		@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
		foreach my $m ( @mss ) {
		    my $base = _is_post_corr( $m );
		    next unless $base;
		    my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
		    $collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m );
		    foreach my $i ( 0 .. $#lem-1 ) {
			$collation->add_path( $lem[$i], $lem[++$i], $m );
		    }
		    $collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m );
		}
		next;
	    }
	    my @variant = split( /\s+/, $app->{$k} );
	    @variant = () if $app->{$k} eq '/'; # This is an omission.
	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
	    
	    unless( @mss ) {
		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
		next;
	    }
	    
	    # Keep track of what witnesses we have seen.
	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	    
	    # Make the variant into a set of readings.
	    my $ctr = 0;
	    my $last_reading = $collation->prior_reading( $lemma_start );
	    my $var_start;
	    foreach my $vw ( @variant ) {
		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
		my $vwreading = $collation->add_reading( $vwname );
		$vwreading->text( $vw );
		$vwreading->make_variant();
		foreach ( @mss ) {
		    $collation->add_path( $last_reading, $vwreading, $_ );
		}
		$var_start = $vwreading unless $var_start;
		$last_reading = $vwreading;
	    }
	    # Now hook it up at the end.
	    foreach ( @mss ) {
		$collation->add_path( $last_reading, 
				      $collation->next_reading( $lemma_end ),
				      $_ );
	    }
	    
	    if( $var_start ) { # if it wasn't an empty reading
		push( @readings, $var_start, $last_reading );
	    }
	}

	# Now collate and collapse the identical readings within the collation.
	collate_variants( $collation, @readings );
    }

    # Now make the witness objects
    foreach my $w ( keys %all_witnesses ) {
	my $base = _is_post_corr( $w );
	if( $base ) {
	    my $pctag = substr( $w, length( $base ) );
	    my $existing_wit = $collation->tradition->witness( $base );
	    unless( $existing_wit ) {
		$existing_wit = $collation->tradition->add_witness( sigil => $base );
	    }
	    $existing_wit->post_correctione( $pctag );
	} else {
	    $collation->tradition->add_witness( sigil => $w )
		unless $collation->tradition->witness( $w );
	}
    }

    # Now walk paths and calculate positions.
    my @common_readings = 
        $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
    $collation->calculate_positions( @common_readings );
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start();
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    while(<BASE>) {
	# Make the readings, and connect them up for the base, but
	# also save the first reading of each line in an array for the
	# purpose.
	# TODO use configurable reading separator
	chomp;
	my @words = split;
	my $started = 0;
	my $wordref = 0;
	my $lineref = scalar @$lineref_array;
	last if $SHORT && $lineref > $SHORT;
	foreach my $w ( @words ) {
	    my $readingref = join( ',', $lineref, ++$wordref );
	    my $reading = $collation->add_reading( $readingref );
	    $reading->text( $w );
	    $reading->make_common();
	    unless( $started ) {
		push( @$lineref_array, $reading );
		$started = 1;
	    }
	    if( $last_reading ) {
		my $path = $collation->add_path( $last_reading, $reading, 
						 $collation->baselabel );
		$path->set_attribute( 'class', 'basetext' );
		$last_reading = $reading;
	    } # TODO there should be no else here...
	}
    }
    close BASE;
    # Ending point for all texts
    my $endpoint = $collation->add_reading( '#END#' );
    $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
    push( @$lineref_array, $endpoint );

    return( @$lineref_array );
}

=item B<collate_variants>

collate_variants( $collation, @readings )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those readings that are identical.  The
collation is a Text::Tradition::Collation object; the elements of
@readings are Text::Tradition::Collation::Reading objects that appear
on the collation graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

sub collate_variants {
    my( $collation, @readings ) = @_;
    my $lemma_start = shift @readings;
    my $lemma_end = shift @readings;
    my $detranspose = 1;

    # Start the list of distinct readings with those readings in the lemma.
    my @distinct_readings;
    while( $lemma_start ne $lemma_end ) {
	push( @distinct_readings, [ $lemma_start, $collation->baselabel ] );
	$lemma_start = $collation->next_reading( $lemma_start );
    } 
    push( @distinct_readings, [ $lemma_end, $collation->baselabel ] );
    

    while( scalar @readings ) {
	my( $var_start, $var_end ) = splice( @readings, 0, 2 );

	# I want to look at the readings in the variant and lemma, and
	# collapse readings that are the same word.  This is mini-collation.
	# Each word in the 'main' list can only be collapsed once with a
	# word from the current reading.
	my %collapsed = ();

	# Get the variant witnesses.  They will all be going along the
	# same path, so just use the first one as representative for
	# the purpose of following the path.
	my @var_wits = map { $_->label } $var_start->outgoing();
	my $rep_wit = $var_wits[0];

	my @variant_readings;
	while( $var_start ne $var_end ) {
	    push( @variant_readings, $var_start );
	    $var_start = $collation->next_reading( $var_start, $rep_wit );
	}
	push( @variant_readings, $var_end );

	# Go through the variant readings, and if we find a lemma reading that
	# hasn't yet been collapsed with a reading, equate them.  If we do
	# not, keep them to push onto the end of all_readings.
	# TODO replace this with proper mini-collation
	my @remaining_readings;
	my $last_index = 0;
	my $curr_pos = 0;
	foreach my $w ( @variant_readings ) {
	    my $word = $w->label();
	    my $matched = 0;
	    foreach my $idx ( $last_index .. $#distinct_readings ) {
		my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
		if( $word eq cmp_str( $l ) ) {
		    next if exists( $collapsed{ $l->label } )
			&& $collapsed{ $l->label } eq $l;
		    $matched = 1;
		    $last_index = $idx if $detranspose;
		    # Collapse the readings.
		    printf STDERR "Merging readings %s/%s and %s/%s\n", 
		        $l->name, $l->label, $w->name, $w->label;
		    $collation->merge_readings( $l, $w );
		    $collapsed{ $l->label } = $l;
		    # Now collapse any multiple paths to and from the reading.
 		    remove_duplicate_paths( $collation, 
 				    $collation->prior_reading( $l, $rep_wit ), $l );
 		    remove_duplicate_paths( $collation, $l, 
 				    $collation->next_reading( $l, $rep_wit ) );
		    last;
		}
	    }
	    push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
	}
	push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
    }
}

=item B<remove_duplicate_paths>

remove_duplicate_paths( $collation, $from, $to );

Given two readings, reduce the number of paths between those readings to
a set of unique paths.

=cut

# TODO wonder if this is necessary
sub remove_duplicate_paths {
    my( $collation, $from, $to ) = @_;
    my %seen_paths;
    foreach my $p ( $from->edges_to( $to ) ) {
	if( exists $seen_paths{$p->name} ) {
	    $collation->del_path( $p );
	} else {
	    $seen_paths{$p->name} = 1;
	}
    }
}

# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness.  If not, return a false value.
sub _is_post_corr {
    my( $sigil ) = @_;
    if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
	return $1;
    }
    return undef;
}

=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

sub cmp_str {
    my( $reading ) = @_;
    my $word = $reading->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
b49c4318	6
2ceca8c3	7	=head1 NAME
	8
	9	Text::Tradition::Parser::BaseText
	10
	11	=head1 SYNOPSIS
	12
	13	use Text::Tradition::Parser::BaseText qw( merge_base );
	14	merge_base( $graph, 'reference.txt', @apparatus_entries )
	15
	16	=head1 DESCRIPTION
	17
	18	For an overview of the package, see the documentation for the
	19	Text::Tradition::Graph module.
	20
	21	This module is meant for use with certain of the other Parser classes
	22	- whenever a list of variants is given with reference to a base text,
	23	these must be joined into a single collation. The parser should
	24	therefore make a list of variants and their locations, and BaseText
	25	will join those listed variants onto the reference text.
	26
	27	=head1 SUBROUTINES
	28
	29	=over
	30
52ce987f	31	=item B<parse>
	32
	33	parse( $graph, %opts );
	34
	35	Takes an initialized graph and a set of options, which must include:
	36	- 'base' - the base text referenced by the variants
	37	- 'format' - the format of the variant list
	38	- 'data' - the variants, in the given format.
	39
	40	=cut
	41
	42	sub parse {
e2902068	43	my( $tradition, %opts ) = @_;
52ce987f	44
	45	my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
	46	load( $format_mod );
	47	my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
e2902068	48	merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
52ce987f	49	}
52ce987f	50
2ceca8c3	51	=item B<merge_base>
	52
	53	merge_base( $graph, 'reference.txt', @apparatus_entries )
	54
	55	Takes three arguments: a newly-initialized Text::Tradition::Graph
	56	object, a text file containing the reference text, and a list of
	57	variants (apparatus entries). Adds the base text to the graph, and
	58	joins the variants to that.
	59
	60	The list of variants is an array of hash references; each hash takes
	61	the form
	62	{ '_id' => line reference,
	63	'rdg_0' => lemma reading,
	64	'rdg_1' => first variant,
	65	... # and so on until all distinct readings are listed
	66	'WitnessA' => 'rdg_0',
	67	'WitnessB' => 'rdg_1',
	68	... # and so on until all witnesses are listed with their readings
	69	}
	70
	71	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	72	begin with an underscore is assumed to be a witness name. Any 'meta'
	73	information to be passed must be passed in a key with a leading
	74	underscore in its name.
	75
	76	=cut
	77
1f563ac3	78	my $SHORT = 25;
930ff666	79
b49c4318	80	sub merge_base {
e2902068	81	my( $collation, $base_file, @app_entries ) = @_;
e2902068	82	my @base_line_starts = read_base( $base_file, $collation );
b49c4318	83
52ce987f	84	my %all_witnesses;
b49c4318	85	foreach my $app ( @app_entries ) {
	86	my( $line, $num ) = split( /\./, $app->{_id} );
	87	# DEBUG with a short graph
930ff666	88	last if $SHORT && $line > $SHORT;
2ceca8c3	89	# DEBUG for problematic entries
930ff666	90	my $scrutinize = "";
e2902068	91	my $first_line_reading = $base_line_starts[ $line ];
b49c4318	92	my $too_far = $base_line_starts[ $line+1 ];
	93
	94	my $lemma = $app->{rdg_0};
	95	my $seq = 1;
	96	# Is this the Nth occurrence of this reading in the line?
	97	if( $lemma =~ s/(_)?(\d)$// ) {
	98	$seq = $2;
	99	}
	100	my @lemma_words = split( /\s+/, $lemma );
	101
	102	# Now search for the lemma words within this line.
e2902068	103	my $lemma_start = $first_line_reading;
b49c4318	104	my $lemma_end;
	105	my %seen;
	106	while( $lemma_start ne $too_far ) {
	107	# Loop detection
	108	if( $seen{ $lemma_start->name() } ) {
	109	warn "Detected loop at " . $lemma_start->name() .
	110	", ref $line,$num";
	111	last;
	112	}
	113	$seen{ $lemma_start->name() } = 1;
	114
	115	# Try to match the lemma.
	116	my $unmatch = 0;
	117	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	118	$lemma_words[0] . "...\n"
	119	if "$line.$num" eq $scrutinize;
	120	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	121	# Skip it if we need a match that is not the first.
	122	if( --$seq < 1 ) {
	123	# Now we have to compare the rest of the words here.
	124	if( scalar( @lemma_words ) > 1 ) {
e2902068	125	my $next_reading =
e2902068	126	$collation->next_reading( $lemma_start );
b49c4318	127	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
b49c4318	128	printf STDERR "Now matching %s against %s\n",
e2902068	129	cmp_str($next_reading), $w
b49c4318	130	if "$line.$num" eq $scrutinize;
e2902068	131	if( $w ne cmp_str($next_reading) ) {
b49c4318	132	$unmatch = 1;
	133	last;
	134	} else {
e2902068	135	$lemma_end = $next_reading;
	136	$next_reading =
	137	$collation->next_reading( $lemma_end );
b49c4318	138	}
	139	}
	140	} else {
	141	$lemma_end = $lemma_start;
	142	}
	143	} else {
	144	$unmatch = 1;
	145	}
	146	}
	147	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	148	$lemma_end = undef;
e2902068	149	$lemma_start = $collation->next_reading( $lemma_start );
b49c4318	150	}
	151
	152	unless( $lemma_end ) {
	153	warn "No match found for @lemma_words at $line.$num";
	154	next;
	155	} else {
e2902068	156	# These are no longer common readings; unmark them as such.
e2902068	157	my @lemma_readings = $collation->reading_sequence( $lemma_start,
b49c4318	158	$lemma_end );
930ff666	159	map { $_->make_variant } @lemma_readings;
b49c4318	160	}
b49c4318	161
e2902068	162	# Now we have our lemma readings; we add the variant readings
e2902068	163	# to the collation.
b49c4318	164
e49731d7	165	# Keep track of the start and end point of each reading for later
e2902068	166	# reading collapse.
e49731d7	167	my @readings = ( $lemma_start, $lemma_end );
e49731d7	168
e2902068	169	# For each reading that is not rdg_0, we make a chain of readings
b49c4318	170	# and connect them to the anchor. Edges are named after the mss
	171	# that are relevant.
	172	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
7854e12e	173	if( $k eq 'rdg_0' ) { # that's the lemma
	174	# The lemma is already in the graph, but we need to look for
	175	# any explicit post-correctione readings and add the
	176	# relevant path.
	177	my @mss = grep { $app->{$_} eq $k } keys( %$app );
930ff666	178	# Keep track of what witnesses we have seen.
930ff666	179	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
7854e12e	180	foreach my $m ( @mss ) {
	181	my $base = _is_post_corr( $m );
	182	next unless $base;
	183	my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
930ff666	184	$collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m );
7854e12e	185	foreach my $i ( 0 .. $#lem-1 ) {
930ff666	186	$collation->add_path( $lem[$i], $lem[++$i], $m );
7854e12e	187	}
930ff666	188	$collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m );
7854e12e	189	}
930ff666	190	next;
7854e12e	191	}
b49c4318	192	my @variant = split( /\s+/, $app->{$k} );
	193	@variant = () if $app->{$k} eq '/'; # This is an omission.
	194	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	195
	196	unless( @mss ) {
	197	print STDERR "Skipping '@variant' at $line.$num: no mss\n";
	198	next;
	199	}
	200
e2902068	201	# Keep track of what witnesses we have seen.
52ce987f	202	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318	203
e2902068	204	# Make the variant into a set of readings.
b49c4318	205	my $ctr = 0;
e2902068	206	my $last_reading = $collation->prior_reading( $lemma_start );
b49c4318	207	my $var_start;
	208	foreach my $vw ( @variant ) {
	209	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
e2902068	210	my $vwreading = $collation->add_reading( $vwname );
	211	$vwreading->text( $vw );
	212	$vwreading->make_variant();
	213	foreach ( @mss ) {
	214	$collation->add_path( $last_reading, $vwreading, $_ );
	215	}
	216	$var_start = $vwreading unless $var_start;
	217	$last_reading = $vwreading;
b49c4318	218	}
b49c4318	219	# Now hook it up at the end.
e2902068	220	foreach ( @mss ) {
e2902068	221	$collation->add_path( $last_reading,
7854e12e	222	$collation->next_reading( $lemma_end ),
e2902068	223	$_ );
e2902068	224	}
b49c4318	225
e49731d7	226	if( $var_start ) { # if it wasn't an empty reading
e2902068	227	push( @readings, $var_start, $last_reading );
e49731d7	228	}
b49c4318	229	}
e49731d7	230
e2902068	231	# Now collate and collapse the identical readings within the collation.
e2902068	232	collate_variants( $collation, @readings );
b49c4318	233	}
b49c4318	234
e2902068	235	# Now make the witness objects
52ce987f	236	foreach my $w ( keys %all_witnesses ) {
e2902068	237	my $base = _is_post_corr( $w );
	238	if( $base ) {
	239	my $pctag = substr( $w, length( $base ) );
	240	my $existing_wit = $collation->tradition->witness( $base );
	241	unless( $existing_wit ) {
7854e12e	242	$existing_wit = $collation->tradition->add_witness( sigil => $base );
e2902068	243	}
	244	$existing_wit->post_correctione( $pctag );
	245	} else {
7854e12e	246	$collation->tradition->add_witness( sigil => $w )
e2902068	247	unless $collation->tradition->witness( $w );
52ce987f	248	}
b49c4318	249	}
e2902068	250
	251	# Now walk paths and calculate positions.
	252	my @common_readings =
1f563ac3	253	$collation->walk_and_expand_base( $collation->reading( '#END#' ) );
e2902068	254	$collation->calculate_positions( @common_readings );
b49c4318	255	}
b49c4318	256
2ceca8c3	257	=item B<read_base>
2ceca8c3	258
e2902068	259	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	260
e2902068	261	Takes a text file and a (presumed empty) collation object, adds the
	262	words as simple linear readings to the collation, and returns a
	263	list of readings that represent the beginning of lines. This collation
	264	is now the starting point for application of apparatus entries in
	265	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	266
2ceca8c3	267	=cut
b49c4318	268
b49c4318	269	sub read_base {
e2902068	270	my( $base_file, $collation ) = @_;
b49c4318	271
e2902068	272	# This array gives the first reading for each line. We put the
b49c4318	273	# common starting point in line zero.
e2902068	274	my $last_reading = $collation->start();
e2902068	275	my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318	276
	277	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
	278	while(<BASE>) {
e2902068	279	# Make the readings, and connect them up for the base, but
	280	# also save the first reading of each line in an array for the
	281	# purpose.
	282	# TODO use configurable reading separator
b49c4318	283	chomp;
	284	my @words = split;
	285	my $started = 0;
	286	my $wordref = 0;
	287	my $lineref = scalar @$lineref_array;
930ff666	288	last if $SHORT && $lineref > $SHORT;
b49c4318	289	foreach my $w ( @words ) {
e2902068	290	my $readingref = join( ',', $lineref, ++$wordref );
	291	my $reading = $collation->add_reading( $readingref );
	292	$reading->text( $w );
	293	$reading->make_common();
b49c4318	294	unless( $started ) {
e2902068	295	push( @$lineref_array, $reading );
b49c4318	296	$started = 1;
b49c4318	297	}
e2902068	298	if( $last_reading ) {
e2902068	299	my $path = $collation->add_path( $last_reading, $reading,
930ff666	300	$collation->baselabel );
e2902068	301	$path->set_attribute( 'class', 'basetext' );
e2902068	302	$last_reading = $reading;
b49c4318	303	} # TODO there should be no else here...
	304	}
	305	}
	306	close BASE;
	307	# Ending point for all texts
e2902068	308	my $endpoint = $collation->add_reading( '#END#' );
930ff666	309	$collation->add_path( $last_reading, $endpoint, $collation->baselabel );
b49c4318	310	push( @$lineref_array, $endpoint );
	311
	312	return( @$lineref_array );
	313	}
	314
e49731d7	315	=item B<collate_variants>
2ceca8c3	316
e2902068	317	collate_variants( $collation, @readings )
2ceca8c3	318
e49731d7	319	Given a set of readings in the form
e49731d7	320	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
e2902068	321	walks through each to identify those readings that are identical. The
	322	collation is a Text::Tradition::Collation object; the elements of
	323	@readings are Text::Tradition::Collation::Reading objects that appear
	324	on the collation graph.
b49c4318	325
2ceca8c3	326	TODO: Handle collapsed and non-collapsed transpositions.
	327
	328	=cut
b49c4318	329
e49731d7	330	sub collate_variants {
e2902068	331	my( $collation, @readings ) = @_;
e49731d7	332	my $lemma_start = shift @readings;
e49731d7	333	my $lemma_end = shift @readings;
7854e12e	334	my $detranspose = 1;
1b9423d5	335
e2902068	336	# Start the list of distinct readings with those readings in the lemma.
e2902068	337	my @distinct_readings;
b49c4318	338	while( $lemma_start ne $lemma_end ) {
930ff666	339	push( @distinct_readings, [ $lemma_start, $collation->baselabel ] );
7854e12e	340	$lemma_start = $collation->next_reading( $lemma_start );
b49c4318	341	}
930ff666	342	push( @distinct_readings, [ $lemma_end, $collation->baselabel ] );
b49c4318	343
e49731d7	344
	345	while( scalar @readings ) {
	346	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
	347
e2902068	348	# I want to look at the readings in the variant and lemma, and
e2902068	349	# collapse readings that are the same word. This is mini-collation.
e49731d7	350	# Each word in the 'main' list can only be collapsed once with a
	351	# word from the current reading.
	352	my %collapsed = ();
	353
7854e12e	354	# Get the variant witnesses. They will all be going along the
	355	# same path, so just use the first one as representative for
	356	# the purpose of following the path.
	357	my @var_wits = map { $_->label } $var_start->outgoing();
	358	my $rep_wit = $var_wits[0];
e49731d7	359
e2902068	360	my @variant_readings;
e49731d7	361	while( $var_start ne $var_end ) {
e2902068	362	push( @variant_readings, $var_start );
7854e12e	363	$var_start = $collation->next_reading( $var_start, $rep_wit );
e49731d7	364	}
e2902068	365	push( @variant_readings, $var_end );
e49731d7	366
e2902068	367	# Go through the variant readings, and if we find a lemma reading that
	368	# hasn't yet been collapsed with a reading, equate them. If we do
	369	# not, keep them to push onto the end of all_readings.
7854e12e	370	# TODO replace this with proper mini-collation
e2902068	371	my @remaining_readings;
e49731d7	372	my $last_index = 0;
1b9423d5	373	my $curr_pos = 0;
e2902068	374	foreach my $w ( @variant_readings ) {
e49731d7	375	my $word = $w->label();
e49731d7	376	my $matched = 0;
e2902068	377	foreach my $idx ( $last_index .. $#distinct_readings ) {
7854e12e	378	my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
e49731d7	379	if( $word eq cmp_str( $l ) ) {
	380	next if exists( $collapsed{ $l->label } )
	381	&& $collapsed{ $l->label } eq $l;
	382	$matched = 1;
	383	$last_index = $idx if $detranspose;
e2902068	384	# Collapse the readings.
e2902068	385	printf STDERR "Merging readings %s/%s and %s/%s\n",
e49731d7	386	$l->name, $l->label, $w->name, $w->label;
e2902068	387	$collation->merge_readings( $l, $w );
e49731d7	388	$collapsed{ $l->label } = $l;
e2902068	389	# Now collapse any multiple paths to and from the reading.
e2902068	390	remove_duplicate_paths( $collation,
7854e12e	391	$collation->prior_reading( $l, $rep_wit ), $l );
e2902068	392	remove_duplicate_paths( $collation, $l,
7854e12e	393	$collation->next_reading( $l, $rep_wit ) );
1b9423d5	394	last;
e49731d7	395	}
b49c4318	396	}
7854e12e	397	push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
b49c4318	398	}
e2902068	399	push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
b49c4318	400	}
	401	}
	402
e2902068	403	=item B<remove_duplicate_paths>
2ceca8c3	404
e2902068	405	remove_duplicate_paths( $collation, $from, $to );
2ceca8c3	406
e2902068	407	Given two readings, reduce the number of paths between those readings to
7854e12e	408	a set of unique paths.
2ceca8c3	409
	410	=cut
	411
7854e12e	412	# TODO wonder if this is necessary
e2902068	413	sub remove_duplicate_paths {
e2902068	414	my( $collation, $from, $to ) = @_;
7854e12e	415	my %seen_paths;
	416	foreach my $p ( $from->edges_to( $to ) ) {
	417	if( exists $seen_paths{$p->name} ) {
	418	$collation->del_path( $p );
b49c4318	419	} else {
7854e12e	420	$seen_paths{$p->name} = 1;
b49c4318	421	}
	422	}
	423	}
	424
e2902068	425	# Helper function. Given a witness sigil, if it is a post-correctione
	426	# sigil,return the base witness. If not, return a false value.
	427	sub _is_post_corr {
	428	my( $sigil ) = @_;
930ff666	429	if( $sigil =~ /^(.?)(\s\(?p\.\s*c\.\)?)$/ ) {
e2902068	430	return $1;
	431	}
	432	return undef;
	433	}
	434
2ceca8c3	435	=item B<cmp_str>
	436
	437	Pretend you never saw this method. Really it needs to not be hardcoded.
	438
	439	=cut
	440
b49c4318	441	sub cmp_str {
e2902068	442	my( $reading ) = @_;
e2902068	443	my $word = $reading->label();
b49c4318	444	$word = lc( $word );
	445	$word =~ s/\W//g;
	446	$word =~ s/v/u/g;
	447	$word =~ s/j/i/g;
	448	$word =~ s/cha/ca/g;
	449	$word =~ s/quatuor/quattuor/g;
	450	$word =~ s/ioannes/iohannes/g;
	451	return $word;
	452	}
	453
2ceca8c3	454	=back
	455
	456	=head1 LICENSE
	457
	458	This package is free software and is provided "as is" without express
	459	or implied warranty. You can redistribute it and/or modify it under
	460	the same terms as Perl itself.
	461
	462	=head1 AUTHOR
	463
	464	Tara L Andrews, aurum@cpan.org
	465
	466	=cut
	467
b49c4318	468	1;