[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, %opts );

Takes an initialized graph and a set of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

sub parse {
    my( $tradition, %opts ) = @_;

    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
    load( $format_mod );
    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
    merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

sub merge_base {
    my( $collation, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $collation );

    my %all_witnesses;
    foreach my $app ( @app_entries ) {
	my( $line, $num ) = split( /\./, $app->{_id} );
	# DEBUG with a short graph
	# last if $line > 2;
	# DEBUG for problematic entries
	my $scrutinize = "";
	my $first_line_reading = $base_line_starts[ $line ];
	my $too_far = $base_line_starts[ $line+1 ];
	
	my $lemma = $app->{rdg_0};
	my $seq = 1; 
	# Is this the Nth occurrence of this reading in the line?
	if( $lemma =~ s/(_)?(\d)$// ) {
	    $seq = $2;
	}
	my @lemma_words = split( /\s+/, $lemma );
	
	# Now search for the lemma words within this line.
	my $lemma_start = $first_line_reading;
	my $lemma_end;
	my %seen;
	while( $lemma_start ne $too_far ) {
	    # Loop detection
	    if( $seen{ $lemma_start->name() } ) {
		warn "Detected loop at " . $lemma_start->name() . 
		    ", ref $line,$num";
		last;
	    }
	    $seen{ $lemma_start->name() } = 1;
	    
	    # Try to match the lemma.
	    my $unmatch = 0;
	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
		$lemma_words[0] . "...\n"
		if "$line.$num" eq $scrutinize;
	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
		# Skip it if we need a match that is not the first.
		if( --$seq < 1 ) {
		    # Now we have to compare the rest of the words here.
		    if( scalar( @lemma_words ) > 1 ) {
			my $next_reading = 
			    $collation->next_reading( $lemma_start );
			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
			    printf STDERR "Now matching %s against %s\n", 
				    cmp_str($next_reading), $w
				if "$line.$num" eq $scrutinize;
			    if( $w ne cmp_str($next_reading) ) {
				$unmatch = 1;
				last;
			    } else {
				$lemma_end = $next_reading;
				$next_reading = 
				    $collation->next_reading( $lemma_end );
			    }
			}
		    } else {
			$lemma_end = $lemma_start;
		    }
		} else {
		    $unmatch = 1;
		}
	    }
	    last unless ( $unmatch || !defined( $lemma_end ) );
	    $lemma_end = undef;
	    $lemma_start = $collation->next_reading( $lemma_start );
	}
	
	unless( $lemma_end ) {
	    warn "No match found for @lemma_words at $line.$num";
	    next;
	} else {
	    # These are no longer common readings; unmark them as such.
	    my @lemma_readings = $collation->reading_sequence( $lemma_start, 
						     $lemma_end );
	    map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings;
	}
	
	# Now we have our lemma readings; we add the variant readings
	# to the collation.
	
	# Keep track of the start and end point of each reading for later
	# reading collapse.
	my @readings = ( $lemma_start, $lemma_end );

	# For each reading that is not rdg_0, we make a chain of readings
	# and connect them to the anchor.  Edges are named after the mss
	# that are relevant.
	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	    next if $k eq 'rdg_0'; # that's the lemma.
	    # TODO look at the lemma for any p.c. readings, and add
	    # them explicitly!
	    my @variant = split( /\s+/, $app->{$k} );
	    @variant = () if $app->{$k} eq '/'; # This is an omission.
	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
	    
	    unless( @mss ) {
		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
		next;
	    }
	    
	    # Keep track of what witnesses we have seen.
	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	    
	    # Make the variant into a set of readings.
	    my $ctr = 0;
	    my $last_reading = $collation->prior_reading( $lemma_start );
	    my $var_start;
	    foreach my $vw ( @variant ) {
		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
		my $vwreading = $collation->add_reading( $vwname );
		$vwreading->text( $vw );
		$vwreading->make_variant();
		foreach ( @mss ) {
		    $collation->add_path( $last_reading, $vwreading, $_ );
		}
		$var_start = $vwreading unless $var_start;
		$last_reading = $vwreading;
	    }
	    # Now hook it up at the end.
	    foreach ( @mss ) {
		$collation->add_path( $last_reading, 
				      $collation->next_word( $lemma_end ),
				      $_ );
	    }
	    
	    if( $var_start ) { # if it wasn't an empty reading
		push( @readings, $var_start, $last_reading );
	    }
	}

	# Now collate and collapse the identical readings within the collation.
	collate_variants( $collation, @readings );
    }

    # Now make the witness objects
    foreach my $w ( keys %all_witnesses ) {
	my $base = _is_post_corr( $w );
	if( $base ) {
	    my $pctag = substr( $w, length( $base ) );
	    my $existing_wit = $collation->tradition->witness( $base );
	    unless( $existing_wit ) {
		$existing_wit = $collation->tradition->add_witness( $base );
	    }
	    $existing_wit->post_correctione( $pctag );
	} else {
	    $collation->tradition->add_witness( $w )
		unless $collation->tradition->witness( $w );
	}
    }

    # Now walk paths and calculate positions.
    my @common_readings = 
	$collation->walk_and_expand_base( $collation->reading( '#END#' ) );
    $collation->calculate_positions( @common_readings );
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the
words as simple linear readings to the collation, and returns a
list of readings that represent the beginning of lines. This collation
is now the starting point for application of apparatus entries in
merge_base, e.g. from a CSV file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $collation ) = @_;
    
    # This array gives the first reading for each line.  We put the
    # common starting point in line zero.
    my $last_reading = $collation->start();
    my $lineref_array = [ $last_reading ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    while(<BASE>) {
	# Make the readings, and connect them up for the base, but
	# also save the first reading of each line in an array for the
	# purpose.
	# TODO use configurable reading separator
	chomp;
	my @words = split;
	my $started = 0;
	my $wordref = 0;
	my $lineref = scalar @$lineref_array;
	foreach my $w ( @words ) {
	    my $readingref = join( ',', $lineref, ++$wordref );
	    my $reading = $collation->add_reading( $readingref );
	    $reading->text( $w );
	    $reading->make_common();
	    unless( $started ) {
		push( @$lineref_array, $reading );
		$started = 1;
	    }
	    if( $last_reading ) {
		my $path = $collation->add_path( $last_reading, $reading, 
						 "base text" );
		$path->set_attribute( 'class', 'basetext' );
		$last_reading = $reading;
	    } # TODO there should be no else here...
	}
    }
    close BASE;
    # Ending point for all texts
    my $endpoint = $collation->add_reading( '#END#' );
    $collation->add_path( $last_reading, $endpoint, "base text" );
    push( @$lineref_array, $endpoint );

    return( @$lineref_array );
}

=item B<collate_variants>

collate_variants( $collation, @readings )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those readings that are identical.  The
collation is a Text::Tradition::Collation object; the elements of
@readings are Text::Tradition::Collation::Reading objects that appear
on the collation graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

sub collate_variants {
    my( $collation, @readings ) = @_;
    my $lemma_start = shift @readings;
    my $lemma_end = shift @readings;
    my $detranspose = 0;

    # We need to calculate positions at this point, which is where
    # we are getting the implicit information from the apparatus.

    # Start the list of distinct readings with those readings in the lemma.
    my @distinct_readings;
    my $position = 0;
    while( $lemma_start ne $lemma_end ) {
	push( @distinct_readings, [ $lemma_start, 'base text', $position++ ] );
	$lemma_start = $collation->next_word( $lemma_start );
    } 
    push( @distinct_readings, [ $lemma_end, 'base text', $position++ ] );
    

    while( scalar @readings ) {
	my( $var_start, $var_end ) = splice( @readings, 0, 2 );

	# I want to look at the readings in the variant and lemma, and
	# collapse readings that are the same word.  This is mini-collation.
	# Each word in the 'main' list can only be collapsed once with a
	# word from the current reading.
	my %collapsed = ();

	# Get the label. There will only be one outgoing path to start
	# with, so this is safe.
	my @out = $var_start->outgoing();
	my $var_label = $out[0]->label();

	my @variant_readings;
	while( $var_start ne $var_end ) {
	    push( @variant_readings, $var_start );
	    $var_start = $collation->next_word( $var_start, $var_label );
	}
	push( @variant_readings, $var_end );

	# Go through the variant readings, and if we find a lemma reading that
	# hasn't yet been collapsed with a reading, equate them.  If we do
	# not, keep them to push onto the end of all_readings.
	my @remaining_readings;
	my $last_index = 0;
	my $curr_pos = 0;
	foreach my $w ( @variant_readings ) {
	    my $word = $w->label();
	    my $matched = 0;
	    foreach my $idx ( $last_index .. $#distinct_readings ) {
		my( $l, $pathlabel, $pos ) = @{$distinct_readings[$idx]};
		if( $word eq cmp_str( $l ) ) {
		    next if exists( $collapsed{ $l->label } )
			&& $collapsed{ $l->label } eq $l;
		    $matched = 1;
		    $last_index = $idx if $detranspose;
		    # Collapse the readings.
		    printf STDERR "Merging readings %s/%s and %s/%s\n", 
		        $l->name, $l->label, $w->name, $w->label;
		    $collation->merge_readings( $l, $w );
		    $collapsed{ $l->label } = $l;
		    # Now collapse any multiple paths to and from the reading.
 		    remove_duplicate_paths( $collation, 
 				    $collation->prior_word( $l, $pathlabel ), $l );
 		    remove_duplicate_paths( $collation, $l, 
 				    $collation->next_word( $l, $pathlabel ) );
		    $curr_pos = $pos;
		    last;
		}
	    }
	    push( @remaining_readings, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
	}
	push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
    }

    # Now set the positions of all the readings in this variation.
    #$DB::single = 1;
    print STDERR "Readings and their positions are:\n";
    foreach my $n ( @distinct_readings ) {
	printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
    }
}

=item B<remove_duplicate_paths>

remove_duplicate_paths( $collation, $from, $to );

Given two readings, reduce the number of paths between those readings to
one.  If neither path represents a base text, combine their labels.

=cut

sub remove_duplicate_paths {
    my( $collation, $from, $to ) = @_;
    my @paths = $from->paths_to( $to );
    if( scalar @paths > 1 ) {
	my @base = grep { $_->label eq 'base text' } @paths;
	if ( scalar @base ) {
	    # Remove the paths that are not base.
	    foreach my $e ( @paths ) {
		$collation->del_path( $e )
		    unless $e eq $base[0];
	    }
	} else {
	    # Combine the paths into one.
	    my $new_path_name = join( ', ', map { $_->label() } @paths );
	    my $new_path = shift @paths;
	    $new_path->set_attribute( 'label', $new_path_name );
	    foreach my $e ( @paths ) {
		$collation->del_path( $e );
	    }
	}
    }
}

# Helper function. Given a witness sigil, if it is a post-correctione
# sigil,return the base witness.  If not, return a false value.
sub _is_post_corr {
    my( $sigil ) = @_;
    if( $sigil =~ /^(.*?)(\s*\(p\.\s*c\.\))$/ ) {
	return $1;
    }
    return undef;
}

=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

sub cmp_str {
    my( $reading ) = @_;
    my $word = $reading->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
b49c4318	6
2ceca8c3	7	=head1 NAME
	8
	9	Text::Tradition::Parser::BaseText
	10
	11	=head1 SYNOPSIS
	12
	13	use Text::Tradition::Parser::BaseText qw( merge_base );
	14	merge_base( $graph, 'reference.txt', @apparatus_entries )
	15
	16	=head1 DESCRIPTION
	17
	18	For an overview of the package, see the documentation for the
	19	Text::Tradition::Graph module.
	20
	21	This module is meant for use with certain of the other Parser classes
	22	- whenever a list of variants is given with reference to a base text,
	23	these must be joined into a single collation. The parser should
	24	therefore make a list of variants and their locations, and BaseText
	25	will join those listed variants onto the reference text.
	26
	27	=head1 SUBROUTINES
	28
	29	=over
	30
52ce987f	31	=item B<parse>
	32
	33	parse( $graph, %opts );
	34
	35	Takes an initialized graph and a set of options, which must include:
	36	- 'base' - the base text referenced by the variants
	37	- 'format' - the format of the variant list
	38	- 'data' - the variants, in the given format.
	39
	40	=cut
	41
	42	sub parse {
e2902068	43	my( $tradition, %opts ) = @_;
52ce987f	44
	45	my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
	46	load( $format_mod );
	47	my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
e2902068	48	merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
52ce987f	49	}
52ce987f	50
2ceca8c3	51	=item B<merge_base>
	52
	53	merge_base( $graph, 'reference.txt', @apparatus_entries )
	54
	55	Takes three arguments: a newly-initialized Text::Tradition::Graph
	56	object, a text file containing the reference text, and a list of
	57	variants (apparatus entries). Adds the base text to the graph, and
	58	joins the variants to that.
	59
	60	The list of variants is an array of hash references; each hash takes
	61	the form
	62	{ '_id' => line reference,
	63	'rdg_0' => lemma reading,
	64	'rdg_1' => first variant,
	65	... # and so on until all distinct readings are listed
	66	'WitnessA' => 'rdg_0',
	67	'WitnessB' => 'rdg_1',
	68	... # and so on until all witnesses are listed with their readings
	69	}
	70
	71	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	72	begin with an underscore is assumed to be a witness name. Any 'meta'
	73	information to be passed must be passed in a key with a leading
	74	underscore in its name.
	75
	76	=cut
	77
b49c4318	78	sub merge_base {
e2902068	79	my( $collation, $base_file, @app_entries ) = @_;
e2902068	80	my @base_line_starts = read_base( $base_file, $collation );
b49c4318	81
52ce987f	82	my %all_witnesses;
b49c4318	83	foreach my $app ( @app_entries ) {
	84	my( $line, $num ) = split( /\./, $app->{_id} );
	85	# DEBUG with a short graph
	86	# last if $line > 2;
2ceca8c3	87	# DEBUG for problematic entries
e49731d7	88	my $scrutinize = "";
e2902068	89	my $first_line_reading = $base_line_starts[ $line ];
b49c4318	90	my $too_far = $base_line_starts[ $line+1 ];
	91
	92	my $lemma = $app->{rdg_0};
	93	my $seq = 1;
	94	# Is this the Nth occurrence of this reading in the line?
	95	if( $lemma =~ s/(_)?(\d)$// ) {
	96	$seq = $2;
	97	}
	98	my @lemma_words = split( /\s+/, $lemma );
	99
	100	# Now search for the lemma words within this line.
e2902068	101	my $lemma_start = $first_line_reading;
b49c4318	102	my $lemma_end;
	103	my %seen;
	104	while( $lemma_start ne $too_far ) {
	105	# Loop detection
	106	if( $seen{ $lemma_start->name() } ) {
	107	warn "Detected loop at " . $lemma_start->name() .
	108	", ref $line,$num";
	109	last;
	110	}
	111	$seen{ $lemma_start->name() } = 1;
	112
	113	# Try to match the lemma.
	114	my $unmatch = 0;
	115	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	116	$lemma_words[0] . "...\n"
	117	if "$line.$num" eq $scrutinize;
	118	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	119	# Skip it if we need a match that is not the first.
	120	if( --$seq < 1 ) {
	121	# Now we have to compare the rest of the words here.
	122	if( scalar( @lemma_words ) > 1 ) {
e2902068	123	my $next_reading =
e2902068	124	$collation->next_reading( $lemma_start );
b49c4318	125	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
b49c4318	126	printf STDERR "Now matching %s against %s\n",
e2902068	127	cmp_str($next_reading), $w
b49c4318	128	if "$line.$num" eq $scrutinize;
e2902068	129	if( $w ne cmp_str($next_reading) ) {
b49c4318	130	$unmatch = 1;
	131	last;
	132	} else {
e2902068	133	$lemma_end = $next_reading;
	134	$next_reading =
	135	$collation->next_reading( $lemma_end );
b49c4318	136	}
	137	}
	138	} else {
	139	$lemma_end = $lemma_start;
	140	}
	141	} else {
	142	$unmatch = 1;
	143	}
	144	}
	145	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	146	$lemma_end = undef;
e2902068	147	$lemma_start = $collation->next_reading( $lemma_start );
b49c4318	148	}
	149
	150	unless( $lemma_end ) {
	151	warn "No match found for @lemma_words at $line.$num";
	152	next;
	153	} else {
e2902068	154	# These are no longer common readings; unmark them as such.
e2902068	155	my @lemma_readings = $collation->reading_sequence( $lemma_start,
b49c4318	156	$lemma_end );
e2902068	157	map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings;
b49c4318	158	}
b49c4318	159
e2902068	160	# Now we have our lemma readings; we add the variant readings
e2902068	161	# to the collation.
b49c4318	162
e49731d7	163	# Keep track of the start and end point of each reading for later
e2902068	164	# reading collapse.
e49731d7	165	my @readings = ( $lemma_start, $lemma_end );
e49731d7	166
e2902068	167	# For each reading that is not rdg_0, we make a chain of readings
b49c4318	168	# and connect them to the anchor. Edges are named after the mss
	169	# that are relevant.
	170	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	171	next if $k eq 'rdg_0'; # that's the lemma.
e2902068	172	# TODO look at the lemma for any p.c. readings, and add
e2902068	173	# them explicitly!
b49c4318	174	my @variant = split( /\s+/, $app->{$k} );
	175	@variant = () if $app->{$k} eq '/'; # This is an omission.
	176	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	177
	178	unless( @mss ) {
	179	print STDERR "Skipping '@variant' at $line.$num: no mss\n";
	180	next;
	181	}
	182
e2902068	183	# Keep track of what witnesses we have seen.
52ce987f	184	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318	185
e2902068	186	# Make the variant into a set of readings.
b49c4318	187	my $ctr = 0;
e2902068	188	my $last_reading = $collation->prior_reading( $lemma_start );
b49c4318	189	my $var_start;
	190	foreach my $vw ( @variant ) {
	191	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
e2902068	192	my $vwreading = $collation->add_reading( $vwname );
	193	$vwreading->text( $vw );
	194	$vwreading->make_variant();
	195	foreach ( @mss ) {
	196	$collation->add_path( $last_reading, $vwreading, $_ );
	197	}
	198	$var_start = $vwreading unless $var_start;
	199	$last_reading = $vwreading;
b49c4318	200	}
b49c4318	201	# Now hook it up at the end.
e2902068	202	foreach ( @mss ) {
	203	$collation->add_path( $last_reading,
	204	$collation->next_word( $lemma_end ),
	205	$_ );
	206	}
b49c4318	207
e49731d7	208	if( $var_start ) { # if it wasn't an empty reading
e2902068	209	push( @readings, $var_start, $last_reading );
e49731d7	210	}
b49c4318	211	}
e49731d7	212
e2902068	213	# Now collate and collapse the identical readings within the collation.
e2902068	214	collate_variants( $collation, @readings );
b49c4318	215	}
b49c4318	216
e2902068	217	# Now make the witness objects
52ce987f	218	foreach my $w ( keys %all_witnesses ) {
e2902068	219	my $base = _is_post_corr( $w );
	220	if( $base ) {
	221	my $pctag = substr( $w, length( $base ) );
	222	my $existing_wit = $collation->tradition->witness( $base );
	223	unless( $existing_wit ) {
	224	$existing_wit = $collation->tradition->add_witness( $base );
	225	}
	226	$existing_wit->post_correctione( $pctag );
	227	} else {
	228	$collation->tradition->add_witness( $w )
	229	unless $collation->tradition->witness( $w );
52ce987f	230	}
b49c4318	231	}
e2902068	232
	233	# Now walk paths and calculate positions.
	234	my @common_readings =
	235	$collation->walk_and_expand_base( $collation->reading( '#END#' ) );
	236	$collation->calculate_positions( @common_readings );
b49c4318	237	}
b49c4318	238
2ceca8c3	239	=item B<read_base>
2ceca8c3	240
e2902068	241	my @line_beginnings = read_base( 'reference.txt', $collation );
2ceca8c3	242
e2902068	243	Takes a text file and a (presumed empty) collation object, adds the
	244	words as simple linear readings to the collation, and returns a
	245	list of readings that represent the beginning of lines. This collation
	246	is now the starting point for application of apparatus entries in
	247	merge_base, e.g. from a CSV file or a Classical Text Editor file.
2ceca8c3	248
2ceca8c3	249	=cut
b49c4318	250
b49c4318	251	sub read_base {
e2902068	252	my( $base_file, $collation ) = @_;
b49c4318	253
e2902068	254	# This array gives the first reading for each line. We put the
b49c4318	255	# common starting point in line zero.
e2902068	256	my $last_reading = $collation->start();
e2902068	257	my $lineref_array = [ $last_reading ]; # There is no line zero.
b49c4318	258
	259	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
	260	while(<BASE>) {
e2902068	261	# Make the readings, and connect them up for the base, but
	262	# also save the first reading of each line in an array for the
	263	# purpose.
	264	# TODO use configurable reading separator
b49c4318	265	chomp;
	266	my @words = split;
	267	my $started = 0;
	268	my $wordref = 0;
	269	my $lineref = scalar @$lineref_array;
	270	foreach my $w ( @words ) {
e2902068	271	my $readingref = join( ',', $lineref, ++$wordref );
	272	my $reading = $collation->add_reading( $readingref );
	273	$reading->text( $w );
	274	$reading->make_common();
b49c4318	275	unless( $started ) {
e2902068	276	push( @$lineref_array, $reading );
b49c4318	277	$started = 1;
b49c4318	278	}
e2902068	279	if( $last_reading ) {
	280	my $path = $collation->add_path( $last_reading, $reading,
	281	"base text" );
	282	$path->set_attribute( 'class', 'basetext' );
	283	$last_reading = $reading;
b49c4318	284	} # TODO there should be no else here...
	285	}
	286	}
	287	close BASE;
	288	# Ending point for all texts
e2902068	289	my $endpoint = $collation->add_reading( '#END#' );
e2902068	290	$collation->add_path( $last_reading, $endpoint, "base text" );
b49c4318	291	push( @$lineref_array, $endpoint );
	292
	293	return( @$lineref_array );
	294	}
	295
e49731d7	296	=item B<collate_variants>
2ceca8c3	297
e2902068	298	collate_variants( $collation, @readings )
2ceca8c3	299
e49731d7	300	Given a set of readings in the form
e49731d7	301	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
e2902068	302	walks through each to identify those readings that are identical. The
	303	collation is a Text::Tradition::Collation object; the elements of
	304	@readings are Text::Tradition::Collation::Reading objects that appear
	305	on the collation graph.
b49c4318	306
2ceca8c3	307	TODO: Handle collapsed and non-collapsed transpositions.
	308
	309	=cut
b49c4318	310
e49731d7	311	sub collate_variants {
e2902068	312	my( $collation, @readings ) = @_;
e49731d7	313	my $lemma_start = shift @readings;
e49731d7	314	my $lemma_end = shift @readings;
52ce987f	315	my $detranspose = 0;
b49c4318	316
1b9423d5	317	# We need to calculate positions at this point, which is where
	318	# we are getting the implicit information from the apparatus.
	319
e2902068	320	# Start the list of distinct readings with those readings in the lemma.
e2902068	321	my @distinct_readings;
1b9423d5	322	my $position = 0;
b49c4318	323	while( $lemma_start ne $lemma_end ) {
e2902068	324	push( @distinct_readings, [ $lemma_start, 'base text', $position++ ] );
e2902068	325	$lemma_start = $collation->next_word( $lemma_start );
b49c4318	326	}
e2902068	327	push( @distinct_readings, [ $lemma_end, 'base text', $position++ ] );
b49c4318	328
e49731d7	329
	330	while( scalar @readings ) {
	331	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
	332
e2902068	333	# I want to look at the readings in the variant and lemma, and
e2902068	334	# collapse readings that are the same word. This is mini-collation.
e49731d7	335	# Each word in the 'main' list can only be collapsed once with a
	336	# word from the current reading.
	337	my %collapsed = ();
	338
e2902068	339	# Get the label. There will only be one outgoing path to start
e49731d7	340	# with, so this is safe.
	341	my @out = $var_start->outgoing();
	342	my $var_label = $out[0]->label();
	343
e2902068	344	my @variant_readings;
e49731d7	345	while( $var_start ne $var_end ) {
e2902068	346	push( @variant_readings, $var_start );
e2902068	347	$var_start = $collation->next_word( $var_start, $var_label );
e49731d7	348	}
e2902068	349	push( @variant_readings, $var_end );
e49731d7	350
e2902068	351	# Go through the variant readings, and if we find a lemma reading that
	352	# hasn't yet been collapsed with a reading, equate them. If we do
	353	# not, keep them to push onto the end of all_readings.
	354	my @remaining_readings;
e49731d7	355	my $last_index = 0;
1b9423d5	356	my $curr_pos = 0;
e2902068	357	foreach my $w ( @variant_readings ) {
e49731d7	358	my $word = $w->label();
e49731d7	359	my $matched = 0;
e2902068	360	foreach my $idx ( $last_index .. $#distinct_readings ) {
e2902068	361	my( $l, $pathlabel, $pos ) = @{$distinct_readings[$idx]};
e49731d7	362	if( $word eq cmp_str( $l ) ) {
	363	next if exists( $collapsed{ $l->label } )
	364	&& $collapsed{ $l->label } eq $l;
	365	$matched = 1;
	366	$last_index = $idx if $detranspose;
e2902068	367	# Collapse the readings.
e2902068	368	printf STDERR "Merging readings %s/%s and %s/%s\n",
e49731d7	369	$l->name, $l->label, $w->name, $w->label;
e2902068	370	$collation->merge_readings( $l, $w );
e49731d7	371	$collapsed{ $l->label } = $l;
e2902068	372	# Now collapse any multiple paths to and from the reading.
	373	remove_duplicate_paths( $collation,
	374	$collation->prior_word( $l, $pathlabel ), $l );
	375	remove_duplicate_paths( $collation, $l,
	376	$collation->next_word( $l, $pathlabel ) );
1b9423d5	377	$curr_pos = $pos;
1b9423d5	378	last;
e49731d7	379	}
b49c4318	380	}
e2902068	381	push( @remaining_readings, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
b49c4318	382	}
e2902068	383	push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
b49c4318	384	}
1b9423d5	385
e2902068	386	# Now set the positions of all the readings in this variation.
1b9423d5	387	#$DB::single = 1;
e2902068	388	print STDERR "Readings and their positions are:\n";
e2902068	389	foreach my $n ( @distinct_readings ) {
1b9423d5	390	printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
1b9423d5	391	}
b49c4318	392	}
b49c4318	393
e2902068	394	=item B<remove_duplicate_paths>
2ceca8c3	395
e2902068	396	remove_duplicate_paths( $collation, $from, $to );
2ceca8c3	397
e2902068	398	Given two readings, reduce the number of paths between those readings to
e2902068	399	one. If neither path represents a base text, combine their labels.
2ceca8c3	400
	401	=cut
	402
e2902068	403	sub remove_duplicate_paths {
	404	my( $collation, $from, $to ) = @_;
	405	my @paths = $from->paths_to( $to );
	406	if( scalar @paths > 1 ) {
	407	my @base = grep { $_->label eq 'base text' } @paths;
b49c4318	408	if ( scalar @base ) {
e2902068	409	# Remove the paths that are not base.
	410	foreach my $e ( @paths ) {
	411	$collation->del_path( $e )
b49c4318	412	unless $e eq $base[0];
	413	}
	414	} else {
e2902068	415	# Combine the paths into one.
	416	my $new_path_name = join( ', ', map { $_->label() } @paths );
	417	my $new_path = shift @paths;
	418	$new_path->set_attribute( 'label', $new_path_name );
	419	foreach my $e ( @paths ) {
	420	$collation->del_path( $e );
b49c4318	421	}
	422	}
	423	}
	424	}
	425
e2902068	426	# Helper function. Given a witness sigil, if it is a post-correctione
	427	# sigil,return the base witness. If not, return a false value.
	428	sub _is_post_corr {
	429	my( $sigil ) = @_;
	430	if( $sigil =~ /^(.?)(\s\(p\.\s*c\.\))$/ ) {
	431	return $1;
	432	}
	433	return undef;
	434	}
	435
2ceca8c3	436	=item B<cmp_str>
	437
	438	Pretend you never saw this method. Really it needs to not be hardcoded.
	439
	440	=cut
	441
b49c4318	442	sub cmp_str {
e2902068	443	my( $reading ) = @_;
e2902068	444	my $word = $reading->label();
b49c4318	445	$word = lc( $word );
	446	$word =~ s/\W//g;
	447	$word =~ s/v/u/g;
	448	$word =~ s/j/i/g;
	449	$word =~ s/cha/ca/g;
	450	$word =~ s/quatuor/quattuor/g;
	451	$word =~ s/ioannes/iohannes/g;
	452	return $word;
	453	}
	454
2ceca8c3	455	=back
	456
	457	=head1 LICENSE
	458
	459	This package is free software and is provided "as is" without express
	460	or implied warranty. You can redistribute it and/or modify it under
	461	the same terms as Perl itself.
	462
	463	=head1 AUTHOR
	464
	465	Tara L Andrews, aurum@cpan.org
	466
	467	=cut
	468
b49c4318	469	1;