[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, %opts );

Takes an initialized graph and a set of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

sub parse {
    my( $graph, %opts ) = @_;

    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
    load( $format_mod );
    my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
    merge_base( $graph, $opts{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

sub merge_base {
    my( $graph, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $graph );

    my %all_witnesses;
    foreach my $app ( @app_entries ) {
	my( $line, $num ) = split( /\./, $app->{_id} );
	# DEBUG with a short graph
	# last if $line > 2;
	# DEBUG for problematic entries
	my $scrutinize = "";
	my $first_line_node = $base_line_starts[ $line ];
	my $too_far = $base_line_starts[ $line+1 ];
	
	my $lemma = $app->{rdg_0};
	my $seq = 1; 
	# Is this the Nth occurrence of this reading in the line?
	if( $lemma =~ s/(_)?(\d)$// ) {
	    $seq = $2;
	}
	my @lemma_words = split( /\s+/, $lemma );
	
	# Now search for the lemma words within this line.
	my $lemma_start = $first_line_node;
	my $lemma_end;
	my %seen;
	while( $lemma_start ne $too_far ) {
	    # Loop detection
	    if( $seen{ $lemma_start->name() } ) {
		warn "Detected loop at " . $lemma_start->name() . 
		    ", ref $line,$num";
		last;
	    }
	    $seen{ $lemma_start->name() } = 1;
	    
	    # Try to match the lemma.
	    my $unmatch = 0;
	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
		$lemma_words[0] . "...\n"
		if "$line.$num" eq $scrutinize;
	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
		# Skip it if we need a match that is not the first.
		if( --$seq < 1 ) {
		    # Now we have to compare the rest of the words here.
		    if( scalar( @lemma_words ) > 1 ) {
			my $next_node = $graph->next_word( $lemma_start );
			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
			    printf STDERR "Now matching %s against %s\n", 
				    cmp_str($next_node), $w
				if "$line.$num" eq $scrutinize;
			    if( $w ne cmp_str($next_node) ) {
				$unmatch = 1;
				last;
			    } else {
				$lemma_end = $next_node;
				$next_node = $graph->next_word( $lemma_end );
			    }
			}
		    } else {
			$lemma_end = $lemma_start;
		    }
		} else {
		    $unmatch = 1;
		}
	    }
	    last unless ( $unmatch || !defined( $lemma_end ) );
	    $lemma_end = undef;
	    $lemma_start = $graph->next_word( $lemma_start );
	}
	
	unless( $lemma_end ) {
	    warn "No match found for @lemma_words at $line.$num";
	    next;
	} else {
	    # These are no longer common nodes; unmark them as such.
	    my @lemma_nodes = $graph->node_sequence( $lemma_start, 
						     $lemma_end );
	    map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
	}
	
	# Now we have our lemma nodes; we add the variant nodes to the graph.
	
	# Keep track of the start and end point of each reading for later
	# node collapse.
	my @readings = ( $lemma_start, $lemma_end );

	# For each reading that is not rdg_0, we make a chain of nodes
	# and connect them to the anchor.  Edges are named after the mss
	# that are relevant.
	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	    next if $k eq 'rdg_0'; # that's the lemma.
	    my @variant = split( /\s+/, $app->{$k} );
	    @variant = () if $app->{$k} eq '/'; # This is an omission.
	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
	    
	    unless( @mss ) {
		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
		next;
	    }
	    
	    # Determine the label name for the edges here.
	    my $edge_name = join(', ', @mss );
	    @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
	    
	    # Make the variant into a set of nodes.
	    my $ctr = 0;
	    my $last_node = $graph->prior_word( $lemma_start );
	    my $var_start;
	    foreach my $vw ( @variant ) {
		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
		my $vwnode = $graph->add_node( $vwname );
		$vwnode->set_attribute( 'label', $vw );
		$vwnode->set_attribute( 'class', 'variant' );
		$graph->add_edge( $last_node, $vwnode, $edge_name );
		$var_start = $vwnode unless $var_start;
		$last_node = $vwnode;
	    }
	    # Now hook it up at the end.
	    $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
					$edge_name );
	    
	    if( $var_start ) { # if it wasn't an empty reading
		push( @readings, $var_start, $last_node );
	    }
	}

	# Now collate and collapse the identical nodes within the graph.
	collate_variants( $graph, @readings );
    }

    ## Now in theory I have a graph.  I want to make it a little easier to
    ## read.  So I collapse nodes that have only one edge in and one edge
    ## out, and I do this by looking at the edges.
    
#     foreach my $edge ( $graph->edges() ) {
# 	my @out_edges = $edge->from()->outgoing();
# 	my @in_edges = $edge->to()->incoming();
	
# 	next if $edge->from() eq $graph->start();
# 	next if $edge->to()->name() eq '#END#';
# 	next unless scalar( @out_edges ) == 1;
# 	next unless scalar( @in_edges ) == 1;
# 	next unless $out_edges[0] eq $in_edges[0];
# 	# In theory if we've got this far, we're safe, but just to
# 	# double-check...
# 	next unless $out_edges[0] eq $edge;
	
# 	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
#     }

    # Now walk the path for each witness, so that we can do the
    # position calculations.
    my $paths = {};
    foreach my $w ( keys %all_witnesses ) {
	my $back = undef;
	if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
	    $back = $1;
	}
	my @wit_nodes = $graph->node_sequence( $graph->start, 
					       $graph->node( '#END#' ), 
					       $w, $back );
	my @wn_names = map { $_->name() } @wit_nodes;
	$paths->{$w} = \@wn_names;
    }
    $DB::single = 1;
    my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
    $graph->make_positions( \@common_nodes, $paths );
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $graph );

Takes a text file and a (presumed empty) graph object, adds the words
as simple linear nodes to the graph, and returns a list of nodes that
represent the beginning of lines. This graph is now the starting point
for application of apparatus entries in merge_base, e.g. from a CSV
file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $graph ) = @_;
    
    # This array gives the first node for each line.  We put the
    # common starting point in line zero.
    my $last_node = $graph->start();
    my $lineref_array = [ $last_node ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    while(<BASE>) {
	# Make the nodes, and connect them up for the base, but also
	# save the first node of each line in an array for the purpose.
	chomp;
	my @words = split;
	my $started = 0;
	my $wordref = 0;
	my $lineref = scalar @$lineref_array;
	foreach my $w ( @words ) {
	    my $noderef = join( ',', $lineref, ++$wordref );
	    my $node = $graph->add_node( $noderef );
	    $node->set_attribute( 'label', $w );
	    $node->set_attribute( 'class', 'common' );
	    unless( $started ) {
		push( @$lineref_array, $node );
		$started = 1;
	    }
	    if( $last_node ) {
		my $edge = $graph->add_edge( $last_node, $node, "base text" );
		$edge->set_attribute( 'class', 'basetext' );
		$last_node = $node;
	    } # TODO there should be no else here...
	}
    }
    close BASE;
    # Ending point for all texts
    my $endpoint = $graph->add_node( '#END#' );
    $graph->add_edge( $last_node, $endpoint, "base text" );
    push( @$lineref_array, $endpoint );

    return( @$lineref_array );
}

=item B<collate_variants>

collate_variants( $graph, @readings )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those nodes that are identical.  The
graph is a Text::Tradition::Graph object; the elements of @readings are
Graph::Easy::Node objects that appear on the graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

sub collate_variants {
    my( $graph, @readings ) = @_;
    my $lemma_start = shift @readings;
    my $lemma_end = shift @readings;
    my $detranspose = 0;

    # We need to calculate positions at this point, which is where
    # we are getting the implicit information from the apparatus.

    # Start the list of distinct nodes with those nodes in the lemma.
    my @distinct_nodes;
    my $position = 0;
    while( $lemma_start ne $lemma_end ) {
	push( @distinct_nodes, [ $lemma_start, 'base text', $position++ ] );
	$lemma_start = $graph->next_word( $lemma_start );
    } 
    push( @distinct_nodes, [ $lemma_end, 'base text', $position++ ] );
    

    while( scalar @readings ) {
	my( $var_start, $var_end ) = splice( @readings, 0, 2 );

	# I want to look at the nodes in the variant and lemma, and
	# collapse nodes that are the same word.  This is mini-collation.
	# Each word in the 'main' list can only be collapsed once with a
	# word from the current reading.
	my %collapsed = ();

	# Get the label. There will only be one outgoing edge to start
	# with, so this is safe.
	my @out = $var_start->outgoing();
	my $var_label = $out[0]->label();

	my @variant_nodes;
	while( $var_start ne $var_end ) {
	    push( @variant_nodes, $var_start );
	    $var_start = $graph->next_word( $var_start, $var_label );
	}
	push( @variant_nodes, $var_end );

	# Go through the variant nodes, and if we find a lemma node that
	# hasn't yet been collapsed with a node, equate them.  If we do
	# not, keep them to push onto the end of all_nodes.
	my @remaining_nodes;
	my $last_index = 0;
	my $curr_pos = 0;
	foreach my $w ( @variant_nodes ) {
	    my $word = $w->label();
	    my $matched = 0;
	    foreach my $idx ( $last_index .. $#distinct_nodes ) {
		my( $l, $edgelabel, $pos ) = @{$distinct_nodes[$idx]};
		if( $word eq cmp_str( $l ) ) {
		    next if exists( $collapsed{ $l->label } )
			&& $collapsed{ $l->label } eq $l;
		    $matched = 1;
		    $last_index = $idx if $detranspose;
		    # Collapse the nodes.
		    printf STDERR "Merging nodes %s/%s and %s/%s\n", 
		        $l->name, $l->label, $w->name, $w->label;
		    $graph->merge_nodes( $l, $w );
		    $collapsed{ $l->label } = $l;
		    # Now collapse any multiple edges to and from the node.
 		    remove_duplicate_edges( $graph, 
 				    $graph->prior_word( $l, $edgelabel ), $l );
 		    remove_duplicate_edges( $graph, $l, 
 				    $graph->next_word( $l, $edgelabel ) );
		    $curr_pos = $pos;
		    last;
		}
	    }
	    push( @remaining_nodes, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
	}
	push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
    }

    # Now set the positions of all the nodes in this variation.
    #$DB::single = 1;
    print STDERR "Nodes and their positions are:\n";
    foreach my $n ( @distinct_nodes ) {
	printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
    }
}

=item B<remove_duplicate_edges>

remove_duplicate_edges( $graph, $from, $to );

Given two nodes, reduce the number of edges between those nodes to
one.  If neither edge represents a base text, combine their labels.

=cut

sub remove_duplicate_edges {
    my( $graph, $from, $to ) = @_;
    my @edges = $from->edges_to( $to );
    if( scalar @edges > 1 ) {
	my @base = grep { $_->label eq 'base text' } @edges;
	if ( scalar @base ) {
	    # Remove the edges that are not base.
	    foreach my $e ( @edges ) {
		$graph->del_edge( $e )
		    unless $e eq $base[0];
	    }
	} else {
	    # Combine the edges into one.
	    my $new_edge_name = join( ', ', map { $_->label() } @edges );
	    my $new_edge = shift @edges;
	    $new_edge->set_attribute( 'label', $new_edge_name );
	    foreach my $e ( @edges ) {
		$graph->del_edge( $e );
	    }
	}
    }
}

=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

sub cmp_str {
    my( $node ) = @_;
    my $word = $node->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
b49c4318	6
2ceca8c3	7	=head1 NAME
	8
	9	Text::Tradition::Parser::BaseText
	10
	11	=head1 SYNOPSIS
	12
	13	use Text::Tradition::Parser::BaseText qw( merge_base );
	14	merge_base( $graph, 'reference.txt', @apparatus_entries )
	15
	16	=head1 DESCRIPTION
	17
	18	For an overview of the package, see the documentation for the
	19	Text::Tradition::Graph module.
	20
	21	This module is meant for use with certain of the other Parser classes
	22	- whenever a list of variants is given with reference to a base text,
	23	these must be joined into a single collation. The parser should
	24	therefore make a list of variants and their locations, and BaseText
	25	will join those listed variants onto the reference text.
	26
	27	=head1 SUBROUTINES
	28
	29	=over
	30
52ce987f	31	=item B<parse>
	32
	33	parse( $graph, %opts );
	34
	35	Takes an initialized graph and a set of options, which must include:
	36	- 'base' - the base text referenced by the variants
	37	- 'format' - the format of the variant list
	38	- 'data' - the variants, in the given format.
	39
	40	=cut
	41
	42	sub parse {
	43	my( $graph, %opts ) = @_;
	44
	45	my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
	46	load( $format_mod );
	47	my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
	48	merge_base( $graph, $opts{'base'}, @apparatus_entries );
	49	}
	50
2ceca8c3	51	=item B<merge_base>
	52
	53	merge_base( $graph, 'reference.txt', @apparatus_entries )
	54
	55	Takes three arguments: a newly-initialized Text::Tradition::Graph
	56	object, a text file containing the reference text, and a list of
	57	variants (apparatus entries). Adds the base text to the graph, and
	58	joins the variants to that.
	59
	60	The list of variants is an array of hash references; each hash takes
	61	the form
	62	{ '_id' => line reference,
	63	'rdg_0' => lemma reading,
	64	'rdg_1' => first variant,
	65	... # and so on until all distinct readings are listed
	66	'WitnessA' => 'rdg_0',
	67	'WitnessB' => 'rdg_1',
	68	... # and so on until all witnesses are listed with their readings
	69	}
	70
	71	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	72	begin with an underscore is assumed to be a witness name. Any 'meta'
	73	information to be passed must be passed in a key with a leading
	74	underscore in its name.
	75
	76	=cut
	77
b49c4318	78	sub merge_base {
	79	my( $graph, $base_file, @app_entries ) = @_;
	80	my @base_line_starts = read_base( $base_file, $graph );
	81
52ce987f	82	my %all_witnesses;
b49c4318	83	foreach my $app ( @app_entries ) {
	84	my( $line, $num ) = split( /\./, $app->{_id} );
	85	# DEBUG with a short graph
	86	# last if $line > 2;
2ceca8c3	87	# DEBUG for problematic entries
e49731d7	88	my $scrutinize = "";
b49c4318	89	my $first_line_node = $base_line_starts[ $line ];
	90	my $too_far = $base_line_starts[ $line+1 ];
	91
	92	my $lemma = $app->{rdg_0};
	93	my $seq = 1;
	94	# Is this the Nth occurrence of this reading in the line?
	95	if( $lemma =~ s/(_)?(\d)$// ) {
	96	$seq = $2;
	97	}
	98	my @lemma_words = split( /\s+/, $lemma );
	99
	100	# Now search for the lemma words within this line.
	101	my $lemma_start = $first_line_node;
	102	my $lemma_end;
	103	my %seen;
	104	while( $lemma_start ne $too_far ) {
	105	# Loop detection
	106	if( $seen{ $lemma_start->name() } ) {
	107	warn "Detected loop at " . $lemma_start->name() .
	108	", ref $line,$num";
	109	last;
	110	}
	111	$seen{ $lemma_start->name() } = 1;
	112
	113	# Try to match the lemma.
	114	my $unmatch = 0;
	115	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	116	$lemma_words[0] . "...\n"
	117	if "$line.$num" eq $scrutinize;
	118	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	119	# Skip it if we need a match that is not the first.
	120	if( --$seq < 1 ) {
	121	# Now we have to compare the rest of the words here.
	122	if( scalar( @lemma_words ) > 1 ) {
	123	my $next_node = $graph->next_word( $lemma_start );
	124	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	125	printf STDERR "Now matching %s against %s\n",
	126	cmp_str($next_node), $w
	127	if "$line.$num" eq $scrutinize;
	128	if( $w ne cmp_str($next_node) ) {
	129	$unmatch = 1;
	130	last;
	131	} else {
	132	$lemma_end = $next_node;
	133	$next_node = $graph->next_word( $lemma_end );
	134	}
	135	}
	136	} else {
	137	$lemma_end = $lemma_start;
	138	}
	139	} else {
	140	$unmatch = 1;
	141	}
	142	}
	143	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	144	$lemma_end = undef;
	145	$lemma_start = $graph->next_word( $lemma_start );
	146	}
	147
	148	unless( $lemma_end ) {
	149	warn "No match found for @lemma_words at $line.$num";
	150	next;
	151	} else {
	152	# These are no longer common nodes; unmark them as such.
153	my @lemma_nodes = $graph->node_sequence( $lemma_start,
154	$lemma_end );
155	map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
156	}
157
158	# Now we have our lemma nodes; we add the variant nodes to the graph.
159
e49731d7	160	# Keep track of the start and end point of each reading for later
	161	# node collapse.
	162	my @readings = ( $lemma_start, $lemma_end );
	163
b49c4318	164	# For each reading that is not rdg_0, we make a chain of nodes
	165	# and connect them to the anchor. Edges are named after the mss
	166	# that are relevant.
	167	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	168	next if $k eq 'rdg_0'; # that's the lemma.
	169	my @variant = split( /\s+/, $app->{$k} );
	170	@variant = () if $app->{$k} eq '/'; # This is an omission.
	171	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	172
	173	unless( @mss ) {
	174	print STDERR "Skipping '@variant' at $line.$num: no mss\n";
	175	next;
	176	}
	177
	178	# Determine the label name for the edges here.
	179	my $edge_name = join(', ', @mss );
52ce987f	180	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318	181
	182	# Make the variant into a set of nodes.
	183	my $ctr = 0;
	184	my $last_node = $graph->prior_word( $lemma_start );
	185	my $var_start;
	186	foreach my $vw ( @variant ) {
	187	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
	188	my $vwnode = $graph->add_node( $vwname );
	189	$vwnode->set_attribute( 'label', $vw );
	190	$vwnode->set_attribute( 'class', 'variant' );
	191	$graph->add_edge( $last_node, $vwnode, $edge_name );
	192	$var_start = $vwnode unless $var_start;
	193	$last_node = $vwnode;
	194	}
	195	# Now hook it up at the end.
	196	$graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
	197	$edge_name );
	198
e49731d7	199	if( $var_start ) { # if it wasn't an empty reading
	200	push( @readings, $var_start, $last_node );
	201	}
b49c4318	202	}
e49731d7	203
	204	# Now collate and collapse the identical nodes within the graph.
	205	collate_variants( $graph, @readings );
b49c4318	206	}
	207
	208	## Now in theory I have a graph. I want to make it a little easier to
	209	## read. So I collapse nodes that have only one edge in and one edge
	210	## out, and I do this by looking at the edges.
	211
52ce987f	212	# foreach my $edge ( $graph->edges() ) {
	213	# my @out_edges = $edge->from()->outgoing();
	214	# my @in_edges = $edge->to()->incoming();
b49c4318	215
52ce987f	216	# next if $edge->from() eq $graph->start();
	217	# next if $edge->to()->name() eq '#END#';
	218	# next unless scalar( @out_edges ) == 1;
	219	# next unless scalar( @in_edges ) == 1;
	220	# next unless $out_edges[0] eq $in_edges[0];
	221	# # In theory if we've got this far, we're safe, but just to
	222	# # double-check...
	223	# next unless $out_edges[0] eq $edge;
b49c4318	224
52ce987f	225	# $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
	226	# }
	227
	228	# Now walk the path for each witness, so that we can do the
	229	# position calculations.
	230	my $paths = {};
	231	foreach my $w ( keys %all_witnesses ) {
	232	my $back = undef;
	233	if( $w =~ /^(.)\s\(p\.\s*c\.\)/ ) {
	234	$back = $1;
	235	}
	236	my @wit_nodes = $graph->node_sequence( $graph->start,
	237	$graph->node( '#END#' ),
	238	$w, $back );
	239	my @wn_names = map { $_->name() } @wit_nodes;
	240	$paths->{$w} = \@wn_names;
b49c4318	241	}
52ce987f	242	$DB::single = 1;
	243	my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
	244	$graph->make_positions( \@common_nodes, $paths );
b49c4318	245	}
b49c4318	246
2ceca8c3	247	=item B<read_base>
	248
	249	my @line_beginnings = read_base( 'reference.txt', $graph );
	250
	251	Takes a text file and a (presumed empty) graph object, adds the words
	252	as simple linear nodes to the graph, and returns a list of nodes that
	253	represent the beginning of lines. This graph is now the starting point
	254	for application of apparatus entries in merge_base, e.g. from a CSV
	255	file or a Classical Text Editor file.
	256
	257	=cut
b49c4318	258
	259	sub read_base {
	260	my( $base_file, $graph ) = @_;
	261
	262	# This array gives the first node for each line. We put the
	263	# common starting point in line zero.
	264	my $last_node = $graph->start();
	265	my $lineref_array = [ $last_node ]; # There is no line zero.
	266
	267	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
	268	while(<BASE>) {
	269	# Make the nodes, and connect them up for the base, but also
	270	# save the first node of each line in an array for the purpose.
	271	chomp;
	272	my @words = split;
	273	my $started = 0;
	274	my $wordref = 0;
	275	my $lineref = scalar @$lineref_array;
	276	foreach my $w ( @words ) {
	277	my $noderef = join( ',', $lineref, ++$wordref );
	278	my $node = $graph->add_node( $noderef );
	279	$node->set_attribute( 'label', $w );
	280	$node->set_attribute( 'class', 'common' );
	281	unless( $started ) {
	282	push( @$lineref_array, $node );
	283	$started = 1;
	284	}
	285	if( $last_node ) {
e49731d7	286	my $edge = $graph->add_edge( $last_node, $node, "base text" );
e49731d7	287	$edge->set_attribute( 'class', 'basetext' );
b49c4318	288	$last_node = $node;
	289	} # TODO there should be no else here...
	290	}
	291	}
	292	close BASE;
	293	# Ending point for all texts
	294	my $endpoint = $graph->add_node( '#END#' );
	295	$graph->add_edge( $last_node, $endpoint, "base text" );
	296	push( @$lineref_array, $endpoint );
	297
	298	return( @$lineref_array );
	299	}
	300
e49731d7	301	=item B<collate_variants>
2ceca8c3	302
e49731d7	303	collate_variants( $graph, @readings )
2ceca8c3	304
e49731d7	305	Given a set of readings in the form
e49731d7	306	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
2ceca8c3	307	walks through each to identify those nodes that are identical. The
e49731d7	308	graph is a Text::Tradition::Graph object; the elements of @readings are
2ceca8c3	309	Graph::Easy::Node objects that appear on the graph.
b49c4318	310
2ceca8c3	311	TODO: Handle collapsed and non-collapsed transpositions.
	312
	313	=cut
b49c4318	314
e49731d7	315	sub collate_variants {
	316	my( $graph, @readings ) = @_;
	317	my $lemma_start = shift @readings;
	318	my $lemma_end = shift @readings;
52ce987f	319	my $detranspose = 0;
b49c4318	320
1b9423d5	321	# We need to calculate positions at this point, which is where
	322	# we are getting the implicit information from the apparatus.
	323
e49731d7	324	# Start the list of distinct nodes with those nodes in the lemma.
e49731d7	325	my @distinct_nodes;
1b9423d5	326	my $position = 0;
b49c4318	327	while( $lemma_start ne $lemma_end ) {
1b9423d5	328	push( @distinct_nodes, [ $lemma_start, 'base text', $position++ ] );
b49c4318	329	$lemma_start = $graph->next_word( $lemma_start );
b49c4318	330	}
1b9423d5	331	push( @distinct_nodes, [ $lemma_end, 'base text', $position++ ] );
b49c4318	332
e49731d7	333
	334	while( scalar @readings ) {
	335	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
	336
	337	# I want to look at the nodes in the variant and lemma, and
	338	# collapse nodes that are the same word. This is mini-collation.
	339	# Each word in the 'main' list can only be collapsed once with a
	340	# word from the current reading.
	341	my %collapsed = ();
	342
	343	# Get the label. There will only be one outgoing edge to start
	344	# with, so this is safe.
	345	my @out = $var_start->outgoing();
	346	my $var_label = $out[0]->label();
	347
	348	my @variant_nodes;
	349	while( $var_start ne $var_end ) {
	350	push( @variant_nodes, $var_start );
	351	$var_start = $graph->next_word( $var_start, $var_label );
	352	}
	353	push( @variant_nodes, $var_end );
	354
	355	# Go through the variant nodes, and if we find a lemma node that
	356	# hasn't yet been collapsed with a node, equate them. If we do
	357	# not, keep them to push onto the end of all_nodes.
	358	my @remaining_nodes;
	359	my $last_index = 0;
1b9423d5	360	my $curr_pos = 0;
e49731d7	361	foreach my $w ( @variant_nodes ) {
	362	my $word = $w->label();
	363	my $matched = 0;
	364	foreach my $idx ( $last_index .. $#distinct_nodes ) {
1b9423d5	365	my( $l, $edgelabel, $pos ) = @{$distinct_nodes[$idx]};
e49731d7	366	if( $word eq cmp_str( $l ) ) {
	367	next if exists( $collapsed{ $l->label } )
	368	&& $collapsed{ $l->label } eq $l;
	369	$matched = 1;
	370	$last_index = $idx if $detranspose;
	371	# Collapse the nodes.
	372	printf STDERR "Merging nodes %s/%s and %s/%s\n",
	373	$l->name, $l->label, $w->name, $w->label;
	374	$graph->merge_nodes( $l, $w );
	375	$collapsed{ $l->label } = $l;
	376	# Now collapse any multiple edges to and from the node.
	377	remove_duplicate_edges( $graph,
	378	$graph->prior_word( $l, $edgelabel ), $l );
	379	remove_duplicate_edges( $graph, $l,
	380	$graph->next_word( $l, $edgelabel ) );
1b9423d5	381	$curr_pos = $pos;
1b9423d5	382	last;
e49731d7	383	}
b49c4318	384	}
1b9423d5	385	push( @remaining_nodes, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
b49c4318	386	}
e49731d7	387	push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
b49c4318	388	}
1b9423d5	389
	390	# Now set the positions of all the nodes in this variation.
	391	#$DB::single = 1;
	392	print STDERR "Nodes and their positions are:\n";
	393	foreach my $n ( @distinct_nodes ) {
	394	printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
	395	}
b49c4318	396	}
b49c4318	397
2ceca8c3	398	=item B<remove_duplicate_edges>
	399
	400	remove_duplicate_edges( $graph, $from, $to );
	401
	402	Given two nodes, reduce the number of edges between those nodes to
	403	one. If neither edge represents a base text, combine their labels.
	404
	405	=cut
	406
b49c4318	407	sub remove_duplicate_edges {
	408	my( $graph, $from, $to ) = @_;
	409	my @edges = $from->edges_to( $to );
	410	if( scalar @edges > 1 ) {
	411	my @base = grep { $_->label eq 'base text' } @edges;
	412	if ( scalar @base ) {
	413	# Remove the edges that are not base.
	414	foreach my $e ( @edges ) {
	415	$graph->del_edge( $e )
	416	unless $e eq $base[0];
	417	}
	418	} else {
	419	# Combine the edges into one.
	420	my $new_edge_name = join( ', ', map { $_->label() } @edges );
	421	my $new_edge = shift @edges;
	422	$new_edge->set_attribute( 'label', $new_edge_name );
	423	foreach my $e ( @edges ) {
	424	$graph->del_edge( $e );
	425	}
	426	}
	427	}
	428	}
	429
2ceca8c3	430	=item B<cmp_str>
	431
	432	Pretend you never saw this method. Really it needs to not be hardcoded.
	433
	434	=cut
	435
b49c4318	436	sub cmp_str {
	437	my( $node ) = @_;
	438	my $word = $node->label();
	439	$word = lc( $word );
	440	$word =~ s/\W//g;
	441	$word =~ s/v/u/g;
	442	$word =~ s/j/i/g;
	443	$word =~ s/cha/ca/g;
	444	$word =~ s/quatuor/quattuor/g;
	445	$word =~ s/ioannes/iohannes/g;
	446	return $word;
	447	}
	448
2ceca8c3	449	=back
	450
	451	=head1 LICENSE
	452
	453	This package is free software and is provided "as is" without express
	454	or implied warranty. You can redistribute it and/or modify it under
	455	the same terms as Perl itself.
	456
	457	=head1 AUTHOR
	458
	459	Tara L Andrews, aurum@cpan.org
	460
	461	=cut
	462
b49c4318	463	1;