[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Exporter 'import';
use vars qw( @EXPORT_OK );
@EXPORT_OK = qw( merge_base );

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

sub merge_base {
    my( $graph, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $graph );

    foreach my $app ( @app_entries ) {
	my( $line, $num ) = split( /\./, $app->{_id} );
	# DEBUG with a short graph
	# last if $line > 2;
	# DEBUG for problematic entries
	my $scrutinize = "";
	my $first_line_node = $base_line_starts[ $line ];
	my $too_far = $base_line_starts[ $line+1 ];
	
	my $lemma = $app->{rdg_0};
	my $seq = 1; 
	# Is this the Nth occurrence of this reading in the line?
	if( $lemma =~ s/(_)?(\d)$// ) {
	    $seq = $2;
	}
	my @lemma_words = split( /\s+/, $lemma );
	
	# Now search for the lemma words within this line.
	my $lemma_start = $first_line_node;
	my $lemma_end;
	my %seen;
	while( $lemma_start ne $too_far ) {
	    # Loop detection
	    if( $seen{ $lemma_start->name() } ) {
		warn "Detected loop at " . $lemma_start->name() . 
		    ", ref $line,$num";
		last;
	    }
	    $seen{ $lemma_start->name() } = 1;
	    
	    # Try to match the lemma.
	    my $unmatch = 0;
	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
		$lemma_words[0] . "...\n"
		if "$line.$num" eq $scrutinize;
	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
		# Skip it if we need a match that is not the first.
		if( --$seq < 1 ) {
		    # Now we have to compare the rest of the words here.
		    if( scalar( @lemma_words ) > 1 ) {
			my $next_node = $graph->next_word( $lemma_start );
			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
			    printf STDERR "Now matching %s against %s\n", 
				    cmp_str($next_node), $w
				if "$line.$num" eq $scrutinize;
			    if( $w ne cmp_str($next_node) ) {
				$unmatch = 1;
				last;
			    } else {
				$lemma_end = $next_node;
				$next_node = $graph->next_word( $lemma_end );
			    }
			}
		    } else {
			$lemma_end = $lemma_start;
		    }
		} else {
		    $unmatch = 1;
		}
	    }
	    last unless ( $unmatch || !defined( $lemma_end ) );
	    $lemma_end = undef;
	    $lemma_start = $graph->next_word( $lemma_start );
	}
	
	unless( $lemma_end ) {
	    warn "No match found for @lemma_words at $line.$num";
	    next;
	} else {
	    # These are no longer common nodes; unmark them as such.
	    my @lemma_nodes = $graph->node_sequence( $lemma_start, 
						     $lemma_end );
	    map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
	}
	
	# Now we have our lemma nodes; we add the variant nodes to the graph.
	
	# Keep track of the start and end point of each reading for later
	# node collapse.
	my @readings = ( $lemma_start, $lemma_end );

	# For each reading that is not rdg_0, we make a chain of nodes
	# and connect them to the anchor.  Edges are named after the mss
	# that are relevant.
	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	    next if $k eq 'rdg_0'; # that's the lemma.
	    my @variant = split( /\s+/, $app->{$k} );
	    @variant = () if $app->{$k} eq '/'; # This is an omission.
	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
	    
	    unless( @mss ) {
		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
		next;
	    }
	    
	    # Determine the label name for the edges here.
	    my $edge_name = join(', ', @mss );
	    
	    # Make the variant into a set of nodes.
	    my $ctr = 0;
	    my $last_node = $graph->prior_word( $lemma_start );
	    my $var_start;
	    foreach my $vw ( @variant ) {
		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
		my $vwnode = $graph->add_node( $vwname );
		$vwnode->set_attribute( 'label', $vw );
		$vwnode->set_attribute( 'class', 'variant' );
		$graph->add_edge( $last_node, $vwnode, $edge_name );
		$var_start = $vwnode unless $var_start;
		$last_node = $vwnode;
	    }
	    # Now hook it up at the end.
	    $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
					$edge_name );
	    
	    if( $var_start ) { # if it wasn't an empty reading
		push( @readings, $var_start, $last_node );
	    }
	}

	# Now collate and collapse the identical nodes within the graph.
	collate_variants( $graph, @readings );
    }

    ## Now in theory I have a graph.  I want to make it a little easier to
    ## read.  So I collapse nodes that have only one edge in and one edge
    ## out, and I do this by looking at the edges.
    
    foreach my $edge ( $graph->edges() ) {
	my @out_edges = $edge->from()->outgoing();
	my @in_edges = $edge->to()->incoming();
	
	next unless scalar( @out_edges ) == 1;
	next unless scalar( @in_edges ) == 1;
	next unless $out_edges[0] eq $in_edges[0];
	# In theory if we've got this far, we're safe, but just to
	# double-check...
	next unless $out_edges[0] eq $edge;
	
	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
    }
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $graph );

Takes a text file and a (presumed empty) graph object, adds the words
as simple linear nodes to the graph, and returns a list of nodes that
represent the beginning of lines. This graph is now the starting point
for application of apparatus entries in merge_base, e.g. from a CSV
file or a Classical Text Editor file.

=cut

sub read_base {
    my( $base_file, $graph ) = @_;
    
    # This array gives the first node for each line.  We put the
    # common starting point in line zero.
    my $last_node = $graph->start();
    my $lineref_array = [ $last_node ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    while(<BASE>) {
	# Make the nodes, and connect them up for the base, but also
	# save the first node of each line in an array for the purpose.
	chomp;
	my @words = split;
	my $started = 0;
	my $wordref = 0;
	my $lineref = scalar @$lineref_array;
	foreach my $w ( @words ) {
	    my $noderef = join( ',', $lineref, ++$wordref );
	    my $node = $graph->add_node( $noderef );
	    $node->set_attribute( 'label', $w );
	    $node->set_attribute( 'class', 'common' );
	    unless( $started ) {
		push( @$lineref_array, $node );
		$started = 1;
	    }
	    if( $last_node ) {
		my $edge = $graph->add_edge( $last_node, $node, "base text" );
		$edge->set_attribute( 'class', 'basetext' );
		$last_node = $node;
	    } # TODO there should be no else here...
	}
    }
    close BASE;
    # Ending point for all texts
    my $endpoint = $graph->add_node( '#END#' );
    $graph->add_edge( $last_node, $endpoint, "base text" );
    push( @$lineref_array, $endpoint );

    return( @$lineref_array );
}

=item B<collate_variants>

collate_variants( $graph, @readings )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those nodes that are identical.  The
graph is a Text::Tradition::Graph object; the elements of @readings are
Graph::Easy::Node objects that appear on the graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

sub collate_variants {
    my( $graph, @readings ) = @_;
    my $lemma_start = shift @readings;
    my $lemma_end = shift @readings;
    my $detranspose = 1;

    # Start the list of distinct nodes with those nodes in the lemma.
    my @distinct_nodes;
    while( $lemma_start ne $lemma_end ) {
	push( @distinct_nodes, [ $lemma_start, 'base text' ] );
	$lemma_start = $graph->next_word( $lemma_start );
    } 
    push( @distinct_nodes, [ $lemma_end, 'base text' ] );
    

    while( scalar @readings ) {
	my( $var_start, $var_end ) = splice( @readings, 0, 2 );

	# I want to look at the nodes in the variant and lemma, and
	# collapse nodes that are the same word.  This is mini-collation.
	# Each word in the 'main' list can only be collapsed once with a
	# word from the current reading.
	my %collapsed = ();

	# Get the label. There will only be one outgoing edge to start
	# with, so this is safe.
	my @out = $var_start->outgoing();
	my $var_label = $out[0]->label();

	my @variant_nodes;
	while( $var_start ne $var_end ) {
	    push( @variant_nodes, $var_start );
	    $var_start = $graph->next_word( $var_start, $var_label );
	}
	push( @variant_nodes, $var_end );

	# Go through the variant nodes, and if we find a lemma node that
	# hasn't yet been collapsed with a node, equate them.  If we do
	# not, keep them to push onto the end of all_nodes.
	my @remaining_nodes;
	my $last_index = 0;
	foreach my $w ( @variant_nodes ) {
	    my $word = $w->label();
	    my $matched = 0;
	    foreach my $idx ( $last_index .. $#distinct_nodes ) {
		my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
		if( $word eq cmp_str( $l ) ) {
		    next if exists( $collapsed{ $l->label } )
			&& $collapsed{ $l->label } eq $l;
		    $matched = 1;
		    $last_index = $idx if $detranspose;
		    # Collapse the nodes.
		    printf STDERR "Merging nodes %s/%s and %s/%s\n", 
		        $l->name, $l->label, $w->name, $w->label;
		    $graph->merge_nodes( $l, $w );
		    $collapsed{ $l->label } = $l;
		    # Now collapse any multiple edges to and from the node.
 		    remove_duplicate_edges( $graph, 
 				    $graph->prior_word( $l, $edgelabel ), $l );
 		    remove_duplicate_edges( $graph, $l, 
 				    $graph->next_word( $l, $edgelabel ) );
		}
	    }
	    push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
	}
	push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
    }
}

=item B<remove_duplicate_edges>

remove_duplicate_edges( $graph, $from, $to );

Given two nodes, reduce the number of edges between those nodes to
one.  If neither edge represents a base text, combine their labels.

=cut

sub remove_duplicate_edges {
    my( $graph, $from, $to ) = @_;
    my @edges = $from->edges_to( $to );
    if( scalar @edges > 1 ) {
	my @base = grep { $_->label eq 'base text' } @edges;
	if ( scalar @base ) {
	    # Remove the edges that are not base.
	    foreach my $e ( @edges ) {
		$graph->del_edge( $e )
		    unless $e eq $base[0];
	    }
	} else {
	    # Combine the edges into one.
	    my $new_edge_name = join( ', ', map { $_->label() } @edges );
	    my $new_edge = shift @edges;
	    $new_edge->set_attribute( 'label', $new_edge_name );
	    foreach my $e ( @edges ) {
		$graph->del_edge( $e );
	    }
	}
    }
}

=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

sub cmp_str {
    my( $node ) = @_;
    my $word = $node->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
	5	use Exporter 'import';
	6	use vars qw( @EXPORT_OK );
	7	@EXPORT_OK = qw( merge_base );
	8
2ceca8c3	9	=head1 NAME
	10
	11	Text::Tradition::Parser::BaseText
	12
	13	=head1 SYNOPSIS
	14
	15	use Text::Tradition::Parser::BaseText qw( merge_base );
	16	merge_base( $graph, 'reference.txt', @apparatus_entries )
	17
	18	=head1 DESCRIPTION
	19
	20	For an overview of the package, see the documentation for the
	21	Text::Tradition::Graph module.
	22
	23	This module is meant for use with certain of the other Parser classes
	24	- whenever a list of variants is given with reference to a base text,
	25	these must be joined into a single collation. The parser should
	26	therefore make a list of variants and their locations, and BaseText
	27	will join those listed variants onto the reference text.
	28
	29	=head1 SUBROUTINES
	30
	31	=over
	32
	33	=item B<merge_base>
	34
	35	merge_base( $graph, 'reference.txt', @apparatus_entries )
	36
	37	Takes three arguments: a newly-initialized Text::Tradition::Graph
	38	object, a text file containing the reference text, and a list of
	39	variants (apparatus entries). Adds the base text to the graph, and
	40	joins the variants to that.
	41
	42	The list of variants is an array of hash references; each hash takes
	43	the form
	44	{ '_id' => line reference,
	45	'rdg_0' => lemma reading,
	46	'rdg_1' => first variant,
	47	... # and so on until all distinct readings are listed
	48	'WitnessA' => 'rdg_0',
	49	'WitnessB' => 'rdg_1',
	50	... # and so on until all witnesses are listed with their readings
	51	}
	52
	53	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	54	begin with an underscore is assumed to be a witness name. Any 'meta'
	55	information to be passed must be passed in a key with a leading
	56	underscore in its name.
	57
	58	=cut
	59
b49c4318	60	sub merge_base {
	61	my( $graph, $base_file, @app_entries ) = @_;
	62	my @base_line_starts = read_base( $base_file, $graph );
	63
	64	foreach my $app ( @app_entries ) {
	65	my( $line, $num ) = split( /\./, $app->{_id} );
	66	# DEBUG with a short graph
	67	# last if $line > 2;
2ceca8c3	68	# DEBUG for problematic entries
e49731d7	69	my $scrutinize = "";
b49c4318	70	my $first_line_node = $base_line_starts[ $line ];
	71	my $too_far = $base_line_starts[ $line+1 ];
	72
	73	my $lemma = $app->{rdg_0};
	74	my $seq = 1;
	75	# Is this the Nth occurrence of this reading in the line?
	76	if( $lemma =~ s/(_)?(\d)$// ) {
	77	$seq = $2;
	78	}
	79	my @lemma_words = split( /\s+/, $lemma );
	80
	81	# Now search for the lemma words within this line.
	82	my $lemma_start = $first_line_node;
	83	my $lemma_end;
	84	my %seen;
	85	while( $lemma_start ne $too_far ) {
	86	# Loop detection
	87	if( $seen{ $lemma_start->name() } ) {
	88	warn "Detected loop at " . $lemma_start->name() .
	89	", ref $line,$num";
	90	last;
	91	}
	92	$seen{ $lemma_start->name() } = 1;
	93
	94	# Try to match the lemma.
	95	my $unmatch = 0;
	96	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	97	$lemma_words[0] . "...\n"
	98	if "$line.$num" eq $scrutinize;
	99	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	100	# Skip it if we need a match that is not the first.
	101	if( --$seq < 1 ) {
	102	# Now we have to compare the rest of the words here.
	103	if( scalar( @lemma_words ) > 1 ) {
	104	my $next_node = $graph->next_word( $lemma_start );
	105	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	106	printf STDERR "Now matching %s against %s\n",
	107	cmp_str($next_node), $w
	108	if "$line.$num" eq $scrutinize;
	109	if( $w ne cmp_str($next_node) ) {
	110	$unmatch = 1;
	111	last;
	112	} else {
	113	$lemma_end = $next_node;
	114	$next_node = $graph->next_word( $lemma_end );
	115	}
	116	}
	117	} else {
	118	$lemma_end = $lemma_start;
	119	}
	120	} else {
	121	$unmatch = 1;
	122	}
	123	}
	124	last unless ( $unmatch \|\| !defined( $lemma_end ) );
	125	$lemma_end = undef;
	126	$lemma_start = $graph->next_word( $lemma_start );
	127	}
	128
	129	unless( $lemma_end ) {
	130	warn "No match found for @lemma_words at $line.$num";
	131	next;
	132	} else {
	133	# These are no longer common nodes; unmark them as such.
134	my @lemma_nodes = $graph->node_sequence( $lemma_start,
135	$lemma_end );
136	map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
137	}
138
139	# Now we have our lemma nodes; we add the variant nodes to the graph.
140
e49731d7	141	# Keep track of the start and end point of each reading for later
	142	# node collapse.
	143	my @readings = ( $lemma_start, $lemma_end );
	144
b49c4318	145	# For each reading that is not rdg_0, we make a chain of nodes
	146	# and connect them to the anchor. Edges are named after the mss
	147	# that are relevant.
	148	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	149	next if $k eq 'rdg_0'; # that's the lemma.
	150	my @variant = split( /\s+/, $app->{$k} );
	151	@variant = () if $app->{$k} eq '/'; # This is an omission.
	152	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	153
	154	unless( @mss ) {
	155	print STDERR "Skipping '@variant' at $line.$num: no mss\n";
	156	next;
	157	}
	158
	159	# Determine the label name for the edges here.
	160	my $edge_name = join(', ', @mss );
	161
	162	# Make the variant into a set of nodes.
	163	my $ctr = 0;
	164	my $last_node = $graph->prior_word( $lemma_start );
	165	my $var_start;
	166	foreach my $vw ( @variant ) {
	167	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
	168	my $vwnode = $graph->add_node( $vwname );
	169	$vwnode->set_attribute( 'label', $vw );
	170	$vwnode->set_attribute( 'class', 'variant' );
	171	$graph->add_edge( $last_node, $vwnode, $edge_name );
	172	$var_start = $vwnode unless $var_start;
	173	$last_node = $vwnode;
	174	}
	175	# Now hook it up at the end.
	176	$graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
	177	$edge_name );
	178
e49731d7	179	if( $var_start ) { # if it wasn't an empty reading
	180	push( @readings, $var_start, $last_node );
	181	}
b49c4318	182	}
e49731d7	183
	184	# Now collate and collapse the identical nodes within the graph.
	185	collate_variants( $graph, @readings );
b49c4318	186	}
	187
	188	## Now in theory I have a graph. I want to make it a little easier to
	189	## read. So I collapse nodes that have only one edge in and one edge
	190	## out, and I do this by looking at the edges.
	191
	192	foreach my $edge ( $graph->edges() ) {
	193	my @out_edges = $edge->from()->outgoing();
	194	my @in_edges = $edge->to()->incoming();
	195
	196	next unless scalar( @out_edges ) == 1;
	197	next unless scalar( @in_edges ) == 1;
	198	next unless $out_edges[0] eq $in_edges[0];
	199	# In theory if we've got this far, we're safe, but just to
	200	# double-check...
	201	next unless $out_edges[0] eq $edge;
	202
	203	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
	204	}
	205	}
	206
2ceca8c3	207	=item B<read_base>
	208
	209	my @line_beginnings = read_base( 'reference.txt', $graph );
	210
	211	Takes a text file and a (presumed empty) graph object, adds the words
	212	as simple linear nodes to the graph, and returns a list of nodes that
	213	represent the beginning of lines. This graph is now the starting point
	214	for application of apparatus entries in merge_base, e.g. from a CSV
	215	file or a Classical Text Editor file.
	216
	217	=cut
b49c4318	218
	219	sub read_base {
	220	my( $base_file, $graph ) = @_;
	221
	222	# This array gives the first node for each line. We put the
	223	# common starting point in line zero.
	224	my $last_node = $graph->start();
	225	my $lineref_array = [ $last_node ]; # There is no line zero.
	226
	227	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
	228	while(<BASE>) {
	229	# Make the nodes, and connect them up for the base, but also
	230	# save the first node of each line in an array for the purpose.
	231	chomp;
	232	my @words = split;
	233	my $started = 0;
	234	my $wordref = 0;
	235	my $lineref = scalar @$lineref_array;
	236	foreach my $w ( @words ) {
	237	my $noderef = join( ',', $lineref, ++$wordref );
	238	my $node = $graph->add_node( $noderef );
	239	$node->set_attribute( 'label', $w );
	240	$node->set_attribute( 'class', 'common' );
	241	unless( $started ) {
	242	push( @$lineref_array, $node );
	243	$started = 1;
	244	}
	245	if( $last_node ) {
e49731d7	246	my $edge = $graph->add_edge( $last_node, $node, "base text" );
e49731d7	247	$edge->set_attribute( 'class', 'basetext' );
b49c4318	248	$last_node = $node;
	249	} # TODO there should be no else here...
	250	}
	251	}
	252	close BASE;
	253	# Ending point for all texts
	254	my $endpoint = $graph->add_node( '#END#' );
	255	$graph->add_edge( $last_node, $endpoint, "base text" );
	256	push( @$lineref_array, $endpoint );
	257
	258	return( @$lineref_array );
	259	}
	260
e49731d7	261	=item B<collate_variants>
2ceca8c3	262
e49731d7	263	collate_variants( $graph, @readings )
2ceca8c3	264
e49731d7	265	Given a set of readings in the form
e49731d7	266	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
2ceca8c3	267	walks through each to identify those nodes that are identical. The
e49731d7	268	graph is a Text::Tradition::Graph object; the elements of @readings are
2ceca8c3	269	Graph::Easy::Node objects that appear on the graph.
b49c4318	270
2ceca8c3	271	TODO: Handle collapsed and non-collapsed transpositions.
	272
	273	=cut
b49c4318	274
e49731d7	275	sub collate_variants {
	276	my( $graph, @readings ) = @_;
	277	my $lemma_start = shift @readings;
	278	my $lemma_end = shift @readings;
	279	my $detranspose = 1;
b49c4318	280
e49731d7	281	# Start the list of distinct nodes with those nodes in the lemma.
e49731d7	282	my @distinct_nodes;
b49c4318	283	while( $lemma_start ne $lemma_end ) {
e49731d7	284	push( @distinct_nodes, [ $lemma_start, 'base text' ] );
b49c4318	285	$lemma_start = $graph->next_word( $lemma_start );
b49c4318	286	}
e49731d7	287	push( @distinct_nodes, [ $lemma_end, 'base text' ] );
b49c4318	288
e49731d7	289
	290	while( scalar @readings ) {
	291	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
	292
	293	# I want to look at the nodes in the variant and lemma, and
	294	# collapse nodes that are the same word. This is mini-collation.
	295	# Each word in the 'main' list can only be collapsed once with a
	296	# word from the current reading.
	297	my %collapsed = ();
	298
	299	# Get the label. There will only be one outgoing edge to start
	300	# with, so this is safe.
	301	my @out = $var_start->outgoing();
	302	my $var_label = $out[0]->label();
	303
	304	my @variant_nodes;
	305	while( $var_start ne $var_end ) {
	306	push( @variant_nodes, $var_start );
	307	$var_start = $graph->next_word( $var_start, $var_label );
	308	}
	309	push( @variant_nodes, $var_end );
	310
	311	# Go through the variant nodes, and if we find a lemma node that
	312	# hasn't yet been collapsed with a node, equate them. If we do
	313	# not, keep them to push onto the end of all_nodes.
	314	my @remaining_nodes;
	315	my $last_index = 0;
	316	foreach my $w ( @variant_nodes ) {
	317	my $word = $w->label();
	318	my $matched = 0;
	319	foreach my $idx ( $last_index .. $#distinct_nodes ) {
	320	my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
	321	if( $word eq cmp_str( $l ) ) {
	322	next if exists( $collapsed{ $l->label } )
	323	&& $collapsed{ $l->label } eq $l;
	324	$matched = 1;
	325	$last_index = $idx if $detranspose;
	326	# Collapse the nodes.
	327	printf STDERR "Merging nodes %s/%s and %s/%s\n",
	328	$l->name, $l->label, $w->name, $w->label;
	329	$graph->merge_nodes( $l, $w );
	330	$collapsed{ $l->label } = $l;
	331	# Now collapse any multiple edges to and from the node.
	332	remove_duplicate_edges( $graph,
	333	$graph->prior_word( $l, $edgelabel ), $l );
	334	remove_duplicate_edges( $graph, $l,
	335	$graph->next_word( $l, $edgelabel ) );
	336	}
b49c4318	337	}
e49731d7	338	push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
b49c4318	339	}
e49731d7	340	push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
b49c4318	341	}
	342	}
	343
2ceca8c3	344	=item B<remove_duplicate_edges>
	345
	346	remove_duplicate_edges( $graph, $from, $to );
	347
	348	Given two nodes, reduce the number of edges between those nodes to
	349	one. If neither edge represents a base text, combine their labels.
	350
	351	=cut
	352
b49c4318	353	sub remove_duplicate_edges {
	354	my( $graph, $from, $to ) = @_;
	355	my @edges = $from->edges_to( $to );
	356	if( scalar @edges > 1 ) {
	357	my @base = grep { $_->label eq 'base text' } @edges;
	358	if ( scalar @base ) {
	359	# Remove the edges that are not base.
	360	foreach my $e ( @edges ) {
	361	$graph->del_edge( $e )
	362	unless $e eq $base[0];
	363	}
	364	} else {
	365	# Combine the edges into one.
	366	my $new_edge_name = join( ', ', map { $_->label() } @edges );
	367	my $new_edge = shift @edges;
	368	$new_edge->set_attribute( 'label', $new_edge_name );
	369	foreach my $e ( @edges ) {
	370	$graph->del_edge( $e );
	371	}
	372	}
	373	}
	374	}
	375
2ceca8c3	376	=item B<cmp_str>
	377
	378	Pretend you never saw this method. Really it needs to not be hardcoded.
	379
	380	=cut
	381
b49c4318	382	sub cmp_str {
	383	my( $node ) = @_;
	384	my $word = $node->label();
	385	$word = lc( $word );
	386	$word =~ s/\W//g;
	387	$word =~ s/v/u/g;
	388	$word =~ s/j/i/g;
	389	$word =~ s/cha/ca/g;
	390	$word =~ s/quatuor/quattuor/g;
	391	$word =~ s/ioannes/iohannes/g;
	392	return $word;
	393	}
	394
2ceca8c3	395	=back
	396
	397	=head1 LICENSE
	398
	399	This package is free software and is provided "as is" without express
	400	or implied warranty. You can redistribute it and/or modify it under
	401	the same terms as Perl itself.
	402
	403	=head1 AUTHOR
	404
	405	Tara L Andrews, aurum@cpan.org
	406
	407	=cut
	408
b49c4318	409	1;