[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Module::Load;

=head1 NAME

Text::Tradition::Parser::BaseText

=head1 SYNOPSIS

use Text::Tradition::Parser::BaseText qw( merge_base );
merge_base( $graph, 'reference.txt', @apparatus_entries )

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition::Graph module.

This module is meant for use with certain of the other Parser classes
- whenever a list of variants is given with reference to a base text,
these must be joined into a single collation.  The parser should
therefore make a list of variants and their locations, and BaseText
will join those listed variants onto the reference text.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $graph, %opts );

Takes an initialized graph and a set of options, which must include:
- 'base' - the base text referenced by the variants
- 'format' - the format of the variant list
- 'data' - the variants, in the given format.

=cut

# Entry point for parsing a variant list against a base text.
#
# Arguments: the graph to populate, followed by key/value options:
#   format - suffix of the Text::Tradition::Parser::* module to load;
#            that module must provide a read() function
#   data   - passed unchanged to the format module's read()
#   base   - passed to merge_base as the base text file
# Dies with a descriptive message if an option is missing or the format
# module has no read().  Returns whatever merge_base returns.
sub parse {
    my( $graph, %opts ) = @_;

    # Fail early and legibly instead of with an undef-concatenation
    # warning or an undefined-subroutine-reference error further down.
    foreach my $required ( qw( format base data ) ) {
        die "Required option '$required' missing in call to parse"
            unless defined $opts{$required};
    }

    my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
    load( $format_mod );

    my $reader = $format_mod->can('read');
    die "Parser module $format_mod has no read() function"
        unless $reader;

    my @apparatus_entries = $reader->( $opts{'data'} );
    return merge_base( $graph, $opts{'base'}, @apparatus_entries );
}

=item B<merge_base>

merge_base( $graph, 'reference.txt', @apparatus_entries )

Takes three arguments: a newly-initialized Text::Tradition::Graph
object, a text file containing the reference text, and a list of
variants (apparatus entries).  Adds the base text to the graph, and
joins the variants to that.

The list of variants is an array of hash references; each hash takes
the form
 { '_id' => line reference,
   'rdg_0' => lemma reading,
   'rdg_1' => first variant,
   ...  # and so on until all distinct readings are listed
   'WitnessA' => 'rdg_0',
   'WitnessB' => 'rdg_1',
   ...  # and so on until all witnesses are listed with their readings
 }

Any hash key that is not of the form /^rdg_\d+$/ and that does not
begin with an underscore is assumed to be a witness name.  Any 'meta'
information to be passed must be passed in a key with a leading
underscore in its name.

=cut

# Builds the collation graph: reads the base text into $graph via
# read_base, then for every apparatus entry locates the lemma in the
# referenced base line, attaches each variant reading as a chain of new
# nodes around it, and collapses identical words with collate_variants.
# Finally walks the graph once per witness and hands the paths, together
# with the nodes the graph still reports as common, to make_positions.
#
# Arguments: the graph object, the base text file name, and the list of
# apparatus entries (hash refs keyed by '_id', 'rdg_N' and witness names).
# Returns the result of $graph->make_positions.
sub merge_base {
    my( $graph, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $graph );

    my %all_witnesses;
    foreach my $app ( @app_entries ) {
        my( $line, $num ) = split( /\./, $app->{_id} );
        # DEBUG with a short graph
        # last if $line > 2;
        # DEBUG for problematic entries: set this to a "line.num"
        # reference to trace the lemma matching of that entry on STDERR.
        my $scrutinize = "";
        my $first_line_node = $base_line_starts[ $line ];
        my $too_far = $base_line_starts[ $line+1 ];

        # An entry whose line reference falls outside the base text
        # cannot be placed; skip it instead of calling methods on undef.
        unless( defined $first_line_node && defined $too_far ) {
            warn "No base text line $line for apparatus entry $line.$num";
            next;
        }

        my $lemma = $app->{rdg_0};
        my $seq = 1;
        # Is this the Nth occurrence of this reading in the line?
        if( $lemma =~ s/(_)?(\d)$// ) {
            $seq = $2;
        }
        my @lemma_words = split( /\s+/, $lemma );

        # Now search for the lemma words within this line.
        my $lemma_start = $first_line_node;
        my $lemma_end;
        my %seen;
        while( $lemma_start ne $too_far ) {
            # Loop detection
            if( $seen{ $lemma_start->name() } ) {
                warn "Detected loop at " . $lemma_start->name() .
                    ", ref $line,$num";
                last;
            }
            $seen{ $lemma_start->name() } = 1;

            # Try to match the lemma.
            my $unmatch = 0;
            print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
                $lemma_words[0] . "...\n"
                if "$line.$num" eq $scrutinize;
            if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
                # Skip it if we need a match that is not the first.
                if( --$seq < 1 ) {
                    # Now we have to compare the rest of the words here.
                    if( scalar( @lemma_words ) > 1 ) {
                        my $next_node = $graph->next_word( $lemma_start );
                        foreach my $w ( @lemma_words[1..$#lemma_words] ) {
                            printf STDERR "Now matching %s against %s\n",
                                    cmp_str($next_node), $w
                                if "$line.$num" eq $scrutinize;
                            if( $w ne cmp_str($next_node) ) {
                                $unmatch = 1;
                                last;
                            }
                            $lemma_end = $next_node;
                            $next_node = $graph->next_word( $lemma_end );
                        }
                    } else {
                        $lemma_end = $lemma_start;
                    }
                } else {
                    $unmatch = 1;
                }
            }
            # Stop as soon as a complete match has been recorded;
            # otherwise forget any partial match and try the next word.
            last unless ( $unmatch || !defined( $lemma_end ) );
            $lemma_end = undef;
            $lemma_start = $graph->next_word( $lemma_start );
        }

        unless( $lemma_end ) {
            warn "No match found for @lemma_words at $line.$num";
            next;
        }

        # These are no longer common nodes; unmark them as such.
        foreach my $lemma_node ( $graph->node_sequence( $lemma_start,
                                                        $lemma_end ) ) {
            $lemma_node->set_attribute( 'class', 'lemma' );
        }

        # Now we have our lemma nodes; we add the variant nodes to the graph.

        # Keep track of the start and end point of each reading for later
        # node collapse.
        my @readings = ( $lemma_start, $lemma_end );

        # For each reading that is not rdg_0, we make a chain of nodes
        # and connect them to the anchor.  Edges are named after the mss
        # that are relevant.  Keys are sorted so that edge labels and the
        # order of collation do not depend on hash ordering.
        foreach my $k ( sort grep { /^rdg/ } keys( %$app ) ) {
            next if $k eq 'rdg_0'; # that's the lemma.
            my @variant = split( /\s+/, $app->{$k} );
            @variant = () if $app->{$k} eq '/'; # This is an omission.
            my @mss = sort grep { defined $app->{$_} && $app->{$_} eq $k }
                keys( %$app );

            unless( @mss ) {
                print STDERR "Skipping '@variant' at $line.$num: no mss\n";
                next;
            }

            # Determine the label name for the edges here.
            my $edge_name = join(', ', @mss );
            @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );

            # Make the variant into a set of nodes.
            my $ctr = 0;
            my $last_node = $graph->prior_word( $lemma_start );
            my $var_start;
            foreach my $vw ( @variant ) {
                my $vwname = "$k/$line.$num.$ctr";
                $ctr++;
                my $vwnode = $graph->add_node( $vwname );
                $vwnode->set_attribute( 'label', $vw );
                $vwnode->set_attribute( 'class', 'variant' );
                $graph->add_edge( $last_node, $vwnode, $edge_name );
                $var_start = $vwnode unless $var_start;
                $last_node = $vwnode;
            }
            # Now hook it up at the end.  For an omission this is a
            # single edge that bypasses the lemma entirely.
            $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
                              $edge_name );

            if( $var_start ) { # if it wasn't an empty reading
                push( @readings, $var_start, $last_node );
            }
        }

        # Now collate and collapse the identical nodes within the graph.
        collate_variants( $graph, @readings );
    }

    ## Now in theory I have a graph.  I want to make it a little easier to
    ## read.  So I collapse nodes that have only one edge in and one edge
    ## out, and I do this by looking at the edges.

#     foreach my $edge ( $graph->edges() ) {
#         my @out_edges = $edge->from()->outgoing();
#         my @in_edges = $edge->to()->incoming();

#         next if $edge->from() eq $graph->start();
#         next if $edge->to()->name() eq '#END#';
#         next unless scalar( @out_edges ) == 1;
#         next unless scalar( @in_edges ) == 1;
#         next unless $out_edges[0] eq $in_edges[0];
#         # In theory if we've got this far, we're safe, but just to
#         # double-check...
#         next unless $out_edges[0] eq $edge;

#         $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
#     }

    # Now walk the path for each witness, so that we can do the
    # position calculations.
    my $paths = {};
    foreach my $w ( keys %all_witnesses ) {
        # A witness named "X (p. c.)" is passed along with the bare name
        # "X" as fourth argument to node_sequence (presumably the path to
        # fall back on -- confirm against Text::Tradition::Graph).  The
        # capture is non-greedy so that the whitespace before the
        # parenthesis is not included in the name.
        my $back = undef;
        if( $w =~ /^(.*?)\s*\(p\.\s*c\.\)/ ) {
            $back = $1;
        }
        my @wit_nodes = $graph->node_sequence( $graph->start,
                                               $graph->node( '#END#' ),
                                               $w, $back );
        my @wn_names = map { $_->name() } @wit_nodes;
        $paths->{$w} = \@wn_names;
    }
    my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
    return $graph->make_positions( \@common_nodes, $paths );
}

=item B<read_base>

my @line_beginnings = read_base( 'reference.txt', $graph );

Takes a text file and a (presumed empty) graph object, adds the words
as simple linear nodes to the graph, and returns a list of nodes that
represent the beginning of lines. This graph is now the starting point
for application of apparatus entries in merge_base, e.g. from a CSV
file or a Classical Text Editor file.

=cut

# Reads the base text file word by word into $graph as one linear chain
# of nodes joined by "base text" edges, beginning at $graph->start() and
# finishing at a newly added '#END#' node.
#
# Returns a list in which index 0 is the graph's start node, index N is
# the first node of the Nth line that contains at least one word, and
# the final element is the '#END#' node.  Lines without words add no
# entry.  Dies if the file cannot be opened.
sub read_base {
    my( $base_file, $graph ) = @_;

    # Slot zero holds the common starting point, so that real lines are
    # indexed from 1.
    my $last_node = $graph->start();
    my $lineref_array = [ $last_node ];

    # Three-argument open on a lexical handle: the file name is taken
    # literally, and no global handle is shared with other code.
    open( my $base_fh, '<', $base_file )
        or die "Could not open file $base_file: $!";
    while( my $text = <$base_fh> ) {
        # Make the nodes, and connect them up for the base, but also
        # save the first node of each line in an array for the purpose.
        chomp $text;
        my @words = split( ' ', $text );
        my $started = 0;
        my $wordref = 0;
        my $lineref = scalar @$lineref_array;
        foreach my $w ( @words ) {
            # Nodes are named "line,word", both counted from 1.
            my $noderef = join( ',', $lineref, ++$wordref );
            my $node = $graph->add_node( $noderef );
            $node->set_attribute( 'label', $w );
            $node->set_attribute( 'class', 'common' );
            unless( $started ) {
                push( @$lineref_array, $node );
                $started = 1;
            }
            if( $last_node ) {
                my $edge = $graph->add_edge( $last_node, $node, "base text" );
                $edge->set_attribute( 'class', 'basetext' );
                $last_node = $node;
            } # TODO there should be no else here...
        }
    }
    close $base_fh;

    # Ending point for all texts
    my $endpoint = $graph->add_node( '#END#' );
    $graph->add_edge( $last_node, $endpoint, "base text" );
    push( @$lineref_array, $endpoint );

    return( @$lineref_array );
}

=item B<collate_variants>

collate_variants( $graph, @readings )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those nodes that are identical.  The
graph is a Text::Tradition::Graph object; the elements of @readings are
Graph::Easy::Node objects that appear on the graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

# Mini-collation of one apparatus entry.  @readings holds start/end node
# pairs, the lemma first and then each variant reading.  Every variant
# node whose label equals the normalised form (cmp_str) of a node already
# in the list of distinct nodes is merged into that node; variant nodes
# that find no partner are added to the list, so that later readings can
# collapse onto them as well.
sub collate_variants {
    my( $graph, @readings ) = @_;
    my $lemma_start = shift @readings;
    my $lemma_end = shift @readings;
    # Hard-wired switch: when true, each variant word may only match at
    # or after the position where the previous word of the same reading
    # matched.
    my $detranspose = 0;

    # Start the list of distinct nodes with those nodes in the lemma.
    # Each entry pairs a node with the edge label under which its
    # neighbours are looked up.
    my @distinct_nodes;
    while( $lemma_start ne $lemma_end ) {
        push( @distinct_nodes, [ $lemma_start, 'base text' ] );
        $lemma_start = $graph->next_word( $lemma_start );
    }
    push( @distinct_nodes, [ $lemma_end, 'base text' ] );

    while( scalar @readings ) {
        my( $var_start, $var_end ) = splice( @readings, 0, 2 );

        # I want to look at the nodes in the variant and lemma, and
        # collapse nodes that are the same word.  This is mini-collation.
        # Each word in the 'main' list can only be collapsed once with a
        # word from the current reading; the nodes already used are
        # recorded by their (unique) node name, so that repeated words
        # with identical labels are told apart.
        my %collapsed = ();

        # Get the label. There will only be one outgoing edge to start
        # with, so this is safe.
        my @out = $var_start->outgoing();
        my $var_label = $out[0]->label();

        my @variant_nodes;
        while( $var_start ne $var_end ) {
            push( @variant_nodes, $var_start );
            $var_start = $graph->next_word( $var_start, $var_label );
        }
        push( @variant_nodes, $var_end );

        # Go through the variant nodes, and if we find a lemma node that
        # hasn't yet been collapsed with a node, equate them.  If we do
        # not, keep them to push onto the end of the distinct nodes.
        my @remaining_nodes;
        my $last_index = 0;
        foreach my $w ( @variant_nodes ) {
            my $word = $w->label();
            my $matched = 0;
            foreach my $idx ( $last_index .. $#distinct_nodes ) {
                my( $target, $edgelabel ) = @{$distinct_nodes[$idx]};
                # NOTE(review): the variant label is compared raw against
                # the normalised target label -- confirm that apparatus
                # words arrive already normalised.
                next unless $word eq cmp_str( $target );
                my $target_name = $target->name;
                next if $collapsed{ $target_name };

                $matched = 1;
                $last_index = $idx if $detranspose;
                # Collapse the nodes.
                printf STDERR "Merging nodes %s/%s and %s/%s\n",
                    $target_name, $target->label, $w->name, $w->label;
                $graph->merge_nodes( $target, $w );
                $collapsed{ $target_name } = 1;
                # Now collapse any multiple edges to and from the node.
                remove_duplicate_edges( $graph,
                                $graph->prior_word( $target, $edgelabel ),
                                $target );
                remove_duplicate_edges( $graph, $target,
                                $graph->next_word( $target, $edgelabel ) );
                last;
            }
            push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
        }
        push( @distinct_nodes, @remaining_nodes );
    }
    return;
}

=item B<remove_duplicate_edges>

remove_duplicate_edges( $graph, $from, $to );

Given two nodes, reduce the number of edges between those nodes to
one.  If neither edge represents a base text, combine their labels.

=cut

# Leaves a single edge between $from and $to.  Does nothing unless
# $from->edges_to( $to ) yields more than one edge.  If any of them is
# labelled 'base text', the first such edge is kept and every other edge
# is deleted from the graph.  Otherwise the first edge is relabelled with
# all labels joined by ', ' and the rest are deleted.
sub remove_duplicate_edges {
    my( $graph, $from, $to ) = @_;
    my @parallel = $from->edges_to( $to );
    if( scalar @parallel > 1 ) {
        my( $base_edge ) = grep { $_->label eq 'base text' } @parallel;
        if( defined $base_edge ) {
            # The base text edge wins; drop everything else.
            foreach my $extra ( grep { $_ ne $base_edge } @parallel ) {
                $graph->del_edge( $extra );
            }
        } else {
            # No base text here: fold all labels into the first edge.
            my $joined_label = join( ', ', map { $_->label() } @parallel );
            my( $survivor, @redundant ) = @parallel;
            $survivor->set_attribute( 'label', $joined_label );
            $graph->del_edge( $_ ) foreach @redundant;
        }
    }
}

=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

# Returns the comparison form of a node's label: lower-cased, stripped of
# every non-word character, and then passed through a fixed list of
# spelling equivalences, applied in the order given.
sub cmp_str {
    my( $node ) = @_;
    my $normal = lc( $node->label() );
    $normal =~ s/\W//g;

    # Hard-coded orthographic equivalences; order matters, since later
    # rules see the output of earlier ones.
    my @rewrites = (
        [ qr/v/       => 'u' ],
        [ qr/j/       => 'i' ],
        [ qr/cha/     => 'ca' ],
        [ qr/quatuor/ => 'quattuor' ],
        [ qr/ioannes/ => 'iohannes' ],
    );
    foreach my $rule ( @rewrites ) {
        my( $from, $to ) = @$rule;
        $normal =~ s/$from/$to/g;
    }
    return $normal;
}

=back

=head1 LICENSE

This package is free software and is provided "as is" without express
or implied warranty.  You can redistribute it and/or modify it under
the same terms as Perl itself.

=head1 AUTHOR

Tara L Andrews, aurum@cpan.org

=cut

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
52ce987f	5	use Module::Load;
b49c4318	6
2ceca8c3	7	=head1 NAME
	8
	9	Text::Tradition::Parser::BaseText
	10
	11	=head1 SYNOPSIS
	12
	13	use Text::Tradition::Parser::BaseText qw( merge_base );
	14	merge_base( $graph, 'reference.txt', @apparatus_entries )
	15
	16	=head1 DESCRIPTION
	17
	18	For an overview of the package, see the documentation for the
	19	Text::Tradition::Graph module.
	20
	21	This module is meant for use with certain of the other Parser classes
	22	- whenever a list of variants is given with reference to a base text,
	23	these must be joined into a single collation. The parser should
	24	therefore make a list of variants and their locations, and BaseText
	25	will join those listed variants onto the reference text.
	26
	27	=head1 SUBROUTINES
	28
	29	=over
	30
52ce987f	31	=item B<parse>
	32
	33	parse( $graph, %opts );
	34
	35	Takes an initialized graph and a set of options, which must include:
	36	- 'base' - the base text referenced by the variants
	37	- 'format' - the format of the variant list
	38	- 'data' - the variants, in the given format.
	39
	40	=cut
	41
	42	sub parse {
	43	my( $graph, %opts ) = @_;
	44
	45	my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
	46	load( $format_mod );
	47	my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
	48	merge_base( $graph, $opts{'base'}, @apparatus_entries );
	49	}
	50
2ceca8c3	51	=item B<merge_base>
	52
	53	merge_base( $graph, 'reference.txt', @apparatus_entries )
	54
	55	Takes three arguments: a newly-initialized Text::Tradition::Graph
	56	object, a text file containing the reference text, and a list of
	57	variants (apparatus entries). Adds the base text to the graph, and
	58	joins the variants to that.
	59
	60	The list of variants is an array of hash references; each hash takes
	61	the form
	62	{ '_id' => line reference,
	63	'rdg_0' => lemma reading,
	64	'rdg_1' => first variant,
	65	... # and so on until all distinct readings are listed
	66	'WitnessA' => 'rdg_0',
	67	'WitnessB' => 'rdg_1',
	68	... # and so on until all witnesses are listed with their readings
	69	}
	70
	71	Any hash key that is not of the form /^rdg_\d+$/ and that does not
	72	begin with an underscore is assumed to be a witness name. Any 'meta'
	73	information to be passed must be passed in a key with a leading
	74	underscore in its name.
	75
	76	=cut
	77
b49c4318	78	sub merge_base {
	79	my( $graph, $base_file, @app_entries ) = @_;
	80	my @base_line_starts = read_base( $base_file, $graph );
	81
52ce987f	82	my %all_witnesses;
b49c4318	83	foreach my $app ( @app_entries ) {
	84	my( $line, $num ) = split( /\./, $app->{_id} );
	85	# DEBUG with a short graph
	86	# last if $line > 2;
2ceca8c3	87	# DEBUG for problematic entries
e49731d7	88	my $scrutinize = "";
b49c4318	89	my $first_line_node = $base_line_starts[ $line ];
	90	my $too_far = $base_line_starts[ $line+1 ];
	91
	92	my $lemma = $app->{rdg_0};
	93	my $seq = 1;
	94	# Is this the Nth occurrence of this reading in the line?
	95	if( $lemma =~ s/(_)?(\d)$// ) {
	96	$seq = $2;
	97	}
	98	my @lemma_words = split( /\s+/, $lemma );
	99
	100	# Now search for the lemma words within this line.
	101	my $lemma_start = $first_line_node;
	102	my $lemma_end;
	103	my %seen;
	104	while( $lemma_start ne $too_far ) {
	105	# Loop detection
	106	if( $seen{ $lemma_start->name() } ) {
	107	warn "Detected loop at " . $lemma_start->name() .
	108	", ref $line,$num";
	109	last;
	110	}
	111	$seen{ $lemma_start->name() } = 1;
	112
	113	# Try to match the lemma.
	114	my $unmatch = 0;
	115	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	116	$lemma_words[0] . "...\n"
	117	if "$line.$num" eq $scrutinize;
	118	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	119	# Skip it if we need a match that is not the first.
	120	if( --$seq < 1 ) {
	121	# Now we have to compare the rest of the words here.
	122	if( scalar( @lemma_words ) > 1 ) {
	123	my $next_node = $graph->next_word( $lemma_start );
	124	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	125	printf STDERR "Now matching %s against %s\n",
	126	cmp_str($next_node), $w
	127	if "$line.$num" eq $scrutinize;
	128	if( $w ne cmp_str($next_node) ) {
	129	$unmatch = 1;
	130	last;
	131	} else {
	132	$lemma_end = $next_node;
	133	$next_node = $graph->next_word( $lemma_end );
	134	}
	135	}
	136	} else {
	137	$lemma_end = $lemma_start;
	138	}
	139	} else {
	140	$unmatch = 1;
	141	}
	142	}
	143	last unless ( $unmatch || !defined( $lemma_end ) );
	144	$lemma_end = undef;
	145	$lemma_start = $graph->next_word( $lemma_start );
	146	}
	147
	148	unless( $lemma_end ) {
	149	warn "No match found for @lemma_words at $line.$num";
	150	next;
	151	} else {
	152	# These are no longer common nodes; unmark them as such.
153	my @lemma_nodes = $graph->node_sequence( $lemma_start,
154	$lemma_end );
155	map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
156	}
157
158	# Now we have our lemma nodes; we add the variant nodes to the graph.
159
e49731d7	160	# Keep track of the start and end point of each reading for later
	161	# node collapse.
	162	my @readings = ( $lemma_start, $lemma_end );
	163
b49c4318	164	# For each reading that is not rdg_0, we make a chain of nodes
	165	# and connect them to the anchor. Edges are named after the mss
	166	# that are relevant.
	167	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	168	next if $k eq 'rdg_0'; # that's the lemma.
	169	my @variant = split( /\s+/, $app->{$k} );
	170	@variant = () if $app->{$k} eq '/'; # This is an omission.
	171	my @mss = grep { $app->{$_} eq $k } keys( %$app );
	172
	173	unless( @mss ) {
	174	print STDERR "Skipping '@variant' at $line.$num: no mss\n";
	175	next;
	176	}
	177
	178	# Determine the label name for the edges here.
	179	my $edge_name = join(', ', @mss );
52ce987f	180	@all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
b49c4318	181
	182	# Make the variant into a set of nodes.
	183	my $ctr = 0;
	184	my $last_node = $graph->prior_word( $lemma_start );
	185	my $var_start;
	186	foreach my $vw ( @variant ) {
	187	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
	188	my $vwnode = $graph->add_node( $vwname );
	189	$vwnode->set_attribute( 'label', $vw );
	190	$vwnode->set_attribute( 'class', 'variant' );
	191	$graph->add_edge( $last_node, $vwnode, $edge_name );
	192	$var_start = $vwnode unless $var_start;
	193	$last_node = $vwnode;
	194	}
	195	# Now hook it up at the end.
	196	$graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
	197	$edge_name );
	198
e49731d7	199	if( $var_start ) { # if it wasn't an empty reading
	200	push( @readings, $var_start, $last_node );
	201	}
b49c4318	202	}
e49731d7	203
	204	# Now collate and collapse the identical nodes within the graph.
	205	collate_variants( $graph, @readings );
b49c4318	206	}
	207
	208	## Now in theory I have a graph. I want to make it a little easier to
	209	## read. So I collapse nodes that have only one edge in and one edge
	210	## out, and I do this by looking at the edges.
	211
52ce987f	212	# foreach my $edge ( $graph->edges() ) {
	213	# my @out_edges = $edge->from()->outgoing();
	214	# my @in_edges = $edge->to()->incoming();
b49c4318	215
52ce987f	216	# next if $edge->from() eq $graph->start();
	217	# next if $edge->to()->name() eq '#END#';
	218	# next unless scalar( @out_edges ) == 1;
	219	# next unless scalar( @in_edges ) == 1;
	220	# next unless $out_edges[0] eq $in_edges[0];
	221	# # In theory if we've got this far, we're safe, but just to
	222	# # double-check...
	223	# next unless $out_edges[0] eq $edge;
b49c4318	224
52ce987f	225	# $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
	226	# }
	227
	228	# Now walk the path for each witness, so that we can do the
	229	# position calculations.
	230	my $paths = {};
	231	foreach my $w ( keys %all_witnesses ) {
	232	my $back = undef;
	233	if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
	234	$back = $1;
	235	}
	236	my @wit_nodes = $graph->node_sequence( $graph->start,
	237	$graph->node( '#END#' ),
	238	$w, $back );
	239	my @wn_names = map { $_->name() } @wit_nodes;
	240	$paths->{$w} = \@wn_names;
b49c4318	241	}
52ce987f	242	$DB::single = 1;
	243	my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
	244	$graph->make_positions( \@common_nodes, $paths );
b49c4318	245	}
b49c4318	246
2ceca8c3	247	=item B<read_base>
	248
	249	my @line_beginnings = read_base( 'reference.txt', $graph );
	250
	251	Takes a text file and a (presumed empty) graph object, adds the words
	252	as simple linear nodes to the graph, and returns a list of nodes that
	253	represent the beginning of lines. This graph is now the starting point
	254	for application of apparatus entries in merge_base, e.g. from a CSV
	255	file or a Classical Text Editor file.
	256
	257	=cut
b49c4318	258
	259	sub read_base {
	260	my( $base_file, $graph ) = @_;
	261
	262	# This array gives the first node for each line. We put the
	263	# common starting point in line zero.
	264	my $last_node = $graph->start();
	265	my $lineref_array = [ $last_node ]; # There is no line zero.
	266
	267	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
	268	while(<BASE>) {
	269	# Make the nodes, and connect them up for the base, but also
	270	# save the first node of each line in an array for the purpose.
	271	chomp;
	272	my @words = split;
	273	my $started = 0;
	274	my $wordref = 0;
	275	my $lineref = scalar @$lineref_array;
	276	foreach my $w ( @words ) {
	277	my $noderef = join( ',', $lineref, ++$wordref );
	278	my $node = $graph->add_node( $noderef );
	279	$node->set_attribute( 'label', $w );
	280	$node->set_attribute( 'class', 'common' );
	281	unless( $started ) {
	282	push( @$lineref_array, $node );
	283	$started = 1;
	284	}
	285	if( $last_node ) {
e49731d7	286	my $edge = $graph->add_edge( $last_node, $node, "base text" );
e49731d7	287	$edge->set_attribute( 'class', 'basetext' );
b49c4318	288	$last_node = $node;
	289	} # TODO there should be no else here...
	290	}
	291	}
	292	close BASE;
	293	# Ending point for all texts
	294	my $endpoint = $graph->add_node( '#END#' );
	295	$graph->add_edge( $last_node, $endpoint, "base text" );
	296	push( @$lineref_array, $endpoint );
	297
	298	return( @$lineref_array );
	299	}
	300
e49731d7	301	=item B<collate_variants>
2ceca8c3	302
e49731d7	303	collate_variants( $graph, @readings )
2ceca8c3	304
e49731d7	305	Given a set of readings in the form
e49731d7	306	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
2ceca8c3	307	walks through each to identify those nodes that are identical. The
e49731d7	308	graph is a Text::Tradition::Graph object; the elements of @readings are
2ceca8c3	309	Graph::Easy::Node objects that appear on the graph.
b49c4318	310
2ceca8c3	311	TODO: Handle collapsed and non-collapsed transpositions.
	312
	313	=cut
b49c4318	314
e49731d7	315	sub collate_variants {
	316	my( $graph, @readings ) = @_;
	317	my $lemma_start = shift @readings;
	318	my $lemma_end = shift @readings;
52ce987f	319	my $detranspose = 0;
b49c4318	320
e49731d7	321	# Start the list of distinct nodes with those nodes in the lemma.
e49731d7	322	my @distinct_nodes;
b49c4318	323	while( $lemma_start ne $lemma_end ) {
e49731d7	324	push( @distinct_nodes, [ $lemma_start, 'base text' ] );
b49c4318	325	$lemma_start = $graph->next_word( $lemma_start );
b49c4318	326	}
e49731d7	327	push( @distinct_nodes, [ $lemma_end, 'base text' ] );
b49c4318	328
e49731d7	329
	330	while( scalar @readings ) {
	331	my( $var_start, $var_end ) = splice( @readings, 0, 2 );
	332
	333	# I want to look at the nodes in the variant and lemma, and
	334	# collapse nodes that are the same word. This is mini-collation.
	335	# Each word in the 'main' list can only be collapsed once with a
	336	# word from the current reading.
	337	my %collapsed = ();
	338
	339	# Get the label. There will only be one outgoing edge to start
	340	# with, so this is safe.
	341	my @out = $var_start->outgoing();
	342	my $var_label = $out[0]->label();
	343
	344	my @variant_nodes;
	345	while( $var_start ne $var_end ) {
	346	push( @variant_nodes, $var_start );
	347	$var_start = $graph->next_word( $var_start, $var_label );
	348	}
	349	push( @variant_nodes, $var_end );
	350
	351	# Go through the variant nodes, and if we find a lemma node that
	352	# hasn't yet been collapsed with a node, equate them. If we do
	353	# not, keep them to push onto the end of all_nodes.
	354	my @remaining_nodes;
	355	my $last_index = 0;
	356	foreach my $w ( @variant_nodes ) {
	357	my $word = $w->label();
	358	my $matched = 0;
	359	foreach my $idx ( $last_index .. $#distinct_nodes ) {
	360	my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
	361	if( $word eq cmp_str( $l ) ) {
	362	next if exists( $collapsed{ $l->label } )
	363	&& $collapsed{ $l->label } eq $l;
	364	$matched = 1;
	365	$last_index = $idx if $detranspose;
	366	# Collapse the nodes.
	367	printf STDERR "Merging nodes %s/%s and %s/%s\n",
	368	$l->name, $l->label, $w->name, $w->label;
	369	$graph->merge_nodes( $l, $w );
	370	$collapsed{ $l->label } = $l;
	371	# Now collapse any multiple edges to and from the node.
	372	remove_duplicate_edges( $graph,
	373	$graph->prior_word( $l, $edgelabel ), $l );
	374	remove_duplicate_edges( $graph, $l,
	375	$graph->next_word( $l, $edgelabel ) );
52ce987f	376	last if $matched;
e49731d7	377	}
b49c4318	378	}
e49731d7	379	push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
b49c4318	380	}
e49731d7	381	push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
b49c4318	382	}
	383	}
	384
2ceca8c3	385	=item B<remove_duplicate_edges>
	386
	387	remove_duplicate_edges( $graph, $from, $to );
	388
	389	Given two nodes, reduce the number of edges between those nodes to
	390	one. If neither edge represents a base text, combine their labels.
	391
	392	=cut
	393
b49c4318	394	sub remove_duplicate_edges {
	395	my( $graph, $from, $to ) = @_;
	396	my @edges = $from->edges_to( $to );
	397	if( scalar @edges > 1 ) {
	398	my @base = grep { $_->label eq 'base text' } @edges;
	399	if ( scalar @base ) {
	400	# Remove the edges that are not base.
	401	foreach my $e ( @edges ) {
	402	$graph->del_edge( $e )
	403	unless $e eq $base[0];
	404	}
	405	} else {
	406	# Combine the edges into one.
	407	my $new_edge_name = join( ', ', map { $_->label() } @edges );
	408	my $new_edge = shift @edges;
	409	$new_edge->set_attribute( 'label', $new_edge_name );
	410	foreach my $e ( @edges ) {
	411	$graph->del_edge( $e );
	412	}
	413	}
	414	}
	415	}
	416
2ceca8c3	417	=item B<cmp_str>
	418
	419	Pretend you never saw this method. Really it needs to not be hardcoded.
	420
	421	=cut
	422
b49c4318	423	sub cmp_str {
	424	my( $node ) = @_;
	425	my $word = $node->label();
	426	$word = lc( $word );
	427	$word =~ s/\W//g;
	428	$word =~ s/v/u/g;
	429	$word =~ s/j/i/g;
	430	$word =~ s/cha/ca/g;
	431	$word =~ s/quatuor/quattuor/g;
	432	$word =~ s/ioannes/iohannes/g;
	433	return $word;
	434	}
	435
2ceca8c3	436	=back
	437
	438	=head1 LICENSE
	439
	440	This package is free software and is provided "as is" without express
	441	or implied warranty. You can redistribute it and/or modify it under
	442	the same terms as Perl itself.
	443
	444	=head1 AUTHOR
	445
	446	Tara L Andrews, aurum@cpan.org
	447
	448	=cut
	449
b49c4318	450	1;