[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / BaseText.pm

package Text::Tradition::Parser::BaseText;

use strict;
use warnings;
use Exporter 'import';
use vars qw( @EXPORT_OK );
@EXPORT_OK = qw( merge_base );

sub merge_base {
    my( $graph, $base_file, @app_entries ) = @_;
    my @base_line_starts = read_base( $base_file, $graph );

    foreach my $app ( @app_entries ) {
	my( $line, $num ) = split( /\./, $app->{_id} );
	# DEBUG with a short graph
	# last if $line > 2;
	my $scrutinize = "21.8";
	my $first_line_node = $base_line_starts[ $line ];
	my $too_far = $base_line_starts[ $line+1 ];
	
	my $lemma = $app->{rdg_0};
	my $seq = 1; 
	# Is this the Nth occurrence of this reading in the line?
	if( $lemma =~ s/(_)?(\d)$// ) {
	    $seq = $2;
	}
	my @lemma_words = split( /\s+/, $lemma );
	
	# Now search for the lemma words within this line.
	my $lemma_start = $first_line_node;
	my $lemma_end;
	my %seen;
	while( $lemma_start ne $too_far ) {
	    # Loop detection
	    if( $seen{ $lemma_start->name() } ) {
		warn "Detected loop at " . $lemma_start->name() . 
		    ", ref $line,$num";
		last;
	    }
	    $seen{ $lemma_start->name() } = 1;
	    
	    # Try to match the lemma.
	    my $unmatch = 0;
	    print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
		$lemma_words[0] . "...\n"
		if "$line.$num" eq $scrutinize;
	    if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
		# Skip it if we need a match that is not the first.
		if( --$seq < 1 ) {
		    # Now we have to compare the rest of the words here.
		    if( scalar( @lemma_words ) > 1 ) {
			my $next_node = $graph->next_word( $lemma_start );
			foreach my $w ( @lemma_words[1..$#lemma_words] ) {
			    printf STDERR "Now matching %s against %s\n", 
				    cmp_str($next_node), $w
				if "$line.$num" eq $scrutinize;
			    if( $w ne cmp_str($next_node) ) {
				$unmatch = 1;
				last;
			    } else {
				$lemma_end = $next_node;
				$next_node = $graph->next_word( $lemma_end );
			    }
			}
		    } else {
			$lemma_end = $lemma_start;
		    }
		} else {
		    $unmatch = 1;
		}
	    }
	    last unless ( $unmatch || !defined( $lemma_end ) );
	    $lemma_end = undef;
	    $lemma_start = $graph->next_word( $lemma_start );
	}
	
	unless( $lemma_end ) {
	    warn "No match found for @lemma_words at $line.$num";
	    next;
	} else {
	    # These are no longer common nodes; unmark them as such.
	    my @lemma_nodes = $graph->node_sequence( $lemma_start, 
						     $lemma_end );
	    map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
	}
	
	# Now we have our lemma nodes; we add the variant nodes to the graph.
	
	# For each reading that is not rdg_0, we make a chain of nodes
	# and connect them to the anchor.  Edges are named after the mss
	# that are relevant.
	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
	    next if $k eq 'rdg_0'; # that's the lemma.
	    my @variant = split( /\s+/, $app->{$k} );
	    @variant = () if $app->{$k} eq '/'; # This is an omission.
	    my @mss = grep { $app->{$_} eq $k } keys( %$app );
	    
	    unless( @mss ) {
		print STDERR "Skipping '@variant' at $line.$num: no mss\n";
		next;
	    }
	    
	    # Determine the label name for the edges here.
	    my $edge_name = join(', ', @mss );
	    
	    # Make the variant into a set of nodes.
	    my $ctr = 0;
	    my $last_node = $graph->prior_word( $lemma_start );
	    my $var_start;
	    foreach my $vw ( @variant ) {
		my $vwname = "$k/$line.$num.$ctr"; $ctr++;
		my $vwnode = $graph->add_node( $vwname );
		$vwnode->set_attribute( 'label', $vw );
		$vwnode->set_attribute( 'class', 'variant' );
		$graph->add_edge( $last_node, $vwnode, $edge_name );
		$var_start = $vwnode unless $var_start;
		$last_node = $vwnode;
	    }
	    # Now hook it up at the end.
	    $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
					$edge_name );
	    
	    # Now collate and collapse the identical nodes within the graph.
	    collate_variant( $graph, $lemma_start, $lemma_end, 
			     $var_start, $last_node );
	    
	}
    }

    ## Now in theory I have a graph.  I want to make it a little easier to
    ## read.  So I collapse nodes that have only one edge in and one edge
    ## out, and I do this by looking at the edges.
    
    foreach my $edge ( $graph->edges() ) {
	my @out_edges = $edge->from()->outgoing();
	my @in_edges = $edge->to()->incoming();
	
	next unless scalar( @out_edges ) == 1;
	next unless scalar( @in_edges ) == 1;
	next unless $out_edges[0] eq $in_edges[0];
	# In theory if we've got this far, we're safe, but just to
	# double-check...
	next unless $out_edges[0] eq $edge;
	
	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
    }
}

# read_base: Takes a text file and a (presumed empty) graph object,
# adds the words as simple linear nodes to the graph, and returns a
# list of nodes that represent the beginning of lines. This graph is
# now the starting point for application of apparatus entries in
# merge_base, e.g. from a CSV file or a CTE file.

sub read_base {
    my( $base_file, $graph ) = @_;
    
    # This array gives the first node for each line.  We put the
    # common starting point in line zero.
    my $last_node = $graph->start();
    my $lineref_array = [ $last_node ]; # There is no line zero.

    open( BASE, $base_file ) or die "Could not open file $base_file: $!";
    while(<BASE>) {
	# Make the nodes, and connect them up for the base, but also
	# save the first node of each line in an array for the purpose.
	chomp;
	my @words = split;
	my $started = 0;
	my $wordref = 0;
	my $lineref = scalar @$lineref_array;
	foreach my $w ( @words ) {
	    my $noderef = join( ',', $lineref, ++$wordref );
	    my $node = $graph->add_node( $noderef );
	    $node->set_attribute( 'label', $w );
	    $node->set_attribute( 'class', 'common' );
	    unless( $started ) {
		push( @$lineref_array, $node );
		$started = 1;
	    }
	    if( $last_node ) {
		$graph->add_edge( $last_node, $node, "base text" );
		$last_node = $node;
	    } # TODO there should be no else here...
	}
    }
    close BASE;
    # Ending point for all texts
    my $endpoint = $graph->add_node( '#END#' );
    $graph->add_edge( $last_node, $endpoint, "base text" );
    push( @$lineref_array, $endpoint );

    return( @$lineref_array );
}


## Helper methods for merge_base

sub collate_variant {
    my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
    # If var_start is undef, then the variant is an omission and
    # there's nothing to collate. Return.
    return unless $var_start;

    # I want to look at the nodes in the variant and lemma, and
    # collapse nodes that are the same word.  This is mini-collation.
    my %collapsed = ();
    # There will only be one outgoing edge at first, so this is safe.
    my @out = $var_start->outgoing();
    my $var_label = $out[0]->label();

    my @lemma_nodes;
    while( $lemma_start ne $lemma_end ) {
	push( @lemma_nodes, $lemma_start );
	$lemma_start = $graph->next_word( $lemma_start );
    } 
    push( @lemma_nodes, $lemma_end );
    
    my @variant_nodes;
    while( $var_start ne $var_end ) {
	push( @variant_nodes, $var_start );
	$var_start = $graph->next_word( $var_start, $var_label );
    }
    push( @variant_nodes, $var_end );

    # Go through the variant nodes, and if we find a lemma node that
    # hasn't yet been collapsed with a node, equate them.

    foreach my $w ( @variant_nodes ) {
	my $word = $w->label();
	foreach my $l ( @lemma_nodes ) {
	    if( $word eq cmp_str( $l ) ) {
		next if exists( $collapsed{ $l->label } )
		    && $collapsed{ $l->label } eq $l;
		# Collapse the nodes.
		printf STDERR "Merging nodes %s/%s and %s/%s\n", 
		    $l->name, $l->label, $w->name, $w->label;
		$graph->merge_nodes( $l, $w );
		$collapsed{ $l->label } = $l;
		# Now collapse any multiple edges to and from the node.
		# Rely on the presence of the 'base text' edge.
		remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
		remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
	    }
	}
    }
}

sub remove_duplicate_edges {
    my( $graph, $from, $to ) = @_;
    my @edges = $from->edges_to( $to );
    if( scalar @edges > 1 ) {
	my @base = grep { $_->label eq 'base text' } @edges;
	if ( scalar @base ) {
	    # Remove the edges that are not base.
	    foreach my $e ( @edges ) {
		$graph->del_edge( $e )
		    unless $e eq $base[0];
	    }
	} else {
	    # Combine the edges into one.
	    my $new_edge_name = join( ', ', map { $_->label() } @edges );
	    my $new_edge = shift @edges;
	    $new_edge->set_attribute( 'label', $new_edge_name );
	    foreach my $e ( @edges ) {
		$graph->del_edge( $e );
	    }
	}
    }
}

# TODO need to make this configurable!
sub cmp_str {
    my( $node ) = @_;
    my $word = $node->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

1;
Commit	Line	Data
e58153d6	1	package Text::Tradition::Parser::BaseText;
b49c4318	2
	3	use strict;
	4	use warnings;
	5	use Exporter 'import';
	6	use vars qw( @EXPORT_OK );
	7	@EXPORT_OK = qw( merge_base );
	8
	9	sub merge_base {
	10	my( $graph, $base_file, @app_entries ) = @_;
	11	my @base_line_starts = read_base( $base_file, $graph );
	12
	13	foreach my $app ( @app_entries ) {
	14	my( $line, $num ) = split( /\./, $app->{_id} );
	15	# DEBUG with a short graph
	16	# last if $line > 2;
	17	my $scrutinize = "21.8";
	18	my $first_line_node = $base_line_starts[ $line ];
	19	my $too_far = $base_line_starts[ $line+1 ];
	20
	21	my $lemma = $app->{rdg_0};
	22	my $seq = 1;
	23	# Is this the Nth occurrence of this reading in the line?
	24	if( $lemma =~ s/(_)?(\d)$// ) {
	25	$seq = $2;
	26	}
	27	my @lemma_words = split( /\s+/, $lemma );
	28
	29	# Now search for the lemma words within this line.
	30	my $lemma_start = $first_line_node;
	31	my $lemma_end;
	32	my %seen;
	33	while( $lemma_start ne $too_far ) {
	34	# Loop detection
	35	if( $seen{ $lemma_start->name() } ) {
	36	warn "Detected loop at " . $lemma_start->name() .
	37	", ref $line,$num";
	38	last;
	39	}
	40	$seen{ $lemma_start->name() } = 1;
	41
	42	# Try to match the lemma.
	43	my $unmatch = 0;
	44	print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
	45	$lemma_words[0] . "...\n"
	46	if "$line.$num" eq $scrutinize;
	47	if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
	48	# Skip it if we need a match that is not the first.
	49	if( --$seq < 1 ) {
	50	# Now we have to compare the rest of the words here.
	51	if( scalar( @lemma_words ) > 1 ) {
	52	my $next_node = $graph->next_word( $lemma_start );
	53	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
	54	printf STDERR "Now matching %s against %s\n",
	55	cmp_str($next_node), $w
	56	if "$line.$num" eq $scrutinize;
	57	if( $w ne cmp_str($next_node) ) {
	58	$unmatch = 1;
	59	last;
	60	} else {
	61	$lemma_end = $next_node;
	62	$next_node = $graph->next_word( $lemma_end );
	63	}
	64	}
	65	} else {
66	$lemma_end = $lemma_start;
67	}
68	} else {
69	$unmatch = 1;
70	}
71	}
72	last unless ( $unmatch \|\| !defined( $lemma_end ) );
73	$lemma_end = undef;
74	$lemma_start = $graph->next_word( $lemma_start );
75	}
76
77	unless( $lemma_end ) {
78	warn "No match found for @lemma_words at $line.$num";
79	next;
80	} else {
81	# These are no longer common nodes; unmark them as such.
82	my @lemma_nodes = $graph->node_sequence( $lemma_start,
83	$lemma_end );
84	map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
85	}
86
87	# Now we have our lemma nodes; we add the variant nodes to the graph.
88
89	# For each reading that is not rdg_0, we make a chain of nodes
90	# and connect them to the anchor. Edges are named after the mss
91	# that are relevant.
92	foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
93	next if $k eq 'rdg_0'; # that's the lemma.
94	my @variant = split( /\s+/, $app->{$k} );
95	@variant = () if $app->{$k} eq '/'; # This is an omission.
96	my @mss = grep { $app->{$_} eq $k } keys( %$app );
97
98	unless( @mss ) {
99	print STDERR "Skipping '@variant' at $line.$num: no mss\n";
100	next;
101	}
102
103	# Determine the label name for the edges here.
104	my $edge_name = join(', ', @mss );
105
106	# Make the variant into a set of nodes.
107	my $ctr = 0;
108	my $last_node = $graph->prior_word( $lemma_start );
109	my $var_start;
110	foreach my $vw ( @variant ) {
111	my $vwname = "$k/$line.$num.$ctr"; $ctr++;
112	my $vwnode = $graph->add_node( $vwname );
113	$vwnode->set_attribute( 'label', $vw );
114	$vwnode->set_attribute( 'class', 'variant' );
115	$graph->add_edge( $last_node, $vwnode, $edge_name );
116	$var_start = $vwnode unless $var_start;
117	$last_node = $vwnode;
118	}
119	# Now hook it up at the end.
120	$graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
121	$edge_name );
122
123	# Now collate and collapse the identical nodes within the graph.
124	collate_variant( $graph, $lemma_start, $lemma_end,
125	$var_start, $last_node );
126
127	}
128	}
129
130	## Now in theory I have a graph. I want to make it a little easier to
131	## read. So I collapse nodes that have only one edge in and one edge
132	## out, and I do this by looking at the edges.
133
134	foreach my $edge ( $graph->edges() ) {
135	my @out_edges = $edge->from()->outgoing();
136	my @in_edges = $edge->to()->incoming();
137
138	next unless scalar( @out_edges ) == 1;
139	next unless scalar( @in_edges ) == 1;
140	next unless $out_edges[0] eq $in_edges[0];
141	# In theory if we've got this far, we're safe, but just to
142	# double-check...
143	next unless $out_edges[0] eq $edge;
144
145	$graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
146	}
147	}
148
149	# read_base: Takes a text file and a (presumed empty) graph object,
150	# adds the words as simple linear nodes to the graph, and returns a
151	# list of nodes that represent the beginning of lines. This graph is
152	# now the starting point for application of apparatus entries in
153	# merge_base, e.g. from a CSV file or a CTE file.
154
155	sub read_base {
156	my( $base_file, $graph ) = @_;
157
158	# This array gives the first node for each line. We put the
159	# common starting point in line zero.
160	my $last_node = $graph->start();
161	my $lineref_array = [ $last_node ]; # There is no line zero.
162
163	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
164	while(<BASE>) {
165	# Make the nodes, and connect them up for the base, but also
166	# save the first node of each line in an array for the purpose.
167	chomp;
168	my @words = split;
169	my $started = 0;
170	my $wordref = 0;
171	my $lineref = scalar @$lineref_array;
172	foreach my $w ( @words ) {
173	my $noderef = join( ',', $lineref, ++$wordref );
174	my $node = $graph->add_node( $noderef );
175	$node->set_attribute( 'label', $w );
176	$node->set_attribute( 'class', 'common' );
177	unless( $started ) {
178	push( @$lineref_array, $node );
179	$started = 1;
180	}
181	if( $last_node ) {
182	$graph->add_edge( $last_node, $node, "base text" );
183	$last_node = $node;
184	} # TODO there should be no else here...
185	}
186	}
187	close BASE;
188	# Ending point for all texts
189	my $endpoint = $graph->add_node( '#END#' );
190	$graph->add_edge( $last_node, $endpoint, "base text" );
191	push( @$lineref_array, $endpoint );
192
193	return( @$lineref_array );
194	}
195
196
197	## Helper methods for merge_base
198
199	sub collate_variant {
200	my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
201	# If var_start is undef, then the variant is an omission and
202	# there's nothing to collate. Return.
203	return unless $var_start;
204
205	# I want to look at the nodes in the variant and lemma, and
206	# collapse nodes that are the same word. This is mini-collation.
207	my %collapsed = ();
208	# There will only be one outgoing edge at first, so this is safe.
209	my @out = $var_start->outgoing();
210	my $var_label = $out[0]->label();
211
212	my @lemma_nodes;
213	while( $lemma_start ne $lemma_end ) {
214	push( @lemma_nodes, $lemma_start );
215	$lemma_start = $graph->next_word( $lemma_start );
216	}
217	push( @lemma_nodes, $lemma_end );
218
219	my @variant_nodes;
220	while( $var_start ne $var_end ) {
221	push( @variant_nodes, $var_start );
222	$var_start = $graph->next_word( $var_start, $var_label );
223	}
224	push( @variant_nodes, $var_end );
225
226	# Go through the variant nodes, and if we find a lemma node that
227	# hasn't yet been collapsed with a node, equate them.
228
229	foreach my $w ( @variant_nodes ) {
230	my $word = $w->label();
231	foreach my $l ( @lemma_nodes ) {
232	if( $word eq cmp_str( $l ) ) {
233	next if exists( $collapsed{ $l->label } )
234	&& $collapsed{ $l->label } eq $l;
235	# Collapse the nodes.
236	printf STDERR "Merging nodes %s/%s and %s/%s\n",
237	$l->name, $l->label, $w->name, $w->label;
238	$graph->merge_nodes( $l, $w );
239	$collapsed{ $l->label } = $l;
240	# Now collapse any multiple edges to and from the node.
241	# Rely on the presence of the 'base text' edge.
242	remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
243	remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
244	}
245	}
246	}
247	}
248
249	sub remove_duplicate_edges {
250	my( $graph, $from, $to ) = @_;
251	my @edges = $from->edges_to( $to );
252	if( scalar @edges > 1 ) {
253	my @base = grep { $_->label eq 'base text' } @edges;
254	if ( scalar @base ) {
255	# Remove the edges that are not base.
256	foreach my $e ( @edges ) {
257	$graph->del_edge( $e )
258	unless $e eq $base[0];
259	}
260	} else {
261	# Combine the edges into one.
262	my $new_edge_name = join( ', ', map { $_->label() } @edges );
263	my $new_edge = shift @edges;
264	$new_edge->set_attribute( 'label', $new_edge_name );
265	foreach my $e ( @edges ) {
266	$graph->del_edge( $e );
267	}
268	}
269	}
270	}
271
272	# TODO need to make this configurable!
273	sub cmp_str {
274	my( $node ) = @_;
275	my $word = $node->label();
276	$word = lc( $word );
277	$word =~ s/\W//g;
278	$word =~ s/v/u/g;
279	$word =~ s/j/i/g;
280	$word =~ s/cha/ca/g;
281	$word =~ s/quatuor/quattuor/g;
282	$word =~ s/ioannes/iohannes/g;
283	return $word;
284	}
285
286	1;