Commit | Line | Data |
910a0a6d |
1 | package Text::Tradition::Parser::Util; |
2 | |
3 | use strict; |
4 | use warnings; |
5 | use Algorithm::Diff; |
6 | use Exporter 'import'; |
7 | use vars qw/ @EXPORT_OK /; |
8 | @EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /; |
9 | |
25331c49 |
10 | =head1 NAME |
910a0a6d |
11 | |
25331c49 |
12 | Text::Tradition::Parser::Util |
13 | |
14 | =head1 DESCRIPTION |
15 | |
16 | A collection of utilities used by multiple Text::Tradition parsers. |
17 | Probably not of external interest. |
18 | |
19 | =head1 METHODS |
20 | |
21 | =head2 B<collate_variants>( $collation, @reading_ranges ) |
910a0a6d |
22 | |
23 | Given a set of readings in the form |
24 | ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) |
walks through each range to identify those readings that are identical.
The collation is a Text::Tradition::Collation object; the elements of
@reading_ranges are Text::Tradition::Collation::Reading objects that
appear on the collation graph.
29 | |
910a0a6d |
30 | =cut |
31 | |
# Collate a lemma reading set against any number of variant reading sets.
# Takes the collation object followed by the reading sets (arrayrefs of
# Reading objects); the first set is treated as the lemma.  Dispatches to
# the linear or nonlinear collation strategy depending on the collation's
# 'linear' attribute, and returns that strategy's substitution hashref.
sub collate_variants {
    my ( $collation, @reading_sets ) = @_;

    # Make sure the reading sets are unique, but keep the lemma first.
    # Sets are keyed by their stringified arrayref identity, so two
    # entries are "the same" only when they are the same arrayref.
    my $lemma = shift @reading_sets;
    my %unique_sets;
    foreach my $set ( @reading_sets ) {
        $unique_sets{$set} = $set;
    }
    delete $unique_sets{$lemma};
    my @sets = ( $lemma, values %unique_sets );

    # Two different ways to do this, depending on whether we want
    # transposed reading nodes to be merged into one (producing a
    # nonlinear, bidirectional graph) or not (producing a relatively
    # linear, unidirectional graph.)
    return $collation->linear
        ? _collate_linearly( $collation, @sets )
        : _collate_nonlinearly( $collation, @sets );
}
51 | |
027d819c |
# Collate variant sets against a growing list of unique readings using
# Algorithm::Diff.  Takes the collation, the lemma set (arrayref) and any
# number of variant sets; returns a hashref mapping the id of each
# merged-away reading to the Reading object it was merged into.
sub _collate_linearly {
    my( $collation, $lemma_set, @variant_sets ) = @_;

    my @unique;
    my $substitutions = {};
    # Seed the unique-reading list with the lemma readings.
    push( @unique, @$lemma_set );
    while( @variant_sets ) {
        my $variant_set = shift @variant_sets;
        # Use diff to do this job.  keyGen hashes each reading through
        # _collation_hash, so readings compare by (normalized) text
        # rather than by object identity.
        my $diff = Algorithm::Diff->new( \@unique, $variant_set,
                                         {'keyGen' => \&_collation_hash} );
        my @new_unique;
        my %merged;    # lemma-reading ids already merged within this variant set
        while( $diff->Next ) {
            if( $diff->Same ) {
                # merge the nodes
                my @l = $diff->Items( 1 );    # readings from @unique
                my @v = $diff->Items( 2 );    # matching readings from the variant set
                foreach my $i ( 0 .. $#l ) {
                    if( !$merged{$l[$i]->id} ) {
                        # Skip self-merges: the variant entry may already
                        # be the very same reading object.
                        next if $v[$i] eq $l[$i];
                        # print STDERR sprintf( "Merging %s into %s\n",
                        #                       $v[$i]->id,
                        #                       $l[$i]->id );
                        $collation->merge_readings( $l[$i], $v[$i] );
                        $merged{$l[$i]->id} = 1;
                        $substitutions->{$v[$i]->id} = $l[$i];
                    } else {
                        # A reading should be merged at most once per
                        # variant set; warn rather than merge again.
                        print STDERR "Would have double merged " . $l[$i]->id . "\n";
                    }
                }
                # splice the lemma nodes into the variant set, so later
                # passes see the canonical (merged) reading objects.
                # 'min2' is the start index of this hunk in sequence 2.
                my( $offset ) = $diff->Get( 'min2' );
                splice( @$variant_set, $offset, scalar( @l ), @l );
                push( @new_unique, @l );
            } else {
                # Keep the old unique readings
                push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
                # Add the new readings to the 'unique' list
                push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
            }
        }
        @unique = @new_unique;
    }
    return $substitutions;
}
98 | |
027d819c |
# Collate variant sets against a growing pool of unique readings by
# simple text matching (no diff), allowing transposed readings to merge.
# Takes the collation, the lemma set (arrayref) and any number of variant
# sets; returns a hashref mapping the id of each merged-away reading to
# the Reading object it was merged into.
sub _collate_nonlinearly {
    my( $collation, $lemma_set, @variant_sets ) = @_;

    my @unique;
    my $substitutions = {};
    push( @unique, @$lemma_set );
    while( @variant_sets ) {
        my $variant_set = shift @variant_sets;
        # Simply match the first reading that carries the same word, so
        # long as that reading has not yet been used to match another
        # word in this variant. That way lies loopy madness.
        my @distinct;
        my %merged;    # unique-reading ids consumed by this variant set
        foreach my $idx ( 0 .. $#{$variant_set} ) {
            my $vw = $variant_set->[$idx];
            my @same = grep { cmp_str( $_ ) eq $vw->text } @unique;
            my $matched;
            foreach my $candidate ( @same ) {
                next if $merged{$candidate->id};
                #print STDERR sprintf( "Merging %s into %s\n",
                #                      $vw->id, $candidate->id );
                $collation->merge_readings( $candidate, $vw );
                $merged{$candidate->id} = 1;
                $variant_set->[$idx] = $candidate;
                $substitutions->{$vw->id} = $candidate;
                $matched = 1;
                # BUGFIX: stop after the first successful merge.  The
                # original loop kept going and merged $vw into every
                # remaining unmerged same-text reading, clobbering the
                # recorded substitution and wrongly consuming readings
                # that later words in this variant might need.
                last;
            }
            # No match found (or all matches already consumed): this is
            # a genuinely new reading.
            push( @distinct, $vw ) unless $matched;
        }
        push( @unique, @distinct );
    }
    return $substitutions;
}
138 | |
# Key generator handed to Algorithm::Diff: readings hash to their
# (possibly normalized) text via cmp_str, so diff compares by word.
sub _collation_hash {
    return cmp_str( shift );
}
143 | |
027d819c |
144 | =head2 B<cmp_str> |
145 | |
146 | Don't use this. Really. |
147 | |
148 | =cut |
149 | |
910a0a6d |
# Return the comparison form of a reading's text.  For most traditions
# this is just the text itself; for the tradition whose name matches
# /158/ it applies a set of hard-coded orthographic normalizations
# (presumably Latin spelling variants — see the BUGS section).
sub cmp_str {
    my $reading = shift;
    my $word = $reading->text();
    return $word
        unless $reading->collation->tradition->name =~ /158/;
    $word = lc( $word );
    $word =~ s/\W//g;              # strip all non-word characters
    $word =~ tr/vj/ui/;            # v -> u, j -> i
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}
163 | |
25331c49 |
164 | =head2 B<check_for_repeated>( @readings ) |
910a0a6d |
165 | |
166 | Given an array of items, returns any items that appear in the array more |
167 | than once. |
168 | |
169 | =cut |
170 | |
# Given a list of objects responding to ->id, return the ids of those
# that occur more than once.  An id is reported once per extra
# occurrence (an item seen three times contributes its id twice).
sub check_for_repeated {
    my %seen;
    my @repeated;
    foreach my $item ( @_ ) {
        my $id = $item->id;
        # Post-increment: false (0) on first sight, true afterwards.
        push( @repeated, $id ) if $seen{$id}++;
    }
    return @repeated;
}
184 | |
027d819c |
185 | =head2 B<add_hash_entry>( $hash, $key, $entry ) |
186 | |
187 | Very simple utility for adding $entry to the list at $hash->{$key}. |
188 | |
189 | =cut |
190 | |
910a0a6d |
# Append $entry to the list stored at $hash->{$key}, creating the list
# if the key is new.  Autovivification makes the exists/else branching
# of the original unnecessary.  Returns nothing meaningful (the original
# implicitly returned a different value per branch; callers should not
# rely on the return value).
sub add_hash_entry {
    my( $hash, $key, $entry ) = @_;
    push( @{ $hash->{$key} }, $entry );
    return;
}
199 | |
25331c49 |
200 | 1; |
201 | |
202 | =head1 BUGS / TODO |
203 | |
204 | =over |
205 | |
206 | =item * Get rid of abomination that is cmp_str. |
207 | |
208 | =back |
209 | |
210 | =head1 LICENSE |
211 | |
212 | This package is free software and is provided "as is" without express |
213 | or implied warranty. You can redistribute it and/or modify it under |
214 | the same terms as Perl itself. |
215 | |
216 | =head1 AUTHOR |
217 | |
218 | Tara L Andrews E<lt>aurum@cpan.orgE<gt> |