[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / Util.pm

package Text::Tradition::Parser::Util;

use strict;
use warnings;
use Algorithm::Diff;
use Exporter 'import';
use vars qw/ @EXPORT_OK /;
@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;

=item B<collate_variants>

collate_variants( $collation, @reading_ranges )

Given a set of readings in the form 
( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
walks through each to identify those readings that are identical.  The
collation is a Text::Tradition::Collation object; the elements of
@readings are Text::Tradition::Collation::Reading objects that appear
on the collation graph.

TODO: Handle collapsed and non-collapsed transpositions.

=cut

sub collate_variants {
    my( $collation, @reading_sets ) = @_;

    # Two different ways to do this, depending on whether we want
    # transposed reading nodes to be merged into one (producing a
    # nonlinear, bidirectional graph) or not (producing a relatively
    # linear, unidirectional graph.)
    return $collation->linear ? collate_linearly( @_ )
        : collate_nonlinearly( @_ );
}

sub collate_linearly {
    my( $collation, $lemma_set, @variant_sets ) = @_;

    my @unique;
    my $substitutions = {};
    push( @unique, @$lemma_set );
    while( @variant_sets ) {
        my $variant_set = shift @variant_sets;
        # Use diff to do this job
        my $diff = Algorithm::Diff->new( \@unique, $variant_set, 
                                         {'keyGen' => \&_collation_hash} );
        my @new_unique;
        my %merged;
        while( $diff->Next ) {
            if( $diff->Same ) {
                # merge the nodes
                my @l = $diff->Items( 1 );
                my @v = $diff->Items( 2 );
                foreach my $i ( 0 .. $#l ) {
                    if( !$merged{$l[$i]->name} ) {
                        print STDERR sprintf( "Merging %s into %s\n", 
                                             $v[$i]->name,
                                             $l[$i]->name );
                        $collation->merge_readings( $l[$i], $v[$i] );
                        $merged{$l[$i]->name} = 1;
                        $substitutions->{$v[$i]->name} = $l[$i];
                    } else {
                        print STDERR "Would have double merged " . $l[$i]->name . "\n";
                    }
                }
                # splice the lemma nodes into the variant set
                my( $offset ) = $diff->Get( 'min2' );
                splice( @$variant_set, $offset, scalar( @l ), @l );
                push( @new_unique, @l );
            } else {
                # Keep the old unique readings
                push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
                # Add the new readings to the 'unique' list
                push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
            }
        }
        @unique = @new_unique;
    }
    return $substitutions;
}

sub collate_nonlinearly {
    my( $collation, $lemma_set, @variant_sets ) = @_;
    
    my @unique;
    my $substitutions = {};
    push( @unique, @$lemma_set );
    while( @variant_sets ) {
        my $variant_set = shift @variant_sets;
        # Simply match the first reading that carries the same word, so
        # long as that reading has not yet been used to match another
        # word in this variant. That way lies loopy madness.
        my @distinct;
        my %merged;
        foreach my $idx ( 0 .. $#{$variant_set} ) {
            my $vw = $variant_set->[$idx];
            my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
            my $matched;
            if( @same ) {
                foreach my $i ( 0 .. $#same ) {
                    unless( $merged{$same[$i]->name} ) {
                        #print STDERR sprintf( "Merging %s into %s\n", 
                        #                     $vw->name,
                        #                     $same[$i]->name );
                        $collation->merge_readings( $same[$i], $vw );
                        $merged{$same[$i]->name} = 1;
                        $matched = $i;
                        $variant_set->[$idx] = $same[$i];
                        $substitutions->{$vw->name} = $same[$i];
                    }
                }
            }
            unless( @same && defined($matched) ) {
                push( @distinct, $vw );
            }
        }
        push( @unique, @distinct );
    }
    return $substitutions;
}

sub _collation_hash {
    my $node = shift;
    return cmp_str( $node );
}

=item B<cmp_str>

Pretend you never saw this method.  Really it needs to not be hardcoded.

=cut

sub cmp_str {
    my( $reading ) = @_;
    my $word = $reading->label();
    $word = lc( $word );
    $word =~ s/\W//g;
    $word =~ s/v/u/g;
    $word =~ s/j/i/g;
    $word =~ s/cha/ca/g;
    $word =~ s/quatuor/quattuor/g;
    $word =~ s/ioannes/iohannes/g;
    return $word;
}

=item B<collate_variants>

my @rep = check_for_repeated( @readings )

Given an array of items, returns any items that appear in the array more
than once.

=cut

sub check_for_repeated {
    my @seq = @_;
    my %unique;
    my @repeated;
    foreach ( @seq ) {
        if( exists $unique{$_->name} ) {
            push( @repeated, $_->name );
        } else {
            $unique{$_->name} = 1;
        }
    }
    return @repeated;
}

sub add_hash_entry {
    my( $hash, $key, $entry ) = @_;
    if( exists $hash->{$key} ) {
        push( @{$hash->{$key}}, $entry );
    } else {
        $hash->{$key} = [ $entry ];
    }
}

sub is_monotonic {
    my( @readings ) = @_;
    my( $common, $min, $max ) = ( -1, -1, -1 );
    foreach my $rdg ( @readings ) {
#         print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - " 
#         . $rdg->position->reference ."\n";
        return 0 if $rdg->position->common < $common;
        if( $rdg->position->common == $common ) {
            return 0 if $rdg->position->min <= $min;
            return 0 if $rdg->position->max <= $max;
        }
        $common = $rdg->position->common;
        $min = $rdg->position->min;
        $max = $rdg->position->max;
    }
    return 1;
}
Commit	Line	Data
910a0a6d	1	package Text::Tradition::Parser::Util;
	2
	3	use strict;
	4	use warnings;
	5	use Algorithm::Diff;
	6	use Exporter 'import';
	7	use vars qw/ @EXPORT_OK /;
	8	@EXPORT_OK = qw/ add_hash_entry check_for_repeated cmp_str collate_variants is_monotonic /;
	9
	10	=item B<collate_variants>
	11
	12	collate_variants( $collation, @reading_ranges )
	13
	14	Given a set of readings in the form
	15	( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
	16	walks through each to identify those readings that are identical. The
	17	collation is a Text::Tradition::Collation object; the elements of
	18	@readings are Text::Tradition::Collation::Reading objects that appear
	19	on the collation graph.
	20
	21	TODO: Handle collapsed and non-collapsed transpositions.
	22
	23	=cut
	24
	25	sub collate_variants {
	26	my( $collation, @reading_sets ) = @_;
	27
	28	# Two different ways to do this, depending on whether we want
	29	# transposed reading nodes to be merged into one (producing a
	30	# nonlinear, bidirectional graph) or not (producing a relatively
	31	# linear, unidirectional graph.)
	32	return $collation->linear ? collate_linearly( @_ )
	33	: collate_nonlinearly( @_ );
	34	}
	35
	36	sub collate_linearly {
	37	my( $collation, $lemma_set, @variant_sets ) = @_;
	38
	39	my @unique;
	40	my $substitutions = {};
	41	push( @unique, @$lemma_set );
	42	while( @variant_sets ) {
	43	my $variant_set = shift @variant_sets;
	44	# Use diff to do this job
	45	my $diff = Algorithm::Diff->new( \@unique, $variant_set,
	46	{'keyGen' => \&_collation_hash} );
	47	my @new_unique;
	48	my %merged;
	49	while( $diff->Next ) {
	50	if( $diff->Same ) {
	51	# merge the nodes
	52	my @l = $diff->Items( 1 );
	53	my @v = $diff->Items( 2 );
	54	foreach my $i ( 0 .. $#l ) {
	55	if( !$merged{$l[$i]->name} ) {
	56	print STDERR sprintf( "Merging %s into %s\n",
	57	$v[$i]->name,
	58	$l[$i]->name );
	59	$collation->merge_readings( $l[$i], $v[$i] );
	60	$merged{$l[$i]->name} = 1;
	61	$substitutions->{$v[$i]->name} = $l[$i];
	62	} else {
	63	print STDERR "Would have double merged " . $l[$i]->name . "\n";
	64	}
65	}
66	# splice the lemma nodes into the variant set
67	my( $offset ) = $diff->Get( 'min2' );
68	splice( @$variant_set, $offset, scalar( @l ), @l );
69	push( @new_unique, @l );
70	} else {
71	# Keep the old unique readings
72	push( @new_unique, $diff->Items( 1 ) ) if $diff->Items( 1 );
73	# Add the new readings to the 'unique' list
74	push( @new_unique, $diff->Items( 2 ) ) if $diff->Items( 2 );
75	}
76	}
77	@unique = @new_unique;
78	}
79	return $substitutions;
80	}
81
82	sub collate_nonlinearly {
83	my( $collation, $lemma_set, @variant_sets ) = @_;
84
85	my @unique;
86	my $substitutions = {};
87	push( @unique, @$lemma_set );
88	while( @variant_sets ) {
89	my $variant_set = shift @variant_sets;
90	# Simply match the first reading that carries the same word, so
91	# long as that reading has not yet been used to match another
92	# word in this variant. That way lies loopy madness.
93	my @distinct;
94	my %merged;
95	foreach my $idx ( 0 .. $#{$variant_set} ) {
96	my $vw = $variant_set->[$idx];
97	my @same = grep { cmp_str( $_ ) eq $vw->label } @unique;
98	my $matched;
99	if( @same ) {
100	foreach my $i ( 0 .. $#same ) {
101	unless( $merged{$same[$i]->name} ) {
102	#print STDERR sprintf( "Merging %s into %s\n",
103	# $vw->name,
104	# $same[$i]->name );
105	$collation->merge_readings( $same[$i], $vw );
106	$merged{$same[$i]->name} = 1;
107	$matched = $i;
108	$variant_set->[$idx] = $same[$i];
109	$substitutions->{$vw->name} = $same[$i];
110	}
111	}
112	}
113	unless( @same && defined($matched) ) {
114	push( @distinct, $vw );
115	}
116	}
117	push( @unique, @distinct );
118	}
119	return $substitutions;
120	}
121
122	sub _collation_hash {
123	my $node = shift;
124	return cmp_str( $node );
125	}
126
127	=item B<cmp_str>
128
129	Pretend you never saw this method. Really it needs to not be hardcoded.
130
131	=cut
132
133	sub cmp_str {
134	my( $reading ) = @_;
135	my $word = $reading->label();
136	$word = lc( $word );
137	$word =~ s/\W//g;
138	$word =~ s/v/u/g;
139	$word =~ s/j/i/g;
140	$word =~ s/cha/ca/g;
141	$word =~ s/quatuor/quattuor/g;
142	$word =~ s/ioannes/iohannes/g;
143	return $word;
144	}
145
146	=item B<collate_variants>
147
148	my @rep = check_for_repeated( @readings )
149
150	Given an array of items, returns any items that appear in the array more
151	than once.
152
153	=cut
154
155	sub check_for_repeated {
156	my @seq = @_;
157	my %unique;
158	my @repeated;
159	foreach ( @seq ) {
160	if( exists $unique{$_->name} ) {
161	push( @repeated, $_->name );
162	} else {
163	$unique{$_->name} = 1;
164	}
165	}
166	return @repeated;
167	}
168
169	sub add_hash_entry {
170	my( $hash, $key, $entry ) = @_;
171	if( exists $hash->{$key} ) {
172	push( @{$hash->{$key}}, $entry );
173	} else {
174	$hash->{$key} = [ $entry ];
175	}
176	}
177
178	sub is_monotonic {
179	my( @readings ) = @_;
180	my( $common, $min, $max ) = ( -1, -1, -1 );
181	foreach my $rdg ( @readings ) {
182	# print STDERR "Checking reading " . $rdg->name . "/" . $rdg->text . " - "
183	# . $rdg->position->reference ."\n";
184	return 0 if $rdg->position->common < $common;
185	if( $rdg->position->common == $common ) {
186	return 0 if $rdg->position->min <= $min;
187	return 0 if $rdg->position->max <= $max;
188	}
189	$common = $rdg->position->common;
190	$min = $rdg->position->min;
191	$max = $rdg->position->max;
192	}
193	return 1;
194	}