[scpubgit/stemmatology.git] / lib / Text / Tradition / Parser / CollateText.pm

package Text::Tradition::Parser::CollateText;

use strict;
use warnings;

=head1 NAME

Text::Tradition::Parser::CollateText

=head1 DESCRIPTION

For an overview of the package, see the documentation for the
Text::Tradition module.

This module is meant for use with a set of text files saved from Word docs, 
which originated with the COLLATE collation program.  

=head1 SUBROUTINES

=over

=item B<parse>

parse( $tradition, $opts );

Takes an initialized tradition and a hashref of options, which must include:
- 'base' - the file holding the base text referenced by the variants
- 'sigla' - an arrayref of the witness sigla used in the apparatus
- 'input' - the file holding the apparatus entries.

=cut

my %ALL_SIGLA;

# Entry point: build the collation of $tradition from a base text file and
# an apparatus file.
#
#   $tradition - object providing ->collation and ->add_witness
#   $opts      - hashref with the keys 'base' (base text file), 'sigla'
#                (arrayref of witness sigla) and 'input' (apparatus file)
#
# Returns whatever merge_stone_apparatus returns.
sub parse {
    my( $tradition, $opts ) = @_;
    # %ALL_SIGLA is file-level state.  Clear what an earlier call left
    # behind, so that sigla of a previous tradition are not looked up as
    # witnesses of this one.
    %ALL_SIGLA = ();

    # String together the base text.
    my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
    # Note the sigla.
    foreach my $sigil ( @{ $opts->{'sigla'} || [] } ) {
        $ALL_SIGLA{$sigil} = 1;
        $tradition->add_witness( 'sigil' => $sigil );
    }
    # Now merge on the apparatus entries.
    merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'input'} );
}

=item B<read_stone_base>

my $lineref_hash = read_stone_base( 'reference.txt', $collation );

Takes a text file and a (presumed empty) collation object, adds the words
as simple linear readings to the collation, and returns a hash of texts
with line keys. This collation is now the starting point for application of
apparatus entries in merge_stone_apparatus, i.e. from an apparatus file as
output by Collate.

The hash is of the form 

 { chapter_name => { line_ref => { start => node, end => node } } }

=cut

# Read the base text into the collation as one linear chain of readings.
#
#   $base_file - path of the base text file (read as UTF-8)
#   $collation - collation object; must provide start, end, add_reading,
#                add_path and baselabel
#
# Returns a hashref of the form
#   { section_header => { line_ref => { start => reading, end => reading } } }
# where a section header is any line that begins with TESTAMENT.  Lines that
# do not begin with a digit are filed under the line reference 'Title:'.
# Dies if the file cannot be opened.
sub read_stone_base {
    my( $base_file, $collation ) = @_;

    # The most recently added reading; every new word is chained on after
    # it, beginning from the common starting point of the collation.
    my $last_reading = $collation->start();
    my $lineref_hash = {};
    my $curr_text;

    # Decode the text on the way in: the \x{589} substitution below can
    # only match characters, not raw bytes.
    open( my $base_fh, '<:encoding(UTF-8)', $base_file )
        or die "Could not open file $base_file: $!";
    while( <$base_fh> ) {
        # Make the readings, and connect them up for the base, but
        # also save the first reading of each line in a hash for the
        # purpose.
        chomp;
        # Skip blank lines.  \s* rather than \s+, so that a completely
        # empty line is skipped too instead of being taken for a title
        # line and overwriting the recorded end of the title.
        next if /^\s*$/;
        # Turn Armenian full stops into colons, for line references of
        # any number of digits.
        s/^(\d+)\x{589}/$1:/;
        if( /^TESTAMENT/ ) {
            # Initialize the base hash for this section.
            $curr_text = $lineref_hash->{$_} = {};
            next;
        }
        my @words = split;
        # A leading number is the line reference; anything else is assumed
        # to be the title.
        my $lineref = /^\d/ ? shift @words : 'Title:';

        # Now turn the remaining words into readings.
        my $wordref = 0;
        foreach my $w ( @words ) {
            my $readingref = join( ',', $lineref, ++$wordref );
            my $reading = $collation->add_reading( $readingref );
            $reading->text( $w );
            unless( exists $curr_text->{$lineref}->{'start'} ) {
                $curr_text->{$lineref}->{'start'} = $reading;
            }
            # Add edge paths in the graph, for easier tracking when
            # we start applying corrections.  These paths will be
            # removed when we're done.
            $collation->add_path( $last_reading, $reading,
                                  $collation->baselabel );
            $last_reading = $reading;
        }
        $curr_text->{$lineref}->{'end'} = $last_reading;
    }
    close $base_fh;

    # Ending point for all texts
    $collation->add_path( $last_reading, $collation->end, $collation->baselabel );
    return( $lineref_hash );
}

=item B<merge_stone_apparatus>

Read an apparatus as output (presumably) by Collate.  It should be reasonably
regular in form, I hope.  Merge the apparatus variants onto the appropriate 
lemma readings.

=cut

# Merge the apparatus entries in $file onto the base text in the collation.
#
#   $c            - the collation object
#   $lineref_hash - the structure returned by read_stone_base
#   $file         - path of the apparatus file (read as UTF-8)
#
# Each apparatus line is "<lineref> <app> | <app> ...", each app is
# "<lemma>] <reading>  <reading> ..." with readings separated by two spaces.
# Lines and entries that cannot be placed are warned about and skipped.
# Dies if the file cannot be opened.
sub merge_stone_apparatus {
    my( $c, $lineref_hash, $file ) = @_;

    my $current_text;
    # Decode on the way in, so that the text can be matched as characters.
    open( my $app_fh, '<:encoding(UTF-8)', $file )
        or die "Could not read apparatus file $file: $!";
    while( <$app_fh> ) {
        chomp;
        next if /^\s*$/;   # also skips completely empty lines
        if( /^TESTAMENT/ ) {
            $current_text = $lineref_hash->{$_};
            next;
        }

        # Otherwise, the first word of the line is the base text line reference.
        my $lineref;
        if( s/^(\S+)// ) {
            $lineref = $1;
        } else {
            warn "Unrecognized line $_";
            next;
        }
        # The start and end readings for this line are in $baseline->{start}
        # and $baseline->{end}.
        my $baseline = $current_text ? $current_text->{$lineref} : undef;
        unless( $baseline ) {
            warn "No base text line for reference $lineref";
            next;
        }

        # Now look at the apparatus entries for this line.  They are
        # separated by a literal '|', which has to be escaped: unescaped it
        # is an empty alternation and splits the line into single characters.
        foreach my $app ( split( /\|/ ) ) {
            next unless $app =~ /\S/;
            my( $lemma, $rest ) = split( /\]/, $app );

            # Find the lemma reading.  _find_reading_on_line warns by
            # itself when there is no match.
            my( $lemma_start, $lemma_end ) =
                _find_reading_on_line( $c, $lemma, $baseline );
            next unless $lemma_start && $lemma_end;
            my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );

            # Splice in "start" and "end" placeholders on either
            # side of the lemma.
            my( $rdg_start, $rdg_end ) =
                _add_reading_placeholders( $c, $lemma_start, $lemma_end );

            # For each reading, attach it to the lemma.
            my @indiv = defined $rest ? split( '  ', $rest ) : ();
            foreach my $rdg ( @indiv ) {
                # Parse the string.
                my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
                $words ||= [];
                $recurse ||= {};
                my @readings;
                foreach my $i ( 0 .. $#$words ) {
                    # A leading __MARKER__ is an instruction, not a word.
                    next if $i == 0 && $words->[$i] =~ /^__/;
                    my $reading_id = $rdg_start->text . '_' . $rdg_end->text . '/' . $i;
                    my $reading = $c->add_reading( $reading_id );
                    $reading->text( $words->[$i] );
                    push( @readings, $reading );
                }

                # Deal with any specials.  The marker, if any, is the first
                # element of the parsed word list.
                my $marker = @$words ? $words->[0] : '';
                my $lemma_sequence;
                if( $marker eq '__LEMMA__' ) {
                    $lemma_sequence = [ $lemma_end, $rdg_end ];
                } elsif( $marker eq '__TRANSPOSE__' ) {
                    # Hope it is only two or three words in the lemma.
                    # TODO figure out how we really want to handle this
                    @readings = reverse @lemma_chain;
                }
                $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
                    unless $lemma_sequence;

                # Now hook up the paths.
                unshift( @readings, $rdg_start );
                push( @readings, $rdg_end );
                # Stop one short of the end, so that $readings[$i+1] exists.
                foreach my $i ( 1 .. $#readings - 1 ) {
                    next unless $recurse->{$i};
                    my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
                    $rwords ||= [];
                    # Get the local "lemma" sequence
                    my $llseq = [ $readings[$i], $readings[$i+1] ];
                    if( !@$rwords || $rwords->[0] ne '__LEMMA__' ) {
                        # Treat it as an addition to the last word
                        unshift( @$llseq, $readings[$i-1] );
                    }
                    # Create the reading nodes in $rwords
                    # TODO Hope we don't meet ~ in a recursion
                    my @local;
                    foreach my $j ( 0 .. $#$rwords ) {
                        next if $j == 0 && $rwords->[$j] =~ /^__/;
                        my $reading_id = $llseq->[0]->text . '_' .
                            $llseq->[-1]->text . '/' . $j;
                        my $reading = $c->add_reading( $reading_id );
                        # The text comes from the recursive entry, not
                        # from the enclosing reading.
                        $reading->text( $rwords->[$j] );
                        push( @local, $reading );
                    }
                    # Add the path(s) necessary.  The variant has to run
                    # between the same endpoints as its local lemma.
                    my $local_rdg = [ $llseq->[0], @local, $llseq->[-1] ];
                    _add_sigil_path( $c, $rsig, $llseq, $local_rdg );
                }
                # _add_sigil_path takes the base (lemma) sequence first and
                # the variant reading sequence second.
                _add_sigil_path( $c, $sigla, $lemma_sequence, \@readings );
            } # end processing of $app
        } # end foreach my $app in line
    } # end while <line>
    close $app_fh;

    # Now reconcile all the paths in the collation, and delete our
    # temporary anchor nodes.
    expand_all_paths( $c );

    # Finally, calculate the ranks we've got.
    $c->calculate_ranks;
}

# Locate a lemma among the base text readings of one line.
#
#   $c        - the collation; used to step from one reading to the next
#   $lemma    - the lemma text: words separated by whitespace, where '---'
#               stands for a run of skipped words and a word may carry an
#               occurrence marker (see _get_seq)
#   $baseline - hashref holding the 'start' and 'end' readings of the line
#
# Returns ( $first_reading, $last_reading ) of the match.  If there is no
# match, warns and returns nothing.
sub _find_reading_on_line {
    my( $c, $lemma, $baseline ) = @_;

    # split ' ', unlike split /\s+/, does not produce an empty first word
    # when the lemma begins with whitespace.
    my @lemma_words = split( ' ', defined $lemma ? $lemma : '' );
    my $candidate = $baseline->{'start'};
    unless( @lemma_words && $candidate && $baseline->{'end'} ) {
        warn "No match found for @lemma_words";
        return;
    }
    # Stepping between readings is done through the collation, as
    # everywhere else in this file.
    my $too_far = $c->next_reading( $baseline->{'end'} );

    # Work out once which occurrence of the first word is wanted, so that
    # the countdown survives from one candidate to the next.
    my( $first_word, $seq ) = _get_seq( $lemma_words[0] );
    my @tail = @lemma_words[ 1 .. $#lemma_words ];

    my %seen;
    while( defined $candidate
           && !( defined $too_far && $candidate eq $too_far ) ) {
        # Loop detection
        if( $seen{ $candidate->name() } ) {
            warn "Detected loop at " . $candidate->name . " for lemma $lemma";
            last;
        }
        $seen{ $candidate->name() } = 1;

        # Try to match the lemma.
        # TODO move next/prior reading methods into the reading classes,
        # to make this more self-contained and not need to pass $c.
        if( $candidate->text eq $first_word && --$seq < 1 ) {
            my $lemma_end = _match_lemma_tail( $c, $candidate, \@tail, $too_far );
            return( $candidate, $lemma_end ) if $lemma_end;
        }
        $candidate = $c->next_reading( $candidate );
    }

    warn "No match found for @lemma_words";
    return;
}

# Match the lemma words after the first against the readings that follow
# $first.  $too_far is the first reading beyond the line; a '---' gap is
# never extended up to or past it.  Returns the last matched reading
# ($first itself if there are no further words), or nothing on failure.
sub _match_lemma_tail {
    my( $c, $first, $words, $too_far ) = @_;

    my $lemma_end = $first;
    my $cursor = $c->next_reading( $first );
    my $wildcard = 0;
    foreach my $w ( @$words ) {
        if( $w eq '---' ) {
            # We match everything to the next word.
            $wildcard = 1;
            next;
        }
        my( $lw ) = _get_seq( $w );
        if( $wildcard ) {
            # Skip ahead, staying on this line, to the word closing the gap.
            while( defined $cursor
                   && !( defined $too_far && $cursor eq $too_far )
                   && $cursor->text ne $lw ) {
                $cursor = $c->next_reading( $cursor );
            }
            return if !defined $cursor
                || ( defined $too_far && $cursor eq $too_far );
            $wildcard = 0;
        }
        return unless defined $cursor && $cursor->text eq $lw;
        $lemma_end = $cursor;
        $cursor = $c->next_reading( $cursor );
    }
    # A trailing '---' has no closing word to anchor it.
    return if $wildcard;
    return $lemma_end;
}

# Splice a 'begin' and an 'end' anchor reading in on either side of a
# lemma, as a double-endpoint attachment for variant readings.
#
#   $collation   - the collation object
#   $lemma_start - first reading of the lemma
#   $lemma_end   - last reading of the lemma
#
# An anchor that is already attached to the lemma is reused.  Returns
# ( $start_anchor, $end_anchor ).
sub _add_reading_placeholders {
    my( $collation, $lemma_start, $lemma_end ) = @_;

    my $attachlabel = "ATTACH";
    my( $start_node, $end_node );
    my @start_id = grep { $_->label eq $attachlabel } $lemma_start->incoming;
    if( @start_id ) {
        # There already exists an app-begin node. Use that.
        $start_node = $start_id[0]->from;
    } else {
        # Name the anchor after the reading it is attached to.  The name
        # has to contain ATTACH, because expand_all_paths picks out the
        # anchors by that pattern when it removes them again.
        $start_node = $collation->add_reading(
            '#' . $attachlabel . '_' . $lemma_start->name . '_START#' );
        $collation->add_path(
            $collation->prior_reading( $lemma_start, $collation->baselabel ),
            $start_node, $attachlabel );
        $collation->add_path( $start_node, $lemma_start, $attachlabel );
    }
    # Now the converse for the end.
    my @end_id = grep { $_->label eq $attachlabel } $lemma_end->outgoing;
    if( @end_id ) {
        # There already exists an app-end node. Use that.
        $end_node = $end_id[0]->to;
    } else {
        $end_node = $collation->add_reading(
            '#' . $attachlabel . '_' . $lemma_end->name . '_END#' );
        $collation->add_path( $lemma_end, $end_node, $attachlabel );
        $collation->add_path( $end_node,
            $collation->next_reading( $lemma_end, $collation->baselabel ),
            $attachlabel );
    }
    return( $start_node, $end_node );
}

# Function to parse an apparatus reading string, with reference to no other
# data.  Need to do this separately as readings can include readings (ugh).
# Try to give whatever information we might need, including recursive app
# entries that might need to be parsed.
#
# Takes the reading string.  Returns three references:
#   - arrayref of the reading words, preceded by '__LEMMA__' for an
#     addition ('+') and/or '__TRANSPOSE__' for a transposition ('~');
#     empty for an omission ('om')
#   - hashref sigil => 1, '_PC_' or '_AC_'
#   - hashref N => string, for a parenthesised entry that followed the
#     Nth reading word; the string still has to be parsed in its own right
# As a side effect, a sigil seen with a correction marker is set to 2 in
# %ALL_SIGLA.

sub parse_app_entry {
    my( $rdg, ) = @_;
    $rdg = '' unless defined $rdg;
    $rdg =~ s/^\s+//;
    $rdg =~ s/\s+$//;
    # Nothing to parse; hand back empty structures that are safe to
    # dereference.
    return( [], {}, {} ) unless $rdg;  # just in case
    my @words = split( /\s+/, $rdg );
    # Zero or more sigils e.g. +, followed by Armenian, 
    # followed by (possibly modified) sigla, followed by 
    # optional : with note.
    my $is_add;
    my $is_omission;
    my $is_transposition;
    my @reading;
    my $reading_sigla = {};
    my $recursed = {};
    # Escape each sigil on its own, so that the '|' between them stays an
    # alternation, and try longer sigla first.  With no sigla at all there
    # is no pattern, rather than one that matches everything.
    my $sig_alternation = join( '|', map { quotemeta( $_ ) }
        sort { length( $b ) <=> length( $a ) || $a cmp $b } keys %ALL_SIGLA );
    my $sig_regex = length( $sig_alternation )
        ? qr/^($sig_alternation)(.*)$/ : undef;
    while( @words ) {
        my $bit = shift @words;
        if( $bit eq '+' ) {
            $is_add = 1;
        } elsif( $bit eq 'om' ) {
            $is_omission = 1;
        } elsif( $bit eq '~' ) {
            $is_transposition = 1;
        } elsif( $bit =~ /^\(/ ) { 
            # It's a recursive reading within a reading. Lemmatize what we
            # have so far and grab the extra.  This is tested before the
            # Armenian branch, as the opening bit may itself hold a word.
            my @new = ( $bit );
            while( $new[-1] !~ /\)$/ ) {
                unless( @words ) {
                    warn "Unclosed parenthesis in $rdg";
                    last;
                }
                push( @new, shift @words );
            }
            my $recursed_reading = join( ' ', @new );
            # Keep the content; drop only the enclosing parentheses.
            $recursed_reading =~ s/^\(//;
            $recursed_reading =~ s/\)$//;
            # This recursive entry refers to the last reading word(s) we
            # saw.  Push its index+1.  We will have to come back to parse
            # it when we are dealing with the main reading.
            # TODO handle () as first element
            # TODO handle - as suffix to add, i.e. make new word
            $recursed->{ scalar @reading } = $recursed_reading;
        } elsif( $bit =~ /\p{Armenian}/ ) {
            warn "Found text in omission?!" if $is_omission;
            push( @reading, $bit );
        } elsif( $bit eq ':' ) {
            # Stop processing.
            last;
        } elsif( $sig_regex && $bit =~ $sig_regex ) {
            # It must be a sigil.
            my( $sigil, $mod ) = ( $1, $2 );
            if( $mod eq "\x{80}" ) {
                $reading_sigla->{$sigil} = '_PC_';
                $ALL_SIGLA{$sigil} = 2;  # a pre- and post-corr version exists
            } elsif( $mod eq '*' ) {
                $reading_sigla->{$sigil} = '_AC_';
                $ALL_SIGLA{$sigil} = 2;  # a pre- and post-corr version exists
            } else {
                $reading_sigla->{$sigil} = 1 unless $mod; # skip secondhand corrections
            }
        } elsif( $bit =~ /transpos/ ) {
            # There are some transpositions not coded rigorously; skip them.
            warn "Found hard transposition in $rdg; fix manually";
            last;
        } else {
            warn "Not sure what to do with bit $bit in $rdg";
        }
    }

    # Transmogrify the reading if necessary.
    unshift( @reading, '__LEMMA__' ) if $is_add;
    unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
    @reading = () if $is_omission;
   
    return( \@reading, $reading_sigla, $recursed );  
}

# Add a path for the specified sigla to connect the reading sequence.
# Add an a.c. path to the base sequence if we have an explicitly p.c.
# reading.
# Also handle the paths for sigla we have already added in recursive
# apparatus readings (i.e. don't add a path if one already exists.)
#
#   $c                - the collation object
#   $sigla            - hashref sigil => 1, '_PC_' or '_AC_'
#   $base_sequence    - arrayref of the base (lemma) readings, in order
#   $reading_sequence - arrayref of the variant readings, in order

sub _add_sigil_path {
    my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
    my %skip;
    foreach my $sig ( keys %$sigla ) {
        # An a.c. reading is recorded under the sigil plus the a.c. label.
        my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig;
        foreach my $i ( 0 .. $#{$reading_sequence} - 1 ) {
            if( $skip{$use_sig} ) {
                # Keep skipping until we are back on a reading that this
                # sigil already arrives at.
                next if !_has_prior_reading( $reading_sequence->[$i], $use_sig );
                $skip{$use_sig} = 0;
            }
            if( _has_next_reading( $reading_sequence->[$i], $use_sig ) ) {
                # This sigil already leaves this reading; don't add a
                # second way out.
                $skip{$use_sig} = 1;
                next;
            }
            $c->add_path( $reading_sequence->[$i], $reading_sequence->[$i+1], $use_sig );
        }
        if( $sigla->{$sig} eq '_PC_' ) {
            # The variant is the corrected text, so the uncorrected text
            # follows the base sequence.
            $use_sig = $sig.$c->ac_label;
            foreach my $i ( 0 .. $#{$base_sequence} - 1 ) {
                if( $skip{$use_sig} ) {
                    next if !_has_prior_reading( $base_sequence->[$i], $use_sig );
                    $skip{$use_sig} = 0;
                }
                if( _has_next_reading( $base_sequence->[$i], $use_sig ) ) {
                    $skip{$use_sig} = 1;
                    next;
                }
                $c->add_path( $base_sequence->[$i], $base_sequence->[$i+1], $use_sig );
            }
        }
    }
}

# Remove all ATTACH* nodes, linking the readings on either side of them.
# Then walk the collation for all witness paths, and make sure those paths
# explicitly exist.  Then delete all the 'base' paths.
#
# Takes the collation object.  Uses the sigla recorded in %ALL_SIGLA; a
# sigil with a value above 1 also gets an uncorrected path.

sub expand_all_paths { 
    my( $c ) = @_;
    
    # Delete the anchors
    my $aclabel = $c->ac_label;
    foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
        # Map each path to its incoming/outgoing node.  This has to be
        # done before the anchor is deleted.
        my %incoming;
        $incoming{ $_->label } = $_->from foreach $anchor->incoming();
        my %outgoing;
        $outgoing{ $_->label } = $_->to foreach $anchor->outgoing();
        my $anchor_name = $anchor->name;
        $c->del_reading( $anchor );
        
        # Connect in and out.
        # NOTE(review): the anchors made by _add_reading_placeholders are
        # linked in with the label 'ATTACH', not the base label, so the
        # base label fallback below may never find anything - confirm which
        # label is meant.
        foreach my $edge ( keys %incoming ) {
            my $from = $incoming{$edge};
            my $to = $outgoing{$edge};
            if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
                # An a.c. path carries on along the plain path of its sigil.
                $to = $outgoing{$1};
            }
            $to = $outgoing{$c->baselabel} unless $to;
            unless( $to ) {
                warn "Have no outbound base link on " . $anchor_name . "!";
                next;   # nothing to connect to
            }
            $c->add_path( $from, $to, $edge );
        }
        # TODO Think about deleting outgoing/edge as we use them to make this faster.
        foreach my $edge ( keys %outgoing ) {
            my $to = $outgoing{$edge};
            my $from = $incoming{$edge};
            if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
                $from = $incoming{$1};
            }
            $from = $incoming{$c->baselabel} unless $from;
            unless( $from ) {
                warn "Have no inbound base link on " . $anchor_name . "!";
                next;   # nothing to connect from
            }
            # The first loop may have made this connection already.
            $c->add_path( $from, $to, $edge )
                unless _has_prior_reading( $to, $edge );
        }
    }
    
    # Walk the collation and add paths if necessary
    foreach my $sig ( keys %ALL_SIGLA ) {
        my $wit = $c->tradition->witness( $sig );
        my @path = $c->reading_sequence( $c->start, $c->end, $sig );
        $wit->path( \@path );
        if( $ALL_SIGLA{$sig} > 1 ) {
            my @ac_path = $c->reading_sequence( $c->start, $c->end, 
                                                $sig.$c->ac_label, $sig );
            $wit->uncorrected_path( \@ac_path );
            # a.c. paths are already there by default.
        }
        foreach my $i ( 1 .. $#path ) {
            # If there is no explicit path for this sigil between n-1 and n,
            # add it.
            unless( grep { $_->label eq $sig } $path[$i]->edges_from( $path[$i-1] ) ) {
                $c->add_path( $path[$i-1], $path[$i], $sig );
            }
        }
    }
    
    # Delete all baselabel edges
    my $baselabel = $c->baselabel;
    foreach my $edge ( grep { $_->label eq $baselabel } $c->paths ) {
        $c->del_edge( $edge );
    }
    
    # Calculate ranks on graph nodes
    $c->calculate_ranks();
}

# Split a lemma word into the word proper and its occurrence number.  A
# word that ends in a digit followed by the character \x{80} yields the
# text before the digit and that digit; any other word is returned as it
# is, with the number 1.
sub _get_seq {
    my( $str ) = @_;
    return( $1, $2 ) if $str =~ /^(.*)(\d)\x{80}$/;
    return( $str, 1 );
}

# Return the outgoing edges of $rdg that are labelled $sigil (their number,
# in scalar context).
sub _has_next_reading {
    my( $rdg, $sigil ) = @_;
    my @labelled = grep { $_->label eq $sigil } $rdg->outgoing();
    return @labelled;
}
# Return the incoming edges of $rdg that are labelled $sigil (their number,
# in scalar context).
sub _has_prior_reading {
    my( $rdg, $sigil ) = @_;
    my @labelled = grep { $_->label eq $sigil } $rdg->incoming();
    return @labelled;
}
Commit	Line	Data
fa954f4c	1	package Text::Tradition::Parser::CollateText;
	2
	3	use strict;
	4	use warnings;
	5
	6	=head1 NAME
	7
	8	Text::Tradition::Parser::CollateText
	9
	10	=head1 DESCRIPTION
	11
	12	For an overview of the package, see the documentation for the
	13	Text::Tradition module.
	14
	15	This module is meant for use with a set of text files saved from Word docs,
	16	which originated with the COLLATE collation program.
	17
	18	=head1 SUBROUTINES
	19
	20	=over
	21
	22	=item B<parse>
	23
	24	parse( $graph, $opts );
	25
	26	Takes an initialized graph and a hashref of options, which must include:
	27	- 'base' - the base text referenced by the variants
	28	- 'format' - the format of the variant list
	29	- 'data' - the variants, in the given format.
	30
	31	=cut
	32
	33	my %ALL_SIGLA;
	34
	35	sub parse {
	36	my( $tradition, $opts ) = @_;
	37	# String together the base text.
	38	my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation );
	39	# Note the sigla.
	40	foreach my $sigil ( @{$opts->{'sigla'}} ) {
	41	$ALL_SIGLA{$sigil} = 1;
	42	$tradition->add_witness( 'sigil' => $sigil );
	43	}
	44	# Now merge on the apparatus entries.
	45	merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'input'} );
	46	}
	47
	48	=item B<read_stone_base>
	49
	50	my $text_list = read_base( 'reference.txt', $collation );
	51
	52	Takes a text file and a (presumed empty) collation object, adds the words
	53	as simple linear readings to the collation, and returns a hash of texts
	54	with line keys. This collation is now the starting point for application of
	55	apparatus entries in merge_base, e.g. from a CSV file or a Classical Text
	56	Editor file.
	57
	58	The hash is of the form
	59
	60	{ chapter_name => { line_ref => { start => node, end => node } } }
	61
	62	=cut
	63
	64	sub read_stone_base {
65	my( $base_file, $collation ) = @_;
66
67	# This array gives the first reading for each line. We put the
68	# common starting point in line zero.
69	my $last_reading = $collation->start();
70	my $lineref_hash = {};
71	my $last_lineref;
72
73	my $curr_text;
74	open( BASE, $base_file ) or die "Could not open file $base_file: $!";
75	my $i = 1;
76	while(<BASE>) {
77	# Make the readings, and connect them up for the base, but
78	# also save the first reading of each line in a hash for the
79	# purpose.
80	chomp;
81	next if /^\s+$/; # skip blank lines
82	s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons
83	if( /^TESTAMENT/ ) {
84	# Initialize the base hash for this section.
85	$lineref_hash->{$_} = {};
86	$curr_text = $lineref_hash->{$_};
87	next;
88	}
89	my @words = split;
90	my $lineref;
91	if( /^\d/ ) {
92	# The first "word" is a line reference; keep it.
93	$lineref = shift @words;
94	} else {
95	# Assume we are dealing with the title.
96	$lineref = 'Title:';
97	}
98
99	# Now turn the remaining words into readings.
100	my $wordref = 0;
101	foreach my $w ( @words ) {
102	my $readingref = join( ',', $lineref, ++$wordref );
103	my $reading = $collation->add_reading( $readingref );
104	$reading->text( $w );
105	unless( exists $curr_text->{$lineref}->{'start'} ) {
106	$curr_text->{$lineref}->{'start'} = $reading;
107	}
108	# Add edge paths in the graph, for easier tracking when
109	# we start applying corrections. These paths will be
110	# removed when we're done.
111	my $path = $collation->add_path( $last_reading, $reading,
112	$collation->baselabel );
113	$last_reading = $reading;
114	}
115	$curr_text->{$lineref}->{'end'} = $last_reading;
116	}
117
118	close BASE;
119	# Ending point for all texts
120	$collation->add_path( $last_reading, $collation->end, $collation->baselabel );
121	return( $lineref_hash );
122	}
123
124	=item B<merge_stone_apparatus>
125
126	Read an apparatus as output (presumably) by Collate. It should be reasonably
127	regular in form, I hope. Merge the apparatus variants onto the appropriate
128	lemma readings.
129
130	=cut
131
132	sub merge_stone_apparatus {
133	my( $c, $lineref_hash, $file ) = @_;
134
135	my $text_apps = {};
136	my $current_text;
137	open( APP, $file ) or die "Could not read apparatus file $file";
138	while( <APP> ) {
139	chomp;
140	next if /^\s+$/;
141	if( /^TESTAMENT/ ) {
142	$current_text = $lineref_hash->{$_};
143	next;
144	}
145
146	# Otherwise, the first word of the line is the base text line reference.
147	my $i = 0;
148	my $lineref;
149	if( s/^(\S+)// ) {
150	$lineref = $1;
151	} else {
152	warn "Unrecognized line $_";
153	}
154	my $baseline = $current_text->{$lineref};
155	# The start and end readings for this line are now in $baseline->{start}
156	# and $baseline->{end}.
157
158	# Now look at the apparatus entries for this line. They are
159	# split with \|.
160	my @apps = split( '\|' );
161	foreach my $app ( @apps ) {
162	my( $lemma, $rest ) = split( ']', $app );
163
164	# Find the lemma reading.
165	my( $lemma_start, $lemma_end ) =
166	_find_reading_on_line( $c, $lemma, $baseline );
167	my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end );
168
169	# Splice in "start" and "end" placeholders on either
170	# side of the lemma.
171	my ( $rdg_start, $rdg_end ) =
172	_add_reading_placeholders( $c, $lemma_start, $lemma_end );
173
174	# For each reading, attach it to the lemma.
175	my @indiv = split( ' ', $rest );
176	foreach my $rdg ( @indiv ) {
177	# Parse the string.
178	my( $words, $sigla, $recurse ) = parse_app_entry( $rdg );
179	my @readings;
180	foreach my $i ( 0 .. $#$words ) {
181	next if $i == 0 && $words->[$i] =~ /^__/;
182	my $reading_id = $rdg_start->text . '_' . $rdg_end->text . '/' . $i;
183	my $reading = $c->add_reading( $reading_id );
184	$reading->text( $words->[$i] );
185	push( @readings, $reading );
186	}
187
188	# Deal with any specials.
189	my $lemma_sequence;
190	if( $words->[0] eq '__LEMMA__' ) {
191	$lemma_sequence = [ $lemma_end, $rdg_end ];
192	} elsif ( $rdg->[0] eq '__TRANSPOSE__' ) {
193	# Hope it is only two or three words in the lemma.
194	# TODO figure out how we really want to handle this
195	@readings = reverse @lemma_chain;
196	}
197	$lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ]
198	unless $lemma_sequence;
199
200	# Now hook up the paths.
201	unshift( @readings, $rdg_start );
202	push( @readings, $rdg_end );
203	foreach my $i ( 1 .. $#readings ) {
204	if( $recurse->{$i} ) {
205	my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} );
206	# Get the local "lemma" sequence
207	my $llseq = [ $readings[$i], $readings[$i+1] ];
208	if( $rwords->[0] ne '__LEMMA__' ) {
209	# Treat it as an addition to the last word
210	unshift( @$llseq, $readings[$i-1] );
211	}
212	# Create the reading nodes in $rwords
213	# TODO Hope we don't meet ~ in a recursion
214	my $local_rdg = [];
215	foreach my $i ( 0 .. $#$rwords ) {
216	next if $i == 0 && $rwords->[$i] =~ /^__/;
217	my $reading_id = $llseq->[0]->text . '_' .
218	$llseq->[-1]->text . '/' . $i;
219	my $reading = $c->add_reading( $reading_id );
220	$reading->text( $words->[$i] );
221	push( @$local_rdg, $reading );
222	}
223	# Add the path(s) necessary
224	_add_sigil_path( $c, $rsig, $local_rdg, $llseq );
225	}
226	}
227	_add_sigil_path( $c, $sigla, \@readings, $lemma_sequence );
228	} # end processing of $app
229	} # end foreach my $app in line
230	} # end while <line>
231
232	# Now reconcile all the paths in the collation, and delete our
233	# temporary anchor nodes.
234	expand_all_paths( $c );
235
236	# Finally, calculate the ranks we've got.
237	$c->calculate_ranks;
238	}
239
240	sub _find_reading_on_line {
241	my( $c, $lemma, $baseline ) = @_;
242
243	my $lemma_start = $baseline->{'start'};
244	my $lemma_end;
245	my $too_far = $baseline->{'end'}->next_reading;
246	my @lemma_words = split( /\s+/, $lemma );
247
248	my %seen;
249	my $scrutinize = ''; # DEBUG variable
250	my $seq = 1;
251	while( $lemma_start ne $too_far ) {
252	# Loop detection
253	if( $seen{ $lemma_start->name() } ) {
254	warn "Detected loop at " . $lemma_start->name . " for lemma $lemma";
255	last;
256	}
257	$seen{ $lemma_start->name() } = 1;
258
259	# Try to match the lemma.
260	# TODO move next/prior reading methods into the reading classes,
261	# to make this more self-contained and not need to pass $c.
262	my $unmatch = 0;
263	my ( $lw, $seq ) = _get_seq( $lemma_words[0] );
264	print STDERR "Matching $lemma_start against $lw...\n"
265	if $scrutinize;
266	if( $lemma_start->text eq $lw ) {
267	# Skip it if we need a match that is not the first.
268	if( --$seq < 1 ) {
269	# Now we have to compare the rest of the words here.
270	if( scalar( @lemma_words ) > 1 ) {
271	my $next_reading =
272	$c->next_reading( $lemma_start );
273	my $wildcard = 0;
274	foreach my $w ( @lemma_words[1..$#lemma_words] ) {
275	if( $w eq '---' ) {
276	# We match everything to the next word.
277	$wildcard = 1;
278	next;
279	} else {
280	$wildcard = 0;
281	}
282	( $lw, $seq ) = _get_seq( $w );
283	printf STDERR "Now matching %s against %s\n",
284	$next_reading->text, $lw
285	if $scrutinize;
286	if( !$wildcard && $w ne $next_reading->text) {
287	$unmatch = 1;
288	last;
289	} else {
290	$lemma_end = $next_reading;
291	$next_reading =
292	$c->next_reading( $lemma_end );
293	}
294	}
295	} else { # single-word match, easy.
296	$lemma_end = $lemma_start;
297	}
298	} else { # we need the Nth match and aren't there yet
299	$unmatch = 1;
300	}
301	}
302	last unless ( $unmatch \|\| !defined( $lemma_end ) );
303	$lemma_end = undef;
304	$lemma_start = $c->next_reading( $lemma_start );
305	}
306
307	unless( $lemma_end ) {
308	warn "No match found for @lemma_words";
309	return undef;
310	}
311	return( $lemma_start, $lemma_end );
312	}
313
314	sub _add_reading_placeholders {
315	my( $collation, $lemma_start, $lemma_end ) = @_;
316	# We will splice in a 'begin' and 'end' marker on either side of the
317	# lemma, as sort of a double-endpoint attachment in the graph.
318
319	my $attachlabel = "ATTACH";
320	my( $start_node, $end_node );
321	my @start_id = grep { $_->label eq $attachlabel } $lemma_start->incoming;
322	if( @start_id ) {
323	# There already exists an app-begin node. Use that.
324	$start_node = $start_id[0]->from;
325	} else {
326	$start_node = $collation->add_reading( $app_info->{_id} );
327	$collation->add_path(
328	$collation->prior_reading( $lemma_start, $collation->baselabel ),
329	$start_node, $attachlabel );
330	$collation->add_path( $start_node, $lemma_start, $attachlabel );
331	}
332	# Now the converse for the end.
333	my @end_id = grep { $_->label eq $attachlabel } $lemma_end->outgoing;
334	if( @end_id ) {
335	# There already exists an app-begin node. Use that.
336	$end_node = $end_id[0]->to;
337	} else {
338	$end_node = $collation->add_reading( $app_info->{_id} . "E" );
339	$collation->add_path( $lemma_end, $end_node, $attachlabel );
340	$collation->add_path( $end_node,
341	$collation->next_reading( $lemma_end, $collation->baselabel ),
342	$attachlabel );
343	}
344	return( $start_node, $end_node );
345	}
346
# Function to parse an apparatus reading string, with reference to no other
# data. Need to do this separately as readings can include readings (ugh).
# Try to give whatever information we might need, including recursive app
# entries that might need to be parsed.
#
# Argument:
#   $rdg - the raw reading string
#
# Returns a three-element list:
#   \@reading      - the reading words, with '__LEMMA__' prepended for an
#                    addition and '__TRANSPOSE__' for a transposition; the
#                    list is empty for an omission
#   $reading_sigla - hashref of sigil => 1, '_PC_' or '_AC_'
#   $recursed      - undef, or a hashref keyed by the number of reading
#                    words seen before a parenthesised sub-reading, whose
#                    value is that sub-reading's unparsed text
#
# Side effect: sets $ALL_SIGLA{$sigil} to 2 for any sigil seen with a
# correction marker.

sub parse_app_entry {
    my( $rdg, ) = @_;
    my @reading;
    my $reading_sigla = {};
    my $recursed;

    # Nothing to parse.  This is a plain sub, not a loop body, so return
    # empty structures to the caller instead of using 'next'.
    return( \@reading, $reading_sigla, $recursed )
        unless defined $rdg && $rdg =~ /\S/;

    $rdg =~ s/^\s+//;
    $rdg =~ s/\s+$//;
    my @words = split( /\s+/, $rdg );

    # Zero or more sigils e.g. +, followed by Armenian,
    # followed by (possibly modified) sigla, followed by
    # optional : with note.
    my $is_add;
    my $is_omission;
    my $is_transposition;

    # Quote each sigil separately so the alternation bars stay live, and
    # try longer sigla first so a short sigil cannot shadow a longer one
    # that starts with it.  With no sigla known, use a pattern that never
    # matches; an empty alternation would swallow every token.
    my @known_sigla = sort { length( $b ) <=> length( $a ) || $a cmp $b }
        keys %ALL_SIGLA;
    my $sig_alternation = join( '|', map { quotemeta( $_ ) } @known_sigla );
    my $sig_regex = @known_sigla ? qr/^($sig_alternation)(.*)$/ : qr/(?!)/;

    while( @words ) {
        my $bit = shift @words;
        if( $bit eq '+' ) {
            $is_add = 1;
        } elsif( $bit eq 'om' ) {
            $is_omission = 1;
        } elsif( $bit eq '~' ) {
            $is_transposition = 1;
        } elsif( $bit =~ /\p{Armenian}/ ) {
            warn "Found text in omission?!" if $is_omission;
            push( @reading, $bit );
        } elsif( $bit eq ':' ) {
            # Stop processing.
            last;
        } elsif( $bit =~ /^\(/ ) {
            # It's a recursive reading within a reading. Lemmatize what we
            # have so far and grab the extra.
            my @new = ( $bit );
            # Stop when the words run out too, so that an unbalanced
            # parenthesis cannot loop forever.
            while( @words && $new[-1] !~ /\)$/ ) {
                push( @new, shift @words );
            }
            warn "Unterminated parenthesis in $rdg"
                unless $new[-1] =~ /\)$/;
            my $recursed_reading = join( ' ', @new );
            # Unwrap the parentheses but keep what was inside them.
            $recursed_reading =~ s/^\(//;
            $recursed_reading =~ s/\)$//;
            # This recursive entry refers to the last reading word(s) we
            # saw. Push its index+1. We will have to come back to parse
            # it when we are dealing with the main reading.
            # TODO handle () as first element
            # TODO handle - as suffix to add, i.e. make new word
            $recursed->{ scalar @reading } = $recursed_reading;
        } elsif( $bit =~ $sig_regex ) {
            # It must be a sigil.
            my( $sigil, $mod ) = ( $1, $2 );
            if( $mod eq "\x{80}" ) {
                $reading_sigla->{$sigil} = '_PC_';
                $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists
            } elsif( $mod eq '*' ) {
                $reading_sigla->{$sigil} = '_AC_';
                $ALL_SIGLA{$sigil} = 2; # a pre- and post-corr version exists
            } else {
                # skip secondhand corrections
                $reading_sigla->{$sigil} = 1 unless length $mod;
            }
        } elsif( $bit =~ /transpos/ ) {
            # There are some transpositions not coded rigorously; skip them.
            warn "Found hard transposition in $rdg; fix manually";
            last;
        } else {
            warn "Not sure what to do with bit $bit in $rdg";
        }
    }

    # Transmogrify the reading if necessary.
    unshift( @reading, '__LEMMA__' ) if $is_add;
    unshift( @reading, '__TRANSPOSE__' ) if $is_transposition;
    @reading = () if $is_omission;

    return( \@reading, $reading_sigla, $recursed );
}
425
# Add a path for the specified sigla to connect the reading sequence.
# Add an a.c. path to the base sequence if we have an explicitly p.c.
# reading.
# Also handle the paths for sigla we have already added in recursive
# apparatus readings (i.e. don't add a path if one already exists.)
#
# Arguments:
#   $c                - the collation object
#   $sigla            - hashref of sigil => 1, '_PC_' or '_AC_'
#   $base_sequence    - arrayref of readings forming the base (lemma) text
#   $reading_sequence - arrayref of readings forming the variant text

sub _add_sigil_path {
    my( $c, $sigla, $base_sequence, $reading_sequence ) = @_;
    my %skip;
    foreach my $sig ( keys %$sigla ) {
        # An a.c. reading travels under the sigil plus the a.c. suffix.
        my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig . $c->ac_label : $sig;
        _add_sigil_path_along( $c, $reading_sequence, $use_sig, \%skip );

        if( $sigla->{$sig} eq '_PC_' ) {
            # An explicit p.c. reading means the a.c. text follows the base.
            _add_sigil_path_along( $c, $base_sequence,
                                   $sig . $c->ac_label, \%skip );
        }
    }
}

# Link each consecutive pair of readings in @$sequence with a $label path,
# leaving alone any stretch where a $label path already exists.  %$skip
# records, per label, whether we are currently inside such a stretch.
# The existing-path checks are made on the same sequence that is being
# linked; the a.c. pass used to test the variant sequence while linking the
# base sequence.

sub _add_sigil_path_along {
    my( $c, $sequence, $label, $skip ) = @_;
    return unless $sequence && @$sequence > 1;

    foreach my $i ( 0 .. $#{$sequence} - 1 ) {
        my( $from, $to ) = @{$sequence}[ $i, $i + 1 ];
        if( $skip->{$label} ) {
            # Stay quiet until we reach a node the existing path enters.
            next if !_has_prior_reading( $from, $label );
            $skip->{$label} = 0;
        }
        if( _has_next_reading( $from, $label ) ) {
            # This label already leaves the node; don't add a second path.
            $skip->{$label} = 1;
            next;
        }
        $c->add_path( $from, $to, $label );
    }
    return;
}
462
# Remove all ATTACH* nodes, linking the readings on either side of them.
# Then walk the collation for all witness paths, and make sure those paths
# explicitly exist. Then delete all the 'base' paths.
#
# Argument:
#   $c - the collation object; it is modified in place.
#
# Also records each witness's path (and, where %ALL_SIGLA marks a sigil as
# having a corrected form, its uncorrected path) on the witness object, and
# finishes by calling calculate_ranks on the collation.

sub expand_all_paths {
    my( $c ) = @_;

    my $aclabel   = $c->ac_label;
    my $baselabel = $c->baselabel;

    # Delete the anchors
    foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) {
        # Remember the name for diagnostics before the node is deleted.
        my $anchor_name = $anchor->name;

        # Map each path to its incoming/outgoing node.
        my %incoming;
        foreach my $in ( $anchor->incoming() ) {
            $incoming{ $in->label } = $in->from;
        }
        my %outgoing;
        foreach my $out ( $anchor->outgoing() ) {
            $outgoing{ $out->label } = $out->to;
        }
        $c->del_reading( $anchor );

        # Connect in and out.
        foreach my $edge ( keys %incoming ) {
            my $from = $incoming{$edge};
            my $to = $outgoing{$edge};
            if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
                # An a.c. path with no exit of its own follows its main sigil.
                $to = $outgoing{$1};
            }
            $to = $outgoing{$baselabel} unless $to;
            unless( $to ) {
                warn "Have no outbound base link on " . $anchor_name . "!";
                next;   # nothing to connect to
            }
            $c->add_path( $from, $to, $edge );
        }
        # TODO Think about deleting outgoing/edge as we use them to make this faster.
        foreach my $edge ( keys %outgoing ) {
            my $to = $outgoing{$edge};
            my $from = $incoming{$edge};
            if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) {
                $from = $incoming{$1};
            }
            $from = $incoming{$baselabel} unless $from;
            unless( $from ) {
                warn "Have no inbound base link on " . $anchor_name . "!";
                next;   # nothing to connect from
            }
            # The first pass may already have supplied this link.
            $c->add_path( $from, $to, $edge )
                unless _has_prior_reading( $to, $edge );
        }
    }

    # Walk the collation and add paths if necessary
    foreach my $sig ( keys %ALL_SIGLA ) {
        my $wit = $c->tradition->witness( $sig );
        my @path = $c->reading_sequence( $c->start, $c->end, $sig );
        $wit->path( \@path );
        if( $ALL_SIGLA{$sig} > 1 ) {
            my @ac_path = $c->reading_sequence( $c->start, $c->end,
                                                $sig . $aclabel, $sig );
            $wit->uncorrected_path( \@ac_path );
            # a.c. paths are already there by default.
        }
        foreach my $i ( 1 .. $#path ) {
            # If there is no explicit path for this sigil between n-1 and n,
            # add it.
            unless( grep { $_->label eq $sig }
                        $path[$i]->edges_from( $path[$i-1] ) ) {
                $c->add_path( $path[$i-1], $path[$i], $sig );
            }
        }
    }

    # Delete all baselabel edges
    foreach my $edge ( grep { $_->label eq $baselabel } $c->paths ) {
        $c->del_edge( $edge );
    }

    # Calculate ranks on graph nodes
    $c->calculate_ranks();
}
536
# Split a trailing "<digit>\x{80}" marker off a string.
# Returns ( text before the digit, the digit ) when the marker is present,
# and ( the unchanged string, 1 ) when it is not.
sub _get_seq {
    my $text = shift;
    return $text =~ /^(.*)(\d)\x{80}$/
        ? ( $1, $2 )
        : ( $text, 1 );
}
546
# Edges leaving $rdg whose label equals $sigil: the matching edges in list
# context, their count in scalar context.
sub _has_next_reading {
    my $reading = shift;
    my $wanted  = shift;
    my @leaving = $reading->outgoing();
    return grep { $wanted eq $_->label } @leaving;
}
# Edges arriving at $rdg whose label equals $sigil: the matching edges in
# list context, their count in scalar context.
sub _has_prior_reading {
    my $reading  = shift;
    my $wanted   = shift;
    my @arriving = $reading->incoming();
    return grep { $wanted eq $_->label } @arriving;
}