From: Tara L Andrews Date: Sat, 29 Oct 2011 20:29:26 +0000 (+0200) Subject: stop tracking bbedit stuff; first pass at Collate! parsing X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=fa954f4c97d955f5f5ceca2ea1ca3f744e9b985f;p=scpubgit%2Fstemmatology.git stop tracking bbedit stuff; first pass at Collate! parsing --- diff --git a/Tradition.bbprojectd/Scratchpad.txt b/Tradition.bbprojectd/Scratchpad.txt deleted file mode 100644 index e69de29..0000000 diff --git a/Tradition.bbprojectd/Unix Worksheet.worksheet b/Tradition.bbprojectd/Unix Worksheet.worksheet deleted file mode 100644 index e69de29..0000000 diff --git a/Tradition.bbprojectd/project.bbprojectdata b/Tradition.bbprojectd/project.bbprojectdata deleted file mode 100644 index 777e914..0000000 --- a/Tradition.bbprojectd/project.bbprojectdata +++ /dev/null @@ -1,122 +0,0 @@ - - - - - HierarchyData - - BA45EE3F-4E79-4734-A808-E988ECE32C18 - E5B4FCC5-00C7-4E1D-963F-D72E7027869A - 8752D0C9-D9C0-484A-ADD9-3243A186536F - - ProjectItems - - 8752D0C9-D9C0-484A-ADD9-3243A186536F - - ItemData - - AliasData - - AAAAAAFkAAIAAQxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA - AAAAAADKPI0jSCsAAAAILogBdAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAgx4sose4kAAAAAAAAAAP// - //8AAAkgAAAAAAAAAAAAAAAAAAAADHN0ZW1tYXRvbG9n - eQAQAAgAAMo8cQMAAAARAAgAAMosX2kAAAABABAACC6I - AAckkwAFBAYAAL8xAAIAMk1hY2ludG9zaCBIRDpVc2Vy - czoAdGxhOgBQcm9qZWN0czoAc3RlbW1hdG9sb2d5OgB0 - AA4ABAABAHQADwAaAAwATQBhAGMAaQBuAHQAbwBzAGgA - IABIAEQAEgAhVXNlcnMvdGxhL1Byb2plY3RzL3N0ZW1t - YXRvbG9neS90AAATAAEvAAAVAAIACv//AAA= - - FileURL - file://localhost/Users/tla/Projects/stemmatology/t/ - RelativePath - ./t - TypeID - _CFileLocator - Version - 1 - - ItemName - t - ItemType - FolderReference - UserOverrideItemName - - - BA45EE3F-4E79-4734-A808-E988ECE32C18 - - ItemData - - AliasData - - AAAAAAF0AAIAAQxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA - AAAAAADKPI0jSCsAAAAIMZkEVGV4dAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAgxmsovk3cAAAAAAAAAAP// - //8AAAkgAAAAAAAAAAAAAAAAAAAAA2xpYgAAEAAIAADK - PHEDAAAAEQAIAADKL3dXAAAAAQAUAAgxmQAILogAByST - AAUEBgAAvzEAAgA6TWFjaW50b3NoIEhEOlVzZXJzOgB0 - bGE6AFByb2plY3RzOgBzdGVtbWF0b2xvZ3k6AGxpYjoA - VGV4dAAOAAoABABUAGUAeAB0AA8AGgAMAE0AYQBjAGkA - bgB0AG8AcwBoACAASABEABIAKFVzZXJzL3RsYS9Qcm9q - ZWN0cy9zdGVtbWF0b2xvZ3kvbGliL1RleHQAEwABLwAA - FQACAAr//wAA - - FileURL - file://localhost/Users/tla/Projects/stemmatology/lib/Text/ - RelativePath - ./lib/Text - TypeID - _CFileLocator - Version - 1 - - ItemName - Text - ItemType - FolderReference - UserOverrideItemName - - - E5B4FCC5-00C7-4E1D-963F-D72E7027869A - - ItemData - - AliasData - - AAAAAAGkAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA - AAAAAADKPI0jSCsAAAAILogRbWFrZV90cmFkaXRpb24u - cGwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAADZNAcqzM/JURVhUAAAAAP// - //8AAAkgAAAAAAAAAAAAAAAAAAAADHN0ZW1tYXRvbG9n - eQAQAAgAAMo8cQMAAAARAAgAAMqzF9IAAAABABAACC6I - AAckkwAFBAYAAL8xAAIAQk1hY2ludG9zaCBIRDpVc2Vy - czoAdGxhOgBQcm9qZWN0czoAc3RlbW1hdG9sb2d5OgBt - YWtlX3RyYWRpdGlvbi5wbAAOACQAEQBtAGEAawBlAF8A - dAByAGEAZABpAHQAaQBvAG4ALgBwAGwADwAaAAwATQBh - AGMAaQBuAHQAbwBzAGgAIABIAEQAEgAxVXNlcnMvdGxh - L1Byb2plY3RzL3N0ZW1tYXRvbG9neS9tYWtlX3RyYWRp - dGlvbi5wbAAAEwABLwAAFQACAAr//wAA - - FileURL - file://localhost/Users/tla/Projects/stemmatology/make_tradition.pl - RelativePath - ./make_tradition.pl - TypeID - _CFileLocator - Version - 1 - - ItemName - make_tradition.pl - ItemType - FileReference - - - com.barebones.DocumentFormatVersion - 5 - com.barebones.DocumentType - Project - - diff --git a/TreeOfTexts/TreeOfTexts.bbprojectd/Scratchpad.txt b/TreeOfTexts/TreeOfTexts.bbprojectd/Scratchpad.txt deleted file mode 100644 index e69de29..0000000 diff --git a/TreeOfTexts/TreeOfTexts.bbprojectd/Unix Worksheet.worksheet b/TreeOfTexts/TreeOfTexts.bbprojectd/Unix Worksheet.worksheet deleted file mode 100644 index e69de29..0000000 diff --git a/TreeOfTexts/TreeOfTexts.bbprojectd/project.bbprojectdata b/TreeOfTexts/TreeOfTexts.bbprojectd/project.bbprojectdata deleted file mode 100644 index b5dd89a..0000000 --- a/TreeOfTexts/TreeOfTexts.bbprojectd/project.bbprojectdata +++ /dev/null @@ -1,88 +0,0 @@ - - - - - HierarchyData - - 5BD9F128-FD1C-4666-923B-0B81815346E3 - 9FC1475A-0700-42A7-9BB9-5BAAAC11CF84 - - ProjectItems - - 5BD9F128-FD1C-4666-923B-0B81815346E3 - - ItemData - - AliasData - - AAAAAAGMAAIAAQxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA - AAAAAADHK9wvSCsAAACCFmUEcm9vdAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAIIWaMquj3gAAAAAAAAAAP// - //8AAAkgAAAAAAAAAAAAAAAAAAAAC1RyZWVPZlRleHRz - AAAQAAgAAMcrzh8AAAARAAgAAMquc1gAAAABABQAghZl - AFA22wAI934ACAgzAACRjQACAEJNYWNpbnRvc2ggSEQ6 - VXNlcnM6AHRsYToAUHJvamVjdHM6AHN0ZW1tYXRvbG9n - eToAVHJlZU9mVGV4dHM6AHJvb3QADgAKAAQAcgBvAG8A - dAAPABoADABNAGEAYwBpAG4AdABvAHMAaAAgAEgARAAS - ADBVc2Vycy90bGEvUHJvamVjdHMvc3RlbW1hdG9sb2d5 - L1RyZWVPZlRleHRzL3Jvb3QAEwABLwAAFQACAAr//wAA - - FileURL - file://localhost/Users/tla/Projects/stemmatology/TreeOfTexts/root/ - RelativePath - ./root - TypeID - _CFileLocator - Version - 1 - - ItemName - root - ItemType - FolderReference - UserOverrideItemName - - - 9FC1475A-0700-42A7-9BB9-5BAAAC11CF84 - - ItemData - - AliasData - - AAAAAAGKAAIAAQxNYWNpbnRvc2ggSEQAAAAAAAAAAAAA - AAAAAADHK9wvSCsAAACCFmUDbGliAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA - AAAAAAAAAAAAAAAAAAAAAIIWZ8quj3gAAAAAAAAAAP// - //8AAAkgAAAAAAAAAAAAAAAAAAAAC1RyZWVPZlRleHRz - AAAQAAgAAMcrzh8AAAARAAgAAMquc1gAAAABABQAghZl - AFA22wAI934ACAgzAACRjQACAEFNYWNpbnRvc2ggSEQ6 - VXNlcnM6AHRsYToAUHJvamVjdHM6AHN0ZW1tYXRvbG9n - eToAVHJlZU9mVGV4dHM6AGxpYgAADgAIAAMAbABpAGIA - DwAaAAwATQBhAGMAaQBuAHQAbwBzAGgAIABIAEQAEgAv - VXNlcnMvdGxhL1Byb2plY3RzL3N0ZW1tYXRvbG9neS9U - cmVlT2ZUZXh0cy9saWIAABMAAS8AABUAAgAK//8AAA== - - FileURL - file://localhost/Users/tla/Projects/stemmatology/TreeOfTexts/lib/ - RelativePath - ./lib - TypeID - _CFileLocator - Version - 1 - - ItemName - lib - ItemType - FolderReference - UserOverrideItemName - - - - com.barebones.DocumentFormatVersion - 5 - com.barebones.DocumentType - Project - - diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index f2ae1f6..e10198d 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -246,7 +246,7 @@ sub BUILD { $self->_save_collation( $collation ); # Call the appropriate parser on the given data - my @format_standalone = qw/ Self CollateX CTE TEI Tabular /; + my @format_standalone = qw/ Self CollateText CollateX CTE TEI Tabular /; my @format_basetext = qw/ KUL /; my $use_base; my $format = $init_args->{'input'}; diff --git a/lib/Text/Tradition/Analysis.pm b/lib/Text/Tradition/Analysis.pm index 49cbe95..a717529 100644 --- a/lib/Text/Tradition/Analysis.pm +++ b/lib/Text/Tradition/Analysis.pm @@ -55,7 +55,7 @@ sub run_analysis { my $col_wits = shift @$all_wits_table; # Any witness in the stemma that has no row should be noted. foreach ( @$col_wits ) { - $wits->{$_}++; + $wits->{$_}++; # Witnesses present in table and stemma now have value 2. } my @not_collated = grep { $wits->{$_} == 1 } keys %$wits; diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index d28d7d7..d903191 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -534,7 +534,7 @@ sub make_alignment_table { return; } my $table; - my @all_pos = sort { $a <=> $b } $self->possible_positions; + my @all_pos = ( 0 .. $self->end->rank - 1 ); foreach my $wit ( $self->tradition->witnesses ) { # print STDERR "Making witness row(s) for " . $wit->sigil . "\n"; my @row = _make_witness_row( $wit->path, \@all_pos, $noderefs ); @@ -1067,206 +1067,8 @@ sub flatten_ranks { } -sub possible_positions { - my $self = shift; - my %all_pos; - map { $all_pos{ $_->rank } = 1 } $self->readings; - return keys %all_pos; -} - -# TODO think about indexing this. -sub readings_at_position { - my( $self, $position, $strict ) = @_; - my @answer; - foreach my $r ( $self->readings ) { - push( @answer, $r ) if $r->is_at_position( $position, $strict ); - } - return @answer; -} - -## Lemmatizer functions - -sub init_lemmata { - my $self = shift; - - foreach my $position ( $self->possible_positions ) { - $self->lemmata->{$position} = undef; - } - - foreach my $cr ( $self->common_readings ) { - $self->lemmata->{$cr->position->maxref} = $cr->name; - } -} - -sub common_readings { - my $self = shift; - my @common = grep { $_->is_common } $self->readings(); - return sort { $a->rank <=> $b->rank } @common; -} - -=item B - -my @state = $graph->lemma_readings( @readings_delemmatized ); - -Takes a list of readings that have just been delemmatized, and returns -a set of tuples of the form ['reading', 'state'] that indicates what -changes need to be made to the graph. - -=over - -=item * - -A state of 1 means 'lemmatize this reading' - -=item * - -A state of 0 means 'delemmatize this reading' - -=item * - -A state of undef means 'an ellipsis belongs in the text here because -no decision has been made / an earlier decision was backed out' - -=back - -=cut - -sub lemma_readings { - my( $self, @toggled_off_nodes ) = @_; - - # First get the positions of those nodes which have been - # toggled off. - my $positions_off = {}; - map { $positions_off->{ $_->position->reference } = $_->name } - @toggled_off_nodes; - - # Now for each position, we have to see if a node is on, and we - # have to see if a node has been turned off. The lemmata hash - # should contain fixed positions, range positions whose node was - # just turned off, and range positions whose node is on. - my @answer; - my %fixed_positions; - # TODO One of these is probably redundant. - map { $fixed_positions{$_} = 0 } keys %{$self->lemmata}; - map { $fixed_positions{$_} = 0 } keys %{$positions_off}; - map { $fixed_positions{$_} = 1 } $self->possible_positions; - foreach my $pos ( sort { Text::Tradition::Collation::Position::str_cmp( $a, $b ) } keys %fixed_positions ) { - # Find the state of this position. If there is an active node, - # its name will be the state; otherwise the state will be 0 - # (nothing at this position) or undef (ellipsis at this position) - my $active = undef; - $active = $self->lemmata->{$pos} if exists $self->lemmata->{$pos}; - - # Is there a formerly active node that was toggled off? - if( exists( $positions_off->{$pos} ) ) { - my $off_node = $positions_off->{$pos}; - if( $active && $active ne $off_node) { - push( @answer, [ $off_node, 0 ], [ $active, 1 ] ); - } else { - unless( $fixed_positions{$pos} ) { - $active = 0; - delete $self->lemmata->{$pos}; - } - push( @answer, [ $off_node, $active ] ); - } - - # No formerly active node, so we just see if there is a currently - # active one. - } elsif( $active ) { - # Push the active node, whatever it is. - push( @answer, [ $active, 1 ] ); - } else { - # Push the state that is there. Arbitrarily use the first node - # at that position. - my @pos_nodes = $self->readings_at_position( $pos ); - push( @answer, [ $pos_nodes[0]->name, $self->lemmata->{$pos} ] ); - delete $self->lemmata->{$pos} unless $fixed_positions{$pos}; - } - } - - return @answer; -} - -=item B - -my @readings_delemmatized = $graph->toggle_reading( $reading_name ); - -Takes a reading node name, and either lemmatizes or de-lemmatizes -it. Returns a list of all readings that are de-lemmatized as a result -of the toggle. - -=cut - -sub toggle_reading { - my( $self, $rname ) = @_; - - return unless $rname; - my $reading = $self->reading( $rname ); - if( !$reading || $reading->is_common() ) { - # Do nothing, it's a common node. - return; - } +## Utility functions - my $pos = $reading->position; - my $fixed = $reading->position->fixed; - my $old_state = $self->lemmata->{$pos->reference}; - - my @readings_off; - if( $old_state && $old_state eq $rname ) { - # Turn off the node. We turn on no others by default. - push( @readings_off, $reading ); - } else { - # Turn on the node. - $self->lemmata->{$pos->reference} = $rname; - # Any other 'on' readings in the same position should be off - # if we have a fixed position. - push( @readings_off, $self->same_position_as( $reading, 1 ) ) - if $pos->fixed; - # Any node that is an identical transposed one should be off. - push( @readings_off, $reading->identical_readings ); - } - @readings_off = unique_list( @readings_off ); - - # Turn off the readings that need to be turned off. - my @readings_delemmatized; - foreach my $n ( @readings_off ) { - my $npos = $n->position; - my $state = undef; - $state = $self->lemmata->{$npos->reference} - if defined $self->lemmata->{$npos->reference}; - if( $state && $state eq $n->name ) { - # this reading is still on, so turn it off - push( @readings_delemmatized, $n ); - my $new_state = undef; - if( $npos->fixed && $n eq $reading ) { - # This is the reading that was clicked, so if there are no - # other readings there and this is a fixed position, turn off - # the position. In all other cases, restore the ellipsis. - my @other_n = $self->same_position_as( $n ); # TODO do we need strict? - $new_state = 0 unless @other_n; - } - $self->lemmata->{$npos->reference} = $new_state; - } elsif( $old_state && $old_state eq $n->name ) { - # another reading has already been turned on here - push( @readings_delemmatized, $n ); - } # else some other reading was on anyway, so pass. - } - return @readings_delemmatized; -} - -sub same_position_as { - my( $self, $reading, $strict ) = @_; - my $pos = $reading->position; - my %onpath = ( $reading->name => 1 ); - # TODO This might not always be sufficient. We really want to - # exclude all readings on this one's path between its two - # common points. - map { $onpath{$_->name} = 1 } $reading->neighbor_readings; - my @same = grep { !$onpath{$_->name} } - $self->readings_at_position( $reading->position, $strict ); - return @same; -} - # Return the string that joins together a list of witnesses for # display on a single path. sub path_label { @@ -1281,13 +1083,6 @@ sub witnesses_of_label { return @answer; } -sub unique_list { - my( @list ) = @_; - my %h; - map { $h{$_->name} = $_ } @list; - return values( %h ); -} - sub add_hash_entry { my( $hash, $key, $entry ) = @_; if( exists $hash->{$key} ) { diff --git a/lib/Text/Tradition/Parser/CollateText.pm b/lib/Text/Tradition/Parser/CollateText.pm new file mode 100644 index 0000000..a712bb8 --- /dev/null +++ b/lib/Text/Tradition/Parser/CollateText.pm @@ -0,0 +1,554 @@ +package Text::Tradition::Parser::CollateText; + +use strict; +use warnings; + +=head1 NAME + +Text::Tradition::Parser::CollateText + +=head1 DESCRIPTION + +For an overview of the package, see the documentation for the +Text::Tradition module. + +This module is meant for use with a set of text files saved from Word docs, +which originated with the COLLATE collation program. + +=head1 SUBROUTINES + +=over + +=item B + +parse( $graph, $opts ); + +Takes an initialized graph and a hashref of options, which must include: +- 'base' - the base text referenced by the variants +- 'format' - the format of the variant list +- 'data' - the variants, in the given format. + +=cut + +my %ALL_SIGLA; + +sub parse { + my( $tradition, $opts ) = @_; + # String together the base text. + my $lineref_hash = read_stone_base( $opts->{'base'}, $tradition->collation ); + # Note the sigla. + foreach my $sigil ( @{$opts->{'sigla'}} ) { + $ALL_SIGLA{$sigil} = 1; + $tradition->add_witness( 'sigil' => $sigil ); + } + # Now merge on the apparatus entries. + merge_stone_apparatus( $tradition->collation, $lineref_hash, $opts->{'input'} ); +} + +=item B + +my $text_list = read_base( 'reference.txt', $collation ); + +Takes a text file and a (presumed empty) collation object, adds the words +as simple linear readings to the collation, and returns a hash of texts +with line keys. This collation is now the starting point for application of +apparatus entries in merge_base, e.g. from a CSV file or a Classical Text +Editor file. + +The hash is of the form + + { chapter_name => { line_ref => { start => node, end => node } } } + +=cut + +sub read_stone_base { + my( $base_file, $collation ) = @_; + + # This array gives the first reading for each line. We put the + # common starting point in line zero. + my $last_reading = $collation->start(); + my $lineref_hash = {}; + my $last_lineref; + + my $curr_text; + open( BASE, $base_file ) or die "Could not open file $base_file: $!"; + my $i = 1; + while() { + # Make the readings, and connect them up for the base, but + # also save the first reading of each line in a hash for the + # purpose. + chomp; + next if /^\s+$/; # skip blank lines + s/^(\d)\x{589}/$1:/; # turn Armenian full stops into colons + if( /^TESTAMENT/ ) { + # Initialize the base hash for this section. + $lineref_hash->{$_} = {}; + $curr_text = $lineref_hash->{$_}; + next; + } + my @words = split; + my $lineref; + if( /^\d/ ) { + # The first "word" is a line reference; keep it. + $lineref = shift @words; + } else { + # Assume we are dealing with the title. + $lineref = 'Title:'; + } + + # Now turn the remaining words into readings. + my $wordref = 0; + foreach my $w ( @words ) { + my $readingref = join( ',', $lineref, ++$wordref ); + my $reading = $collation->add_reading( $readingref ); + $reading->text( $w ); + unless( exists $curr_text->{$lineref}->{'start'} ) { + $curr_text->{$lineref}->{'start'} = $reading; + } + # Add edge paths in the graph, for easier tracking when + # we start applying corrections. These paths will be + # removed when we're done. + my $path = $collation->add_path( $last_reading, $reading, + $collation->baselabel ); + $last_reading = $reading; + } + $curr_text->{$lineref}->{'end'} = $last_reading; + } + + close BASE; + # Ending point for all texts + $collation->add_path( $last_reading, $collation->end, $collation->baselabel ); + return( $lineref_hash ); +} + +=item B + +Read an apparatus as output (presumably) by Collate. It should be reasonably +regular in form, I hope. Merge the apparatus variants onto the appropriate +lemma readings. + +=cut + +sub merge_stone_apparatus { + my( $c, $lineref_hash, $file ) = @_; + + my $text_apps = {}; + my $current_text; + open( APP, $file ) or die "Could not read apparatus file $file"; + while( ) { + chomp; + next if /^\s+$/; + if( /^TESTAMENT/ ) { + $current_text = $lineref_hash->{$_}; + next; + } + + # Otherwise, the first word of the line is the base text line reference. + my $i = 0; + my $lineref; + if( s/^(\S+)// ) { + $lineref = $1; + } else { + warn "Unrecognized line $_"; + } + my $baseline = $current_text->{$lineref}; + # The start and end readings for this line are now in $baseline->{start} + # and $baseline->{end}. + + # Now look at the apparatus entries for this line. They are + # split with |. + my @apps = split( '|' ); + foreach my $app ( @apps ) { + my( $lemma, $rest ) = split( ']', $app ); + + # Find the lemma reading. + my( $lemma_start, $lemma_end ) = + _find_reading_on_line( $c, $lemma, $baseline ); + my @lemma_chain = $c->reading_sequence( $lemma_start, $lemma_end ); + + # Splice in "start" and "end" placeholders on either + # side of the lemma. + my ( $rdg_start, $rdg_end ) = + _add_reading_placeholders( $c, $lemma_start, $lemma_end ); + + # For each reading, attach it to the lemma. + my @indiv = split( ' ', $rest ); + foreach my $rdg ( @indiv ) { + # Parse the string. + my( $words, $sigla, $recurse ) = parse_app_entry( $rdg ); + my @readings; + foreach my $i ( 0 .. $#$words ) { + next if $i == 0 && $words->[$i] =~ /^__/; + my $reading_id = $rdg_start->text . '_' . $rdg_end->text . '/' . $i; + my $reading = $c->add_reading( $reading_id ); + $reading->text( $words->[$i] ); + push( @readings, $reading ); + } + + # Deal with any specials. + my $lemma_sequence; + if( $words->[0] eq '__LEMMA__' ) { + $lemma_sequence = [ $lemma_end, $rdg_end ]; + } elsif ( $rdg->[0] eq '__TRANSPOSE__' ) { + # Hope it is only two or three words in the lemma. + # TODO figure out how we really want to handle this + @readings = reverse @lemma_chain; + } + $lemma_sequence = [ $rdg_start, @lemma_chain, $rdg_end ] + unless $lemma_sequence; + + # Now hook up the paths. + unshift( @readings, $rdg_start ); + push( @readings, $rdg_end ); + foreach my $i ( 1 .. $#readings ) { + if( $recurse->{$i} ) { + my( $rwords, $rsig ) = parse_app_entry( $recurse->{$i} ); + # Get the local "lemma" sequence + my $llseq = [ $readings[$i], $readings[$i+1] ]; + if( $rwords->[0] ne '__LEMMA__' ) { + # Treat it as an addition to the last word + unshift( @$llseq, $readings[$i-1] ); + } + # Create the reading nodes in $rwords + # TODO Hope we don't meet ~ in a recursion + my $local_rdg = []; + foreach my $i ( 0 .. $#$rwords ) { + next if $i == 0 && $rwords->[$i] =~ /^__/; + my $reading_id = $llseq->[0]->text . '_' . + $llseq->[-1]->text . '/' . $i; + my $reading = $c->add_reading( $reading_id ); + $reading->text( $words->[$i] ); + push( @$local_rdg, $reading ); + } + # Add the path(s) necessary + _add_sigil_path( $c, $rsig, $local_rdg, $llseq ); + } + } + _add_sigil_path( $c, $sigla, \@readings, $lemma_sequence ); + } # end processing of $app + } # end foreach my $app in line + } # end while + + # Now reconcile all the paths in the collation, and delete our + # temporary anchor nodes. + expand_all_paths( $c ); + + # Finally, calculate the ranks we've got. + $c->calculate_ranks; +} + +sub _find_reading_on_line { + my( $c, $lemma, $baseline ) = @_; + + my $lemma_start = $baseline->{'start'}; + my $lemma_end; + my $too_far = $baseline->{'end'}->next_reading; + my @lemma_words = split( /\s+/, $lemma ); + + my %seen; + my $scrutinize = ''; # DEBUG variable + my $seq = 1; + while( $lemma_start ne $too_far ) { + # Loop detection + if( $seen{ $lemma_start->name() } ) { + warn "Detected loop at " . $lemma_start->name . " for lemma $lemma"; + last; + } + $seen{ $lemma_start->name() } = 1; + + # Try to match the lemma. + # TODO move next/prior reading methods into the reading classes, + # to make this more self-contained and not need to pass $c. + my $unmatch = 0; + my ( $lw, $seq ) = _get_seq( $lemma_words[0] ); + print STDERR "Matching $lemma_start against $lw...\n" + if $scrutinize; + if( $lemma_start->text eq $lw ) { + # Skip it if we need a match that is not the first. + if( --$seq < 1 ) { + # Now we have to compare the rest of the words here. + if( scalar( @lemma_words ) > 1 ) { + my $next_reading = + $c->next_reading( $lemma_start ); + my $wildcard = 0; + foreach my $w ( @lemma_words[1..$#lemma_words] ) { + if( $w eq '---' ) { + # We match everything to the next word. + $wildcard = 1; + next; + } else { + $wildcard = 0; + } + ( $lw, $seq ) = _get_seq( $w ); + printf STDERR "Now matching %s against %s\n", + $next_reading->text, $lw + if $scrutinize; + if( !$wildcard && $w ne $next_reading->text) { + $unmatch = 1; + last; + } else { + $lemma_end = $next_reading; + $next_reading = + $c->next_reading( $lemma_end ); + } + } + } else { # single-word match, easy. + $lemma_end = $lemma_start; + } + } else { # we need the Nth match and aren't there yet + $unmatch = 1; + } + } + last unless ( $unmatch || !defined( $lemma_end ) ); + $lemma_end = undef; + $lemma_start = $c->next_reading( $lemma_start ); + } + + unless( $lemma_end ) { + warn "No match found for @lemma_words"; + return undef; + } + return( $lemma_start, $lemma_end ); +} + +sub _add_reading_placeholders { + my( $collation, $lemma_start, $lemma_end ) = @_; + # We will splice in a 'begin' and 'end' marker on either side of the + # lemma, as sort of a double-endpoint attachment in the graph. + + my $attachlabel = "ATTACH"; + my( $start_node, $end_node ); + my @start_id = grep { $_->label eq $attachlabel } $lemma_start->incoming; + if( @start_id ) { + # There already exists an app-begin node. Use that. + $start_node = $start_id[0]->from; + } else { + $start_node = $collation->add_reading( $app_info->{_id} ); + $collation->add_path( + $collation->prior_reading( $lemma_start, $collation->baselabel ), + $start_node, $attachlabel ); + $collation->add_path( $start_node, $lemma_start, $attachlabel ); + } + # Now the converse for the end. + my @end_id = grep { $_->label eq $attachlabel } $lemma_end->outgoing; + if( @end_id ) { + # There already exists an app-begin node. Use that. + $end_node = $end_id[0]->to; + } else { + $end_node = $collation->add_reading( $app_info->{_id} . "E" ); + $collation->add_path( $lemma_end, $end_node, $attachlabel ); + $collation->add_path( $end_node, + $collation->next_reading( $lemma_end, $collation->baselabel ), + $attachlabel ); + } + return( $start_node, $end_node ); +} + +# Function to parse an apparatus reading string, with reference to no other +# data. Need to do this separately as readings can include readings (ugh). +# Try to give whatever information we might need, including recursive app +# entries that might need to be parsed. + +sub parse_app_entry { + my( $rdg, ) = @_; + $rdg =~ s/^\s+//; + $rdg =~ s/\s+$//; + next unless $rdg; # just in case + my @words = split( /\s+/, $rdg ); + # Zero or more sigils e.g. +, followed by Armenian, + # followed by (possibly modified) sigla, followed by + # optional : with note. + my $is_add; + my $is_omission; + my $is_transposition; + my @reading; + my %reading_sigla; + my $recursed; + my $sig_regex = join( '|', keys %ALL_SIGLA ); + while( @words ) { + my $bit = shift @words; + if( $bit eq '+' ) { + $is_add = 1; + } elsif( $bit eq 'om' ) { + $is_omission = 1; + } elsif( $bit eq '~' ) { + $is_transposition = 1; + } elsif( $bit =~ /\p{Armenian}/ ) { + warn "Found text in omission?!" if $is_omission; + push( @reading, $bit ); + } elsif( $bit eq ':' ) { + # Stop processing. + last; + } elsif( $bit =~ /^\($/ ) { + # It's a recursive reading within a reading. Lemmatize what we + # have so far and grab the extra. + my @new = ( $1 ); + until( $new[-1] =~ /\)$/ ) { + push( @new, shift @words ); + } + my $recursed_reading = join( ' ', @new ); + $recursed_reading =~ s/^\((.*)\)//; + # This recursive entry refers to the last reading word(s) we + # saw. Push its index+1. We will have to come back to parse + # it when we are dealing with the main reading. + # TODO handle () as first element + # TODO handle - as suffix to add, i.e. make new word + $recursed->{@reading} = $recursed_reading; + } elsif( $bit =~ /^(\Q$sig_regex\E)(.*)$/ { + # It must be a sigil. + my( $sigil, $mod ) = ( $1, $2 ); + if( $mod eq "\x{80}" ) { + $reading_sigla->{$sig} = '_PC_'; + $ALL_SIGLA{$sig} = 2; # a pre- and post-corr version exists + } elsif( $mod eq '*' ) { + $reading_sigla->{$sig} = '_AC_'; + $ALL_SIGLA{$sig} = 2; # a pre- and post-corr version exists + } else { + $reading_sigla->{$sig} = 1 unless $mod; # skip secondhand corrections + } + } elsif( $bit =~ /transpos/ ) { + # There are some transpositions not coded rigorously; skip them. + warn "Found hard transposition in $rdg; fix manually"; + last; + } else { + warn "Not sure what to do with bit $bit in $rdg"; + } + } + + # Transmogrify the reading if necessary. + unshift( @reading, '__LEMMA__' ) if $is_add; + unshift( @reading, '__TRANSPOSE__' ) if $is_transposition; + @reading = () if $is_omission; + + return( \@reading, $reading_sigla, $recursed ); +} + +# Add a path for the specified sigla to connect the reading sequence. +# Add an a.c. path to the base sequence if we have an explicitly p.c. +# reading. +# Also handle the paths for sigla we have already added in recursive +# apparatus readings (i.e. don't add a path if one already exists.) + +sub _add_sigil_path { + my( $c, $sigla, $base_sequence, $reading_sequence ) = @_; + my %skip; + foreach my $sig ( keys %$sigla ) { + my $use_sig = $sigla->{$sig} eq '_AC_' ? $sig.$c->ac_label : $sig; + foreach my $i ( 0 .. $#$reading_sequence-1 ) { + if( $skip{$use_sig} ) { + next if !_has_prior_reading( $reading_sequence[$i], $use_sig ); + $skip{$use_sig} = 0; + if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) { + $skip{$use_sig} = 1; + next; + } + $c->add_path( $reading_sequence[$i], $reading_sequence[$i+1], $use_sig); + } + if( $sigla->{$sig} eq '_PC_') { + $use_sig = $sig.$c->ac_label + foreach my $i ( 0 .. @$base_sequence ) { + if( $skip{$use_sig} ) { + next if !_has_prior_reading( $reading_sequence[$i], $use_sig ); + $skip{$use_sig} = 0; + if( _has_next_reading( $reading_sequence[$i], $use_sig ) ) { + $skip{$use_sig} = 1; + next; + } + $c->add_path( $base_sequence[$i], $base_sequence[$i+1], $use_sig ); + } + } + } +} + +# Remove all ATTACH* nodes, linking the readings on either side of them. +# Then walk the collation for all witness paths, and make sure those paths +# explicitly exist. Then delete all the 'base' paths. + +sub expand_all_paths { + my( $c ) = @_; + + # Delete the anchors + foreach my $anchor ( grep { $_->name =~ /ATTACH/ } $c->readings ) { + # Map each path to its incoming/outgoing node. + my %incoming; + map { $incoming{$_->label} = $_->from } $anchor->incoming(); + my %outgoing; + map { $outgoing{$_->label} = $_->to } $anchor->outgoing(); + $c->del_reading( $anchor ); + + # Connect in and out. + my $aclabel = $c->ac_label; + foreach my $edge ( keys %incoming ) { + my $from = $incoming{$edge}; + my $to = $outgoing{$edge}; + if( !$to && $edge =~ /^(.*)\Q$aclabel\E$/ ) { + $to = $outgoing{$1}; + } + $to = $outgoing{$c->baselabel} unless $to; + warn "Have no outbound base link on " . $anchor->name . "!" + unless $to; + $c->add_path( $from, $to, $edge ); + } + # TODO Think about deleting outgoing/edge as we use them to make this faster. + foreach my $edge ( keys %outgoing ) { + my $to = $outgoing{$edge}; + my $from = incoming{$edge}; + if( !$from && $edge =~ /^(.*)\Q$aclabel\E$/ ) { + $from = $incoming{$1}; + } + $from = $incoming{$c->baselabel} unless $to; + warn "Have no inbound base link on " . $anchor->name . "!" + unless $from; + $c->add_path( $from, $to, $edge ) + unless _has_prior_reading( $to, $edge ); + } + } + } + + # Walk the collation and add paths if necessary + foreach my $sig ( keys %ALL_SIGLA ) { + my $wit = $c->tradition->witness( $sig ); + my @path = $c->reading_sequence( $c->start, $c->end, $sig ); + $wit->path( \@path ); + if( $ALL_SIGLA{$sig} > 1 ) { + my @ac_path = $c->reading_sequence( $c->start, $c->end, + $sig.$c->ac_label, $sig ); + $wit->uncorrected_path( \@path ); + # a.c. paths are already there by default. + } + foreach my $i ( 1 .. $#$path ) { + # If there is no explicit path for this sigil between n-1 and n, + # add it. + unless( grep { $_->label eq $sig } $path[$i]->edges_from( $path[$i-1] ) ) { + $c->add_path( $path[$i-1], $path[$i], $sig ); + } + } + } + + # Delete all baselabel edges + foreach my $edge ( grep { $_->label eq $c->baselabel } $c->paths ) { + $c->del_edge( $edge ); + } + + # Calculate ranks on graph nodes + $c->calculate_ranks(); +} + +sub _get_seq { + my( $str ) = @_; + my $seq = 1; + my $lw = $str; + if( $str =~ /^(.*)(\d)\x{80}$/ ) { + ( $lw, $seq) = ( $1, $2 ); + } + return( $lw, $seq ); +} + +sub _has_next_reading { + my( $rdg, $sigil ) = @_; + return grep { $_->label eq $sigil } $rdg->outgoing(); +} +sub _has_prior_reading { + my( $rdg, $sigil ) = @_; + return grep { $_->label eq $sigil } $rdg->incoming(); +} \ No newline at end of file diff --git a/make_tradition.pl b/make_tradition.pl index 5f75f33..f928c33 100755 --- a/make_tradition.pl +++ b/make_tradition.pl @@ -27,7 +27,7 @@ if( $help ) { help(); } -unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX|tab(ular)?)$/i ) { +unless( $informat =~ /^(CSV|CTE|KUL|Self|TEI|CollateX|tab(ular)?)|stone$/i ) { help( "Input format must be one of CollateX, CSV, CTE, Self, TEI" ); } $informat = 'CollateX' if $informat =~ /^c(ollate)?x$/i; @@ -36,16 +36,18 @@ $informat = 'CTE' if $informat =~ /^cte$/i; $informat = 'Self' if $informat =~ /^self$/i; $informat = 'TEI' if $informat =~ /^tei$/i; $informat = 'Tabular' if $informat =~ /^tab$/i; +$informat = 'CollateText' if $informat =~ /^stone$/i; unless( $outformat =~ /^(graphml|svg|dot|stemma|csv)$/ ) { help( "Output format must be one of graphml, svg, csv, stemma, or dot" ); } # Do we have a base if we need it? -if( $informat eq 'KUL' && !$inbase ) { +if( $informat =~ /^(KUL|CollateText)$/ && !$inbase ) { help( "$informat input needs a base text" ); } + my $input = $ARGV[0]; # First: read the base. Make a graph, but also note which @@ -55,6 +57,10 @@ my %args = ( 'input' => $informat, 'linear' => $linear ); $args{'base'} = $inbase if $inbase; $args{'name'} = $name if $name; +### Custom hacking for Stone +if( $informat eq 'CollateText' ) { + $args{'sigla'} = [ qw/ S M X V Z Bb B K W L / ]; +} my $tradition = Text::Tradition->new( %args ); ### Custom hacking