From: tla Date: Wed, 18 May 2011 15:07:39 +0000 (+0200) Subject: UNTESTED saving work on base text parsing with new library X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=e290206835dd4bc540d751ea2d1849255dd192f2;p=scpubgit%2Fstemmatology.git UNTESTED saving work on base text parsing with new library --- diff --git a/lib/Text/Tradition.pm b/lib/Text/Tradition.pm index 6cce95f..8e028a9 100644 --- a/lib/Text/Tradition.pm +++ b/lib/Text/Tradition.pm @@ -92,7 +92,8 @@ sub witness { foreach my $wit ( @{$self->witnesses} ) { $requested_wit = $wit if $wit->sigil eq $sigil; } - warn "No such witness $sigil" unless $requested_wit; + # We depend on an undef return value for no such witness. + # warn "No such witness $sigil" unless $requested_wit; return $requested_wit; } @@ -101,6 +102,7 @@ sub add_witness { my $self = shift; my $new_wit = Text::Tradition::Witness->new( @_ ); push( @{$self->witnesses}, $new_wit ); + return $new_wit; } # The user will usually be instantiating a Tradition object, and diff --git a/lib/Text/Tradition/Collation.pm b/lib/Text/Tradition/Collation.pm index 7a75c18..aa905d2 100644 --- a/lib/Text/Tradition/Collation.pm +++ b/lib/Text/Tradition/Collation.pm @@ -255,6 +255,45 @@ sub start { return $self->reading('#START#'); } +=item B + +my @readings = $graph->reading_sequence( $first, $last, $path[, $alt_path] ); + +Returns the ordered list of readings, starting with $first and ending +with $last, along the given witness path. If no path is specified, +assume that the path is that of the base text (if any.) + +=cut + +sub reading_sequence { + my( $self, $start, $end, $witness, $backup ) = @_; + + $witness = 'base text' unless $witness; + my @readings = ( $start ); + my %seen; + my $n = $start; + while( $n ne $end ) { + if( exists( $seen{$n->name()} ) ) { + warn "Detected loop at " . $n->name(); + last; + } + $seen{$n->name()} = 1; + + my $next = $self->next_reading( $n, $witness, $backup ); + warn "Did not find any path for $witness from reading " . $n->name + unless $next; + push( @readings, $next ); + $n = $next; + } + # Check that the last reading is our end reading. + my $last = $readings[$#readings]; + warn "Last reading found from " . $start->label() . + " for witness $witness is not the end!" + unless $last eq $end; + + return @readings; +} + =item B my $next_reading = $graph->next_reading( $reading, $witpath ); @@ -265,7 +304,7 @@ path. TODO These are badly named. =cut sub next_reading { - # Return the successor via the corresponding edge. + # Return the successor via the corresponding path. my $self = shift; return $self->_find_linked_reading( 'next', @_ ); } @@ -280,30 +319,48 @@ path. TODO These are badly named. =cut sub prior_reading { - # Return the predecessor via the corresponding edge. + # Return the predecessor via the corresponding path. my $self = shift; return $self->_find_linked_reading( 'prior', @_ ); } sub _find_linked_reading { - my( $self, $direction, $node, $edge ) = @_; - $edge = 'base text' unless $edge; - my @linked_edges = $direction eq 'next' + my( $self, $direction, $node, $path, $alt_path ) = @_; + my @linked_paths = $direction eq 'next' ? $node->outgoing() : $node->incoming(); - return undef unless scalar( @linked_edges ); + return undef unless scalar( @linked_paths ); - # We have to find the linked edge that contains all of the - # witnesses supplied in $edge. - my @edge_wits = $self->witnesses_of_label( $edge ); - foreach my $le ( @linked_edges ) { - my @le_wits = $self->witnesses_of_label( $le->name ); - if( _is_within( \@edge_wits, \@le_wits ) ) { - # This is the right edge. - return $direction eq 'next' ? $le->to() : $le->from(); + # We have to find the linked path that contains all of the + # witnesses supplied in $path. + my( @path_wits, @alt_path_wits ); + @path_wits = $self->witnesses_of_label( $path ) if $path; + @alt_path_wits = $self->witnesses_of_label( $alt_path ) if $alt_path; + my $base_le; + my $alt_le; + foreach my $le ( @linked_paths ) { + if( $le->name eq 'base text' ) { + $base_le = $le; + } else { + my @le_wits = $self->witnesses_of_label( $le->name ); + if( _is_within( \@path_wits, \@le_wits ) ) { + # This is the right path. + return $direction eq 'next' ? $le->to() : $le->from(); + } elsif( _is_within( \@alt_path_wits, \@le_wits ) ) { + $alt_le = $le; + } } } + # Got this far? Return the alternate path if it exists. + return $direction eq 'next' ? $alt_le->to() : $alt_le->from() + if $alt_le; + + # Got this far? Return the base path if it exists. + return $direction eq 'next' ? $base_le->to() : $base_le->from() + if $base_le; + + # Got this far? We have no appropriate path. warn "Could not find $direction node from " . $node->label - . " along edge $edge"; + . " along path $path"; return undef; } @@ -320,7 +377,8 @@ sub _is_within { ## INITIALIZATION METHODS - for use by parsers # Walk the paths for each witness in the graph, and return the nodes -# that the graph has in common. +# that the graph has in common. If $using_base is true, some +# different logic is needed. sub walk_witness_paths { my( $self, $end ) = @_; @@ -332,22 +390,11 @@ sub walk_witness_paths { my @common_readings; foreach my $wit ( @{$self->tradition->witnesses} ) { my $curr_reading = $self->start; - my @wit_path = ( $curr_reading ); - my %seen_readings; - # TODO Detect loops at some point - while( $curr_reading->name ne $end->name ) { - if( $seen_readings{$curr_reading->name} ) { - warn "Detected loop walking path for witness " . $wit->sigil - . " at reading " . $curr_reading->name; - last; - } - my $next_reading = $self->next_reading( $curr_reading, - $wit->sigil ); - push( @wit_path, $next_reading ); - $seen_readings{$curr_reading->name} = 1; - $curr_reading = $next_reading; - } + my @wit_path = $self->reading_sequence( $self->start, $end, + $wit->sigil ); $wit->path( \@wit_path ); + + # Detect the common readings. if( @common_readings ) { my @cn; foreach my $n ( @wit_path ) { @@ -362,7 +409,8 @@ sub walk_witness_paths { # Mark all the nodes as either common or not. foreach my $cn ( @common_readings ) { - print STDERR "Setting " . $cn->name . " / " . $cn->label . " as common node\n"; + print STDERR "Setting " . $cn->name . " / " . $cn->label + . " as common node\n"; $cn->make_common; } foreach my $n ( $self->readings() ) { @@ -372,6 +420,46 @@ sub walk_witness_paths { return @common_readings; } +# An alternative to walk_witness_paths, for use when a collation is +# constructed from a base text and an apparatus. Also modifies the +# collation graph to remove all 'base text' paths and replace them +# with real witness paths. + +sub walk_and_expand_base { + my( $self, $end ) = @_; + + foreach my $wit ( @{$self->tradition->witnesses} ) { + my $sig = $wit_sigil; + my $post_sig; + $post_sig = $wit->post_correctione + if $wit->has_post_correctione; + my @wit_path = ( $self->start ); + my @wit_pc_path; + my $curr_rdg = $self->start; + my %seen; + while( $curr_rdg ne $end ) { + if( $seen{$curr_reading->name} ) { + warn "Detected loop in walk_and_expand_base with witness " + . "$sig on reading " . $curr_reading->name . "\n"; + last; + } + my $next_rdg = $self->next_reading( $curr_reading, $sig ); + unless( $self->has_explicit_path( $curr_reading, + $next_reading, $sig ) ) { + $self->add_path( $curr_reading, $next_reading, $sig ); + } + push( @wit_path, $next_reading ); + $seen{$curr_reading->name} = 1; + } + $wit->path( \@wit_path ); + + # Now go through this path and look for p.c. divergences. + # TODO decide how to handle p.c. paths + # BIG TODO handle case where p.c. follows the base and a.c. doesn't! + + +} + sub common_readings { my $self = shift; my @common = grep { $_->is_common } $self->readings(); diff --git a/lib/Text/Tradition/Collation/Reading.pm b/lib/Text/Tradition/Collation/Reading.pm index 971a26b..59c60ca 100644 --- a/lib/Text/Tradition/Collation/Reading.pm +++ b/lib/Text/Tradition/Collation/Reading.pm @@ -52,6 +52,15 @@ sub BUILD { $self->same_as( [ $self ] ); } +sub text { + # Wrapper function around 'label' attribute. + my $self = shift; + if( @_ ) { + $self->set_attribute( 'label', $_[0] ); + } + return $self->get_attribute( 'label' ); +} + sub merge_from { my( $self, $merged_node ) = @_; # Adopt the identity pool of the other node. diff --git a/lib/Text/Tradition/Parser/BaseText.pm b/lib/Text/Tradition/Parser/BaseText.pm index d0f4816..d7d090b 100644 --- a/lib/Text/Tradition/Parser/BaseText.pm +++ b/lib/Text/Tradition/Parser/BaseText.pm @@ -40,12 +40,12 @@ Takes an initialized graph and a set of options, which must include: =cut sub parse { - my( $graph, %opts ) = @_; + my( $tradition, %opts ) = @_; my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'}; load( $format_mod ); my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} ); - merge_base( $graph, $opts{'base'}, @apparatus_entries ); + merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries ); } =item B @@ -76,8 +76,8 @@ underscore in its name. =cut sub merge_base { - my( $graph, $base_file, @app_entries ) = @_; - my @base_line_starts = read_base( $base_file, $graph ); + my( $collation, $base_file, @app_entries ) = @_; + my @base_line_starts = read_base( $base_file, $collation ); my %all_witnesses; foreach my $app ( @app_entries ) { @@ -86,7 +86,7 @@ sub merge_base { # last if $line > 2; # DEBUG for problematic entries my $scrutinize = ""; - my $first_line_node = $base_line_starts[ $line ]; + my $first_line_reading = $base_line_starts[ $line ]; my $too_far = $base_line_starts[ $line+1 ]; my $lemma = $app->{rdg_0}; @@ -98,7 +98,7 @@ sub merge_base { my @lemma_words = split( /\s+/, $lemma ); # Now search for the lemma words within this line. - my $lemma_start = $first_line_node; + my $lemma_start = $first_line_reading; my $lemma_end; my %seen; while( $lemma_start ne $too_far ) { @@ -120,17 +120,19 @@ sub merge_base { if( --$seq < 1 ) { # Now we have to compare the rest of the words here. if( scalar( @lemma_words ) > 1 ) { - my $next_node = $graph->next_word( $lemma_start ); + my $next_reading = + $collation->next_reading( $lemma_start ); foreach my $w ( @lemma_words[1..$#lemma_words] ) { printf STDERR "Now matching %s against %s\n", - cmp_str($next_node), $w + cmp_str($next_reading), $w if "$line.$num" eq $scrutinize; - if( $w ne cmp_str($next_node) ) { + if( $w ne cmp_str($next_reading) ) { $unmatch = 1; last; } else { - $lemma_end = $next_node; - $next_node = $graph->next_word( $lemma_end ); + $lemma_end = $next_reading; + $next_reading = + $collation->next_reading( $lemma_end ); } } } else { @@ -142,30 +144,33 @@ sub merge_base { } last unless ( $unmatch || !defined( $lemma_end ) ); $lemma_end = undef; - $lemma_start = $graph->next_word( $lemma_start ); + $lemma_start = $collation->next_reading( $lemma_start ); } unless( $lemma_end ) { warn "No match found for @lemma_words at $line.$num"; next; } else { - # These are no longer common nodes; unmark them as such. - my @lemma_nodes = $graph->node_sequence( $lemma_start, + # These are no longer common readings; unmark them as such. + my @lemma_readings = $collation->reading_sequence( $lemma_start, $lemma_end ); - map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes; + map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings; } - # Now we have our lemma nodes; we add the variant nodes to the graph. + # Now we have our lemma readings; we add the variant readings + # to the collation. # Keep track of the start and end point of each reading for later - # node collapse. + # reading collapse. my @readings = ( $lemma_start, $lemma_end ); - # For each reading that is not rdg_0, we make a chain of nodes + # For each reading that is not rdg_0, we make a chain of readings # and connect them to the anchor. Edges are named after the mss # that are relevant. foreach my $k ( grep { /^rdg/ } keys( %$app ) ) { next if $k eq 'rdg_0'; # that's the lemma. + # TODO look at the lemma for any p.c. readings, and add + # them explicitly! my @variant = split( /\s+/, $app->{$k} ); @variant = () if $app->{$k} eq '/'; # This is an omission. my @mss = grep { $app->{$_} eq $k } keys( %$app ); @@ -175,124 +180,114 @@ sub merge_base { next; } - # Determine the label name for the edges here. - my $edge_name = join(', ', @mss ); + # Keep track of what witnesses we have seen. @all_witnesses{ @mss } = ( 1 ) x scalar( @mss ); - # Make the variant into a set of nodes. + # Make the variant into a set of readings. my $ctr = 0; - my $last_node = $graph->prior_word( $lemma_start ); + my $last_reading = $collation->prior_reading( $lemma_start ); my $var_start; foreach my $vw ( @variant ) { my $vwname = "$k/$line.$num.$ctr"; $ctr++; - my $vwnode = $graph->add_node( $vwname ); - $vwnode->set_attribute( 'label', $vw ); - $vwnode->set_attribute( 'class', 'variant' ); - $graph->add_edge( $last_node, $vwnode, $edge_name ); - $var_start = $vwnode unless $var_start; - $last_node = $vwnode; + my $vwreading = $collation->add_reading( $vwname ); + $vwreading->text( $vw ); + $vwreading->make_variant(); + foreach ( @mss ) { + $collation->add_path( $last_reading, $vwreading, $_ ); + } + $var_start = $vwreading unless $var_start; + $last_reading = $vwreading; } # Now hook it up at the end. - $graph->add_edge( $last_node, $graph->next_word( $lemma_end ), - $edge_name ); + foreach ( @mss ) { + $collation->add_path( $last_reading, + $collation->next_word( $lemma_end ), + $_ ); + } if( $var_start ) { # if it wasn't an empty reading - push( @readings, $var_start, $last_node ); + push( @readings, $var_start, $last_reading ); } } - # Now collate and collapse the identical nodes within the graph. - collate_variants( $graph, @readings ); + # Now collate and collapse the identical readings within the collation. + collate_variants( $collation, @readings ); } - ## Now in theory I have a graph. I want to make it a little easier to - ## read. So I collapse nodes that have only one edge in and one edge - ## out, and I do this by looking at the edges. - -# foreach my $edge ( $graph->edges() ) { -# my @out_edges = $edge->from()->outgoing(); -# my @in_edges = $edge->to()->incoming(); - -# next if $edge->from() eq $graph->start(); -# next if $edge->to()->name() eq '#END#'; -# next unless scalar( @out_edges ) == 1; -# next unless scalar( @in_edges ) == 1; -# next unless $out_edges[0] eq $in_edges[0]; -# # In theory if we've got this far, we're safe, but just to -# # double-check... -# next unless $out_edges[0] eq $edge; - -# $graph->merge_nodes( $edge->from(), $edge->to(), ' ' ); -# } - - # Now walk the path for each witness, so that we can do the - # position calculations. - my $paths = {}; + # Now make the witness objects foreach my $w ( keys %all_witnesses ) { - my $back = undef; - if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) { - $back = $1; + my $base = _is_post_corr( $w ); + if( $base ) { + my $pctag = substr( $w, length( $base ) ); + my $existing_wit = $collation->tradition->witness( $base ); + unless( $existing_wit ) { + $existing_wit = $collation->tradition->add_witness( $base ); + } + $existing_wit->post_correctione( $pctag ); + } else { + $collation->tradition->add_witness( $w ) + unless $collation->tradition->witness( $w ); } - my @wit_nodes = $graph->node_sequence( $graph->start, - $graph->node( '#END#' ), - $w, $back ); - my @wn_names = map { $_->name() } @wit_nodes; - $paths->{$w} = \@wn_names; } - $DB::single = 1; - my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes(); - $graph->make_positions( \@common_nodes, $paths ); + + # Now walk paths and calculate positions. + my @common_readings = + $collation->walk_and_expand_base( $collation->reading( '#END#' ) ); + $collation->calculate_positions( @common_readings ); } =item B -my @line_beginnings = read_base( 'reference.txt', $graph ); +my @line_beginnings = read_base( 'reference.txt', $collation ); -Takes a text file and a (presumed empty) graph object, adds the words -as simple linear nodes to the graph, and returns a list of nodes that -represent the beginning of lines. This graph is now the starting point -for application of apparatus entries in merge_base, e.g. from a CSV -file or a Classical Text Editor file. +Takes a text file and a (presumed empty) collation object, adds the +words as simple linear readings to the collation, and returns a +list of readings that represent the beginning of lines. This collation +is now the starting point for application of apparatus entries in +merge_base, e.g. from a CSV file or a Classical Text Editor file. =cut sub read_base { - my( $base_file, $graph ) = @_; + my( $base_file, $collation ) = @_; - # This array gives the first node for each line. We put the + # This array gives the first reading for each line. We put the # common starting point in line zero. - my $last_node = $graph->start(); - my $lineref_array = [ $last_node ]; # There is no line zero. + my $last_reading = $collation->start(); + my $lineref_array = [ $last_reading ]; # There is no line zero. open( BASE, $base_file ) or die "Could not open file $base_file: $!"; while() { - # Make the nodes, and connect them up for the base, but also - # save the first node of each line in an array for the purpose. + # Make the readings, and connect them up for the base, but + # also save the first reading of each line in an array for the + # purpose. + # TODO use configurable reading separator chomp; my @words = split; my $started = 0; my $wordref = 0; my $lineref = scalar @$lineref_array; foreach my $w ( @words ) { - my $noderef = join( ',', $lineref, ++$wordref ); - my $node = $graph->add_node( $noderef ); - $node->set_attribute( 'label', $w ); - $node->set_attribute( 'class', 'common' ); + my $readingref = join( ',', $lineref, ++$wordref ); + my $reading = $collation->add_reading( $readingref ); + $reading->text( $w ); + $reading->make_common(); unless( $started ) { - push( @$lineref_array, $node ); + push( @$lineref_array, $reading ); $started = 1; } - if( $last_node ) { - my $edge = $graph->add_edge( $last_node, $node, "base text" ); - $edge->set_attribute( 'class', 'basetext' ); - $last_node = $node; + if( $last_reading ) { + my $path = $collation->add_path( $last_reading, $reading, + "base text" ); + $path->set_attribute( 'class', 'basetext' ); + $last_reading = $reading; } # TODO there should be no else here... } } close BASE; # Ending point for all texts - my $endpoint = $graph->add_node( '#END#' ); - $graph->add_edge( $last_node, $endpoint, "base text" ); + my $endpoint = $collation->add_reading( '#END#' ); + $collation->add_path( $last_reading, $endpoint, "base text" ); push( @$lineref_array, $endpoint ); return( @$lineref_array ); @@ -300,20 +295,21 @@ sub read_base { =item B -collate_variants( $graph, @readings ) +collate_variants( $collation, @readings ) Given a set of readings in the form ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... ) -walks through each to identify those nodes that are identical. The -graph is a Text::Tradition::Graph object; the elements of @readings are -Graph::Easy::Node objects that appear on the graph. +walks through each to identify those readings that are identical. The +collation is a Text::Tradition::Collation object; the elements of +@readings are Text::Tradition::Collation::Reading objects that appear +on the collation graph. TODO: Handle collapsed and non-collapsed transpositions. =cut sub collate_variants { - my( $graph, @readings ) = @_; + my( $collation, @readings ) = @_; my $lemma_start = shift @readings; my $lemma_end = shift @readings; my $detranspose = 0; @@ -321,112 +317,122 @@ sub collate_variants { # We need to calculate positions at this point, which is where # we are getting the implicit information from the apparatus. - # Start the list of distinct nodes with those nodes in the lemma. - my @distinct_nodes; + # Start the list of distinct readings with those readings in the lemma. + my @distinct_readings; my $position = 0; while( $lemma_start ne $lemma_end ) { - push( @distinct_nodes, [ $lemma_start, 'base text', $position++ ] ); - $lemma_start = $graph->next_word( $lemma_start ); + push( @distinct_readings, [ $lemma_start, 'base text', $position++ ] ); + $lemma_start = $collation->next_word( $lemma_start ); } - push( @distinct_nodes, [ $lemma_end, 'base text', $position++ ] ); + push( @distinct_readings, [ $lemma_end, 'base text', $position++ ] ); while( scalar @readings ) { my( $var_start, $var_end ) = splice( @readings, 0, 2 ); - # I want to look at the nodes in the variant and lemma, and - # collapse nodes that are the same word. This is mini-collation. + # I want to look at the readings in the variant and lemma, and + # collapse readings that are the same word. This is mini-collation. # Each word in the 'main' list can only be collapsed once with a # word from the current reading. my %collapsed = (); - # Get the label. There will only be one outgoing edge to start + # Get the label. There will only be one outgoing path to start # with, so this is safe. my @out = $var_start->outgoing(); my $var_label = $out[0]->label(); - my @variant_nodes; + my @variant_readings; while( $var_start ne $var_end ) { - push( @variant_nodes, $var_start ); - $var_start = $graph->next_word( $var_start, $var_label ); + push( @variant_readings, $var_start ); + $var_start = $collation->next_word( $var_start, $var_label ); } - push( @variant_nodes, $var_end ); + push( @variant_readings, $var_end ); - # Go through the variant nodes, and if we find a lemma node that - # hasn't yet been collapsed with a node, equate them. If we do - # not, keep them to push onto the end of all_nodes. - my @remaining_nodes; + # Go through the variant readings, and if we find a lemma reading that + # hasn't yet been collapsed with a reading, equate them. If we do + # not, keep them to push onto the end of all_readings. + my @remaining_readings; my $last_index = 0; my $curr_pos = 0; - foreach my $w ( @variant_nodes ) { + foreach my $w ( @variant_readings ) { my $word = $w->label(); my $matched = 0; - foreach my $idx ( $last_index .. $#distinct_nodes ) { - my( $l, $edgelabel, $pos ) = @{$distinct_nodes[$idx]}; + foreach my $idx ( $last_index .. $#distinct_readings ) { + my( $l, $pathlabel, $pos ) = @{$distinct_readings[$idx]}; if( $word eq cmp_str( $l ) ) { next if exists( $collapsed{ $l->label } ) && $collapsed{ $l->label } eq $l; $matched = 1; $last_index = $idx if $detranspose; - # Collapse the nodes. - printf STDERR "Merging nodes %s/%s and %s/%s\n", + # Collapse the readings. + printf STDERR "Merging readings %s/%s and %s/%s\n", $l->name, $l->label, $w->name, $w->label; - $graph->merge_nodes( $l, $w ); + $collation->merge_readings( $l, $w ); $collapsed{ $l->label } = $l; - # Now collapse any multiple edges to and from the node. - remove_duplicate_edges( $graph, - $graph->prior_word( $l, $edgelabel ), $l ); - remove_duplicate_edges( $graph, $l, - $graph->next_word( $l, $edgelabel ) ); + # Now collapse any multiple paths to and from the reading. + remove_duplicate_paths( $collation, + $collation->prior_word( $l, $pathlabel ), $l ); + remove_duplicate_paths( $collation, $l, + $collation->next_word( $l, $pathlabel ) ); $curr_pos = $pos; last; } } - push( @remaining_nodes, [ $w, $var_label, $curr_pos++ ] ) unless $matched; + push( @remaining_readings, [ $w, $var_label, $curr_pos++ ] ) unless $matched; } - push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes ); + push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings ); } - # Now set the positions of all the nodes in this variation. + # Now set the positions of all the readings in this variation. #$DB::single = 1; - print STDERR "Nodes and their positions are:\n"; - foreach my $n ( @distinct_nodes ) { + print STDERR "Readings and their positions are:\n"; + foreach my $n ( @distinct_readings ) { printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2]; } } -=item B +=item B -remove_duplicate_edges( $graph, $from, $to ); +remove_duplicate_paths( $collation, $from, $to ); -Given two nodes, reduce the number of edges between those nodes to -one. If neither edge represents a base text, combine their labels. +Given two readings, reduce the number of paths between those readings to +one. If neither path represents a base text, combine their labels. =cut -sub remove_duplicate_edges { - my( $graph, $from, $to ) = @_; - my @edges = $from->edges_to( $to ); - if( scalar @edges > 1 ) { - my @base = grep { $_->label eq 'base text' } @edges; +sub remove_duplicate_paths { + my( $collation, $from, $to ) = @_; + my @paths = $from->paths_to( $to ); + if( scalar @paths > 1 ) { + my @base = grep { $_->label eq 'base text' } @paths; if ( scalar @base ) { - # Remove the edges that are not base. - foreach my $e ( @edges ) { - $graph->del_edge( $e ) + # Remove the paths that are not base. + foreach my $e ( @paths ) { + $collation->del_path( $e ) unless $e eq $base[0]; } } else { - # Combine the edges into one. - my $new_edge_name = join( ', ', map { $_->label() } @edges ); - my $new_edge = shift @edges; - $new_edge->set_attribute( 'label', $new_edge_name ); - foreach my $e ( @edges ) { - $graph->del_edge( $e ); + # Combine the paths into one. + my $new_path_name = join( ', ', map { $_->label() } @paths ); + my $new_path = shift @paths; + $new_path->set_attribute( 'label', $new_path_name ); + foreach my $e ( @paths ) { + $collation->del_path( $e ); } } } } +# Helper function. Given a witness sigil, if it is a post-correctione +# sigil,return the base witness. If not, return a false value. +sub _is_post_corr { + my( $sigil ) = @_; + if( $sigil =~ /^(.*?)(\s*\(p\.\s*c\.\))$/ ) { + return $1; + } + return undef; +} + =item B Pretend you never saw this method. Really it needs to not be hardcoded. @@ -434,8 +440,8 @@ Pretend you never saw this method. Really it needs to not be hardcoded. =cut sub cmp_str { - my( $node ) = @_; - my $word = $node->label(); + my( $reading ) = @_; + my $word = $reading->label(); $word = lc( $word ); $word =~ s/\W//g; $word =~ s/v/u/g; diff --git a/lib/Text/Tradition/Parser/GraphML.pm b/lib/Text/Tradition/Parser/GraphML.pm index 5c6244c..4e59b7e 100644 --- a/lib/Text/Tradition/Parser/GraphML.pm +++ b/lib/Text/Tradition/Parser/GraphML.pm @@ -79,10 +79,10 @@ sub parse { my @nodes = $xpc->findnodes( '//g:node' ); foreach my $n ( @nodes ) { my $id = _lookup_node_data( $n, 'number' ); - my $label = _lookup_node_data( $n, 'token' ); + my $token = _lookup_node_data( $n, 'token' ); my $gnode = $collation->add_reading( $id ); $node_name{ $n->getAttribute('id') } = $id; - $gnode->set_attribute( 'label', $label ); + $gnode->text( $token ); # Now get the rest of the data, i.e. not the ID or label my $extra = {}; @@ -101,8 +101,13 @@ sub parse { # Label according to the witnesses present. my @wit_ids = $xpc->findnodes( './g:data/attribute::key', $e ); my @wit_names = map { $witnesses{ $_->getValue() } } @wit_ids; - my $label = $collation->path_label( @wit_names ); - $collation->add_path( $from, $to, $label ); + # One path per witness + foreach( @wit_names ) { + $collation->add_path( $from, $to, $_ ); + } + # Only a single path between two readings + # my $label = $collation->path_label( @wit_names ); + # $collation->add_path( $from, $to, $label ); } ## Reverse the node_name hash so that we have two-way lookup. diff --git a/lib/Text/Tradition/Witness.pm b/lib/Text/Tradition/Witness.pm index cb461c5..d48a350 100644 --- a/lib/Text/Tradition/Witness.pm +++ b/lib/Text/Tradition/Witness.pm @@ -31,6 +31,13 @@ has 'path' => ( predicate => 'has_path', ); +has 'post_correctione' => ( + is => 'rw', + isa => 'Str', + predicate => 'has_post_correctione', + ); + + sub BUILD { my $self = shift; if( $self->has_source ) { diff --git a/script/svg_from_csv.pl b/script/svg_from_csv.pl index c469bbe..40fa7f5 100644 --- a/script/svg_from_csv.pl +++ b/script/svg_from_csv.pl @@ -3,9 +3,9 @@ use lib 'lib'; use strict; use warnings; -use Text::Tradition::Graph; +use Text::Tradition; -my $collation_graph = Text::Tradition::Graph->new( +my $collation_graph = Text::Tradition->new( 'CSV' => $ARGV[0], 'base' => $ARGV[1], );