lib/Traditions/Parser/BaseText.pm

   1 package Traditions::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Exporter 'import';
   6 use vars qw( @EXPORT_OK );
   7 @EXPORT_OK = qw( merge_base );
   8
   9 sub merge_base {
  10     my( $graph, $base_file, @app_entries ) = @_;
  11     my @base_line_starts = read_base( $base_file, $graph );
  12
  13     foreach my $app ( @app_entries ) {
  14         my( $line, $num ) = split( /\./, $app->{_id} );
  15         # DEBUG with a short graph
  16         # last if $line > 2;
  17         my $scrutinize = "21.8";
  18         my $first_line_node = $base_line_starts[ $line ];
  19         my $too_far = $base_line_starts[ $line+1 ];
  20
  21         my $lemma = $app->{rdg_0};
  22         my $seq = 1;
  23         # Is this the Nth occurrence of this reading in the line?
  24         if( $lemma =~ s/(_)?(\d)$// ) {
  25             $seq = $2;
  26         }
  27         my @lemma_words = split( /\s+/, $lemma );
  28
  29         # Now search for the lemma words within this line.
  30         my $lemma_start = $first_line_node;
  31         my $lemma_end;
  32         my %seen;
  33         while( $lemma_start ne $too_far ) {
  34             # Loop detection
  35             if( $seen{ $lemma_start->name() } ) {
  36                 warn "Detected loop at " . $lemma_start->name() .
  37                     ", ref $line,$num";
  38                 last;
  39             }
  40             $seen{ $lemma_start->name() } = 1;
  41
  42             # Try to match the lemma.
  43             my $unmatch = 0;
  44             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
  45                 $lemma_words[0] . "...\n"
  46                 if "$line.$num" eq $scrutinize;
  47             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
  48                 # Skip it if we need a match that is not the first.
  49                 if( --$seq < 1 ) {
  50                     # Now we have to compare the rest of the words here.
  51                     if( scalar( @lemma_words ) > 1 ) {
  52                         my $next_node = $graph->next_word( $lemma_start );
  53                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
  54                             printf STDERR "Now matching %s against %s\n",
  55                                     cmp_str($next_node), $w
  56                                 if "$line.$num" eq $scrutinize;
  57                             if( $w ne cmp_str($next_node) ) {
  58                                 $unmatch = 1;
  59                                 last;
  60                             } else {
  61                                 $lemma_end = $next_node;
  62                                 $next_node = $graph->next_word( $lemma_end );
  63                             }
  64                         }
  65                     } else {
  66                         $lemma_end = $lemma_start;
  67                     }
  68                 } else {
  69                     $unmatch = 1;
  70                 }
  71             }
  72             last unless ( $unmatch || !defined( $lemma_end ) );
  73             $lemma_end = undef;
  74             $lemma_start = $graph->next_word( $lemma_start );
  75         }
  76
  77         unless( $lemma_end ) {
  78             warn "No match found for @lemma_words at $line.$num";
  79             next;
  80         } else {
  81             # These are no longer common nodes; unmark them as such.
  82             my @lemma_nodes = $graph->node_sequence( $lemma_start,
  83                                                      $lemma_end );
  84             map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
  85         }
  86
  87         # Now we have our lemma nodes; we add the variant nodes to the graph.
  88
  89         # For each reading that is not rdg_0, we make a chain of nodes
  90         # and connect them to the anchor.  Edges are named after the mss
  91         # that are relevant.
  92         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
  93             next if $k eq 'rdg_0'; # that's the lemma.
  94             my @variant = split( /\s+/, $app->{$k} );
  95             @variant = () if $app->{$k} eq '/'; # This is an omission.
  96             my @mss = grep { $app->{$_} eq $k } keys( %$app );
  97
  98             unless( @mss ) {
  99                 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
 100                 next;
 101             }
 102
 103             # Determine the label name for the edges here.
 104             my $edge_name = join(', ', @mss );
 105
 106             # Make the variant into a set of nodes.
 107             my $ctr = 0;
 108             my $last_node = $graph->prior_word( $lemma_start );
 109             my $var_start;
 110             foreach my $vw ( @variant ) {
 111                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 112                 my $vwnode = $graph->add_node( $vwname );
 113                 $vwnode->set_attribute( 'label', $vw );
 114                 $vwnode->set_attribute( 'class', 'variant' );
 115                 $graph->add_edge( $last_node, $vwnode, $edge_name );
 116                 $var_start = $vwnode unless $var_start;
 117                 $last_node = $vwnode;
 118             }
 119             # Now hook it up at the end.
 120             $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
 121                                         $edge_name );
 122
 123             # Now collate and collapse the identical nodes within the graph.
 124             collate_variant( $graph, $lemma_start, $lemma_end,
 125                              $var_start, $last_node );
 126
 127         }
 128     }
 129
 130     ## Now in theory I have a graph.  I want to make it a little easier to
 131     ## read.  So I collapse nodes that have only one edge in and one edge
 132     ## out, and I do this by looking at the edges.
 133
 134     foreach my $edge ( $graph->edges() ) {
 135         my @out_edges = $edge->from()->outgoing();
 136         my @in_edges = $edge->to()->incoming();
 137
 138         next unless scalar( @out_edges ) == 1;
 139         next unless scalar( @in_edges ) == 1;
 140         next unless $out_edges[0] eq $in_edges[0];
 141         # In theory if we've got this far, we're safe, but just to
 142         # double-check...
 143         next unless $out_edges[0] eq $edge;
 144
 145         $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
 146     }
 147 }
 148
 149 # read_base: Takes a text file and a (presumed empty) graph object,
 150 # adds the words as simple linear nodes to the graph, and returns a
 151 # list of nodes that represent the beginning of lines. This graph is
 152 # now the starting point for application of apparatus entries in
 153 # merge_base, e.g. from a CSV file or a CTE file.
 154
 155 sub read_base {
 156     my( $base_file, $graph ) = @_;
 157
 158     # This array gives the first node for each line.  We put the
 159     # common starting point in line zero.
 160     my $last_node = $graph->start();
 161     my $lineref_array = [ $last_node ]; # There is no line zero.
 162
 163     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 164     while(<BASE>) {
 165         # Make the nodes, and connect them up for the base, but also
 166         # save the first node of each line in an array for the purpose.
 167         chomp;
 168         my @words = split;
 169         my $started = 0;
 170         my $wordref = 0;
 171         my $lineref = scalar @$lineref_array;
 172         foreach my $w ( @words ) {
 173             my $noderef = join( ',', $lineref, ++$wordref );
 174             my $node = $graph->add_node( $noderef );
 175             $node->set_attribute( 'label', $w );
 176             $node->set_attribute( 'class', 'common' );
 177             unless( $started ) {
 178                 push( @$lineref_array, $node );
 179                 $started = 1;
 180             }
 181             if( $last_node ) {
 182                 $graph->add_edge( $last_node, $node, "base text" );
 183                 $last_node = $node;
 184             } # TODO there should be no else here...
 185         }
 186     }
 187     close BASE;
 188     # Ending point for all texts
 189     my $endpoint = $graph->add_node( '#END#' );
 190     $graph->add_edge( $last_node, $endpoint, "base text" );
 191     push( @$lineref_array, $endpoint );
 192
 193     return( @$lineref_array );
 194 }
 195
 196
 197 ## Helper methods for merge_base
 198
 199 sub collate_variant {
 200     my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
 201     # If var_start is undef, then the variant is an omission and
 202     # there's nothing to collate. Return.
 203     return unless $var_start;
 204
 205     # I want to look at the nodes in the variant and lemma, and
 206     # collapse nodes that are the same word.  This is mini-collation.
 207     my %collapsed = ();
 208     # There will only be one outgoing edge at first, so this is safe.
 209     my @out = $var_start->outgoing();
 210     my $var_label = $out[0]->label();
 211
 212     my @lemma_nodes;
 213     while( $lemma_start ne $lemma_end ) {
 214         push( @lemma_nodes, $lemma_start );
 215         $lemma_start = $graph->next_word( $lemma_start );
 216     }
 217     push( @lemma_nodes, $lemma_end );
 218
 219     my @variant_nodes;
 220     while( $var_start ne $var_end ) {
 221         push( @variant_nodes, $var_start );
 222         $var_start = $graph->next_word( $var_start, $var_label );
 223     }
 224     push( @variant_nodes, $var_end );
 225
 226     # Go through the variant nodes, and if we find a lemma node that
 227     # hasn't yet been collapsed with a node, equate them.
 228
 229     foreach my $w ( @variant_nodes ) {
 230         my $word = $w->label();
 231         foreach my $l ( @lemma_nodes ) {
 232             if( $word eq cmp_str( $l ) ) {
 233                 next if exists( $collapsed{ $l->label } )
 234                     && $collapsed{ $l->label } eq $l;
 235                 # Collapse the nodes.
 236                 printf STDERR "Merging nodes %s/%s and %s/%s\n",
 237                     $l->name, $l->label, $w->name, $w->label;
 238                 $graph->merge_nodes( $l, $w );
 239                 $collapsed{ $l->label } = $l;
 240                 # Now collapse any multiple edges to and from the node.
 241                 # Rely on the presence of the 'base text' edge.
 242                 remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
 243                 remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
 244             }
 245         }
 246     }
 247 }
 248
 249 sub remove_duplicate_edges {
 250     my( $graph, $from, $to ) = @_;
 251     my @edges = $from->edges_to( $to );
 252     if( scalar @edges > 1 ) {
 253         my @base = grep { $_->label eq 'base text' } @edges;
 254         if ( scalar @base ) {
 255             # Remove the edges that are not base.
 256             foreach my $e ( @edges ) {
 257                 $graph->del_edge( $e )
 258                     unless $e eq $base[0];
 259             }
 260         } else {
 261             # Combine the edges into one.
 262             my $new_edge_name = join( ', ', map { $_->label() } @edges );
 263             my $new_edge = shift @edges;
 264             $new_edge->set_attribute( 'label', $new_edge_name );
 265             foreach my $e ( @edges ) {
 266                 $graph->del_edge( $e );
 267             }
 268         }
 269     }
 270 }
 271
 272 # TODO need to make this configurable!
 273 sub cmp_str {
 274     my( $node ) = @_;
 275     my $word = $node->label();
 276     $word = lc( $word );
 277     $word =~ s/\W//g;
 278     $word =~ s/v/u/g;
 279     $word =~ s/j/i/g;
 280     $word =~ s/cha/ca/g;
 281     $word =~ s/quatuor/quattuor/g;
 282     $word =~ s/ioannes/iohannes/g;
 283     return $word;
 284 }
 285
 286 1;