lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Exporter 'import';
   6 use vars qw( @EXPORT_OK );
   7 @EXPORT_OK = qw( merge_base );
   8
   9 =head1 NAME
  10
  11 Text::Tradition::Parser::BaseText
  12
  13 =head1 SYNOPSIS
  14
  15 use Text::Tradition::Parser::BaseText qw( merge_base );
  16 merge_base( $graph, 'reference.txt', @apparatus_entries )
  17
  18 =head1 DESCRIPTION
  19
  20 For an overview of the package, see the documentation for the
  21 Text::Tradition::Graph module.
  22
  23 This module is meant for use with certain of the other Parser classes
  24 - whenever a list of variants is given with reference to a base text,
  25 these must be joined into a single collation.  The parser should
  26 therefore make a list of variants and their locations, and BaseText
  27 will join those listed variants onto the reference text.
  28
  29 =head1 SUBROUTINES
  30
  31 =over
  32
  33 =item B<merge_base>
  34
  35 merge_base( $graph, 'reference.txt', @apparatus_entries )
  36
  37 Takes three arguments: a newly-initialized Text::Tradition::Graph
  38 object, a text file containing the reference text, and a list of
  39 variants (apparatus entries).  Adds the base text to the graph, and
  40 joins the variants to that.
  41
  42 The list of variants is an array of hash references; each hash takes
  43 the form
  44  { '_id' => line reference,
  45    'rdg_0' => lemma reading,
  46    'rdg_1' => first variant,
  47    ...  # and so on until all distinct readings are listed
  48    'WitnessA' => 'rdg_0',
  49    'WitnessB' => 'rdg_1',
  50    ...  # and so on until all witnesses are listed with their readings
  51  }
  52
  53 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  54 begin with an underscore is assumed to be a witness name.  Any 'meta'
  55 information to be passed must be passed in a key with a leading
  56 underscore in its name.
  57
  58 =cut
  59
  60 sub merge_base {
  61     my( $graph, $base_file, @app_entries ) = @_;
  62     my @base_line_starts = read_base( $base_file, $graph );
  63
  64     foreach my $app ( @app_entries ) {
  65         my( $line, $num ) = split( /\./, $app->{_id} );
  66         # DEBUG with a short graph
  67         # last if $line > 2;
  68         # DEBUG for problematic entries
  69         my $scrutinize = "";
  70         my $first_line_node = $base_line_starts[ $line ];
  71         my $too_far = $base_line_starts[ $line+1 ];
  72
  73         my $lemma = $app->{rdg_0};
  74         my $seq = 1;
  75         # Is this the Nth occurrence of this reading in the line?
  76         if( $lemma =~ s/(_)?(\d)$// ) {
  77             $seq = $2;
  78         }
  79         my @lemma_words = split( /\s+/, $lemma );
  80
  81         # Now search for the lemma words within this line.
  82         my $lemma_start = $first_line_node;
  83         my $lemma_end;
  84         my %seen;
  85         while( $lemma_start ne $too_far ) {
  86             # Loop detection
  87             if( $seen{ $lemma_start->name() } ) {
  88                 warn "Detected loop at " . $lemma_start->name() .
  89                     ", ref $line,$num";
  90                 last;
  91             }
  92             $seen{ $lemma_start->name() } = 1;
  93
  94             # Try to match the lemma.
  95             my $unmatch = 0;
  96             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
  97                 $lemma_words[0] . "...\n"
  98                 if "$line.$num" eq $scrutinize;
  99             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 100                 # Skip it if we need a match that is not the first.
 101                 if( --$seq < 1 ) {
 102                     # Now we have to compare the rest of the words here.
 103                     if( scalar( @lemma_words ) > 1 ) {
 104                         my $next_node = $graph->next_word( $lemma_start );
 105                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 106                             printf STDERR "Now matching %s against %s\n",
 107                                     cmp_str($next_node), $w
 108                                 if "$line.$num" eq $scrutinize;
 109                             if( $w ne cmp_str($next_node) ) {
 110                                 $unmatch = 1;
 111                                 last;
 112                             } else {
 113                                 $lemma_end = $next_node;
 114                                 $next_node = $graph->next_word( $lemma_end );
 115                             }
 116                         }
 117                     } else {
 118                         $lemma_end = $lemma_start;
 119                     }
 120                 } else {
 121                     $unmatch = 1;
 122                 }
 123             }
 124             last unless ( $unmatch || !defined( $lemma_end ) );
 125             $lemma_end = undef;
 126             $lemma_start = $graph->next_word( $lemma_start );
 127         }
 128
 129         unless( $lemma_end ) {
 130             warn "No match found for @lemma_words at $line.$num";
 131             next;
 132         } else {
 133             # These are no longer common nodes; unmark them as such.
 134             my @lemma_nodes = $graph->node_sequence( $lemma_start,
 135                                                      $lemma_end );
 136             map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
 137         }
 138
 139         # Now we have our lemma nodes; we add the variant nodes to the graph.
 140
 141         # Keep track of the start and end point of each reading for later
 142         # node collapse.
 143         my @readings = ( $lemma_start, $lemma_end );
 144
 145         # For each reading that is not rdg_0, we make a chain of nodes
 146         # and connect them to the anchor.  Edges are named after the mss
 147         # that are relevant.
 148         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 149             next if $k eq 'rdg_0'; # that's the lemma.
 150             my @variant = split( /\s+/, $app->{$k} );
 151             @variant = () if $app->{$k} eq '/'; # This is an omission.
 152             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 153
 154             unless( @mss ) {
 155                 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
 156                 next;
 157             }
 158
 159             # Determine the label name for the edges here.
 160             my $edge_name = join(', ', @mss );
 161
 162             # Make the variant into a set of nodes.
 163             my $ctr = 0;
 164             my $last_node = $graph->prior_word( $lemma_start );
 165             my $var_start;
 166             foreach my $vw ( @variant ) {
 167                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 168                 my $vwnode = $graph->add_node( $vwname );
 169                 $vwnode->set_attribute( 'label', $vw );
 170                 $vwnode->set_attribute( 'class', 'variant' );
 171                 $graph->add_edge( $last_node, $vwnode, $edge_name );
 172                 $var_start = $vwnode unless $var_start;
 173                 $last_node = $vwnode;
 174             }
 175             # Now hook it up at the end.
 176             $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
 177                                         $edge_name );
 178
 179             if( $var_start ) { # if it wasn't an empty reading
 180                 push( @readings, $var_start, $last_node );
 181             }
 182         }
 183
 184         # Now collate and collapse the identical nodes within the graph.
 185         collate_variants( $graph, @readings );
 186     }
 187
 188     ## Now in theory I have a graph.  I want to make it a little easier to
 189     ## read.  So I collapse nodes that have only one edge in and one edge
 190     ## out, and I do this by looking at the edges.
 191
 192     foreach my $edge ( $graph->edges() ) {
 193         my @out_edges = $edge->from()->outgoing();
 194         my @in_edges = $edge->to()->incoming();
 195
 196         next unless scalar( @out_edges ) == 1;
 197         next unless scalar( @in_edges ) == 1;
 198         next unless $out_edges[0] eq $in_edges[0];
 199         # In theory if we've got this far, we're safe, but just to
 200         # double-check...
 201         next unless $out_edges[0] eq $edge;
 202
 203         $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
 204     }
 205 }
 206
 207 =item B<read_base>
 208
 209 my @line_beginnings = read_base( 'reference.txt', $graph );
 210
 211 Takes a text file and a (presumed empty) graph object, adds the words
 212 as simple linear nodes to the graph, and returns a list of nodes that
 213 represent the beginning of lines. This graph is now the starting point
 214 for application of apparatus entries in merge_base, e.g. from a CSV
 215 file or a Classical Text Editor file.
 216
 217 =cut
 218
 219 sub read_base {
 220     my( $base_file, $graph ) = @_;
 221
 222     # This array gives the first node for each line.  We put the
 223     # common starting point in line zero.
 224     my $last_node = $graph->start();
 225     my $lineref_array = [ $last_node ]; # There is no line zero.
 226
 227     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 228     while(<BASE>) {
 229         # Make the nodes, and connect them up for the base, but also
 230         # save the first node of each line in an array for the purpose.
 231         chomp;
 232         my @words = split;
 233         my $started = 0;
 234         my $wordref = 0;
 235         my $lineref = scalar @$lineref_array;
 236         foreach my $w ( @words ) {
 237             my $noderef = join( ',', $lineref, ++$wordref );
 238             my $node = $graph->add_node( $noderef );
 239             $node->set_attribute( 'label', $w );
 240             $node->set_attribute( 'class', 'common' );
 241             unless( $started ) {
 242                 push( @$lineref_array, $node );
 243                 $started = 1;
 244             }
 245             if( $last_node ) {
 246                 my $edge = $graph->add_edge( $last_node, $node, "base text" );
 247                 $edge->set_attribute( 'class', 'basetext' );
 248                 $last_node = $node;
 249             } # TODO there should be no else here...
 250         }
 251     }
 252     close BASE;
 253     # Ending point for all texts
 254     my $endpoint = $graph->add_node( '#END#' );
 255     $graph->add_edge( $last_node, $endpoint, "base text" );
 256     push( @$lineref_array, $endpoint );
 257
 258     return( @$lineref_array );
 259 }
 260
 261 =item B<collate_variants>
 262
 263 collate_variants( $graph, @readings )
 264
 265 Given a set of readings in the form
 266 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 267 walks through each to identify those nodes that are identical.  The
 268 graph is a Text::Tradition::Graph object; the elements of @readings are
 269 Graph::Easy::Node objects that appear on the graph.
 270
 271 TODO: Handle collapsed and non-collapsed transpositions.
 272
 273 =cut
 274
 275 sub collate_variants {
 276     my( $graph, @readings ) = @_;
 277     my $lemma_start = shift @readings;
 278     my $lemma_end = shift @readings;
 279     my $detranspose = 1;
 280
 281     # Start the list of distinct nodes with those nodes in the lemma.
 282     my @distinct_nodes;
 283     while( $lemma_start ne $lemma_end ) {
 284         push( @distinct_nodes, [ $lemma_start, 'base text' ] );
 285         $lemma_start = $graph->next_word( $lemma_start );
 286     }
 287     push( @distinct_nodes, [ $lemma_end, 'base text' ] );
 288
 289
 290     while( scalar @readings ) {
 291         my( $var_start, $var_end ) = splice( @readings, 0, 2 );
 292
 293         # I want to look at the nodes in the variant and lemma, and
 294         # collapse nodes that are the same word.  This is mini-collation.
 295         # Each word in the 'main' list can only be collapsed once with a
 296         # word from the current reading.
 297         my %collapsed = ();
 298
 299         # Get the label. There will only be one outgoing edge to start
 300         # with, so this is safe.
 301         my @out = $var_start->outgoing();
 302         my $var_label = $out[0]->label();
 303
 304         my @variant_nodes;
 305         while( $var_start ne $var_end ) {
 306             push( @variant_nodes, $var_start );
 307             $var_start = $graph->next_word( $var_start, $var_label );
 308         }
 309         push( @variant_nodes, $var_end );
 310
 311         # Go through the variant nodes, and if we find a lemma node that
 312         # hasn't yet been collapsed with a node, equate them.  If we do
 313         # not, keep them to push onto the end of all_nodes.
 314         my @remaining_nodes;
 315         my $last_index = 0;
 316         foreach my $w ( @variant_nodes ) {
 317             my $word = $w->label();
 318             my $matched = 0;
 319             foreach my $idx ( $last_index .. $#distinct_nodes ) {
 320                 my( $l, $edgelabel ) = @{$distinct_nodes[$idx]};
 321                 if( $word eq cmp_str( $l ) ) {
 322                     next if exists( $collapsed{ $l->label } )
 323                         && $collapsed{ $l->label } eq $l;
 324                     $matched = 1;
 325                     $last_index = $idx if $detranspose;
 326                     # Collapse the nodes.
 327                     printf STDERR "Merging nodes %s/%s and %s/%s\n",
 328                         $l->name, $l->label, $w->name, $w->label;
 329                     $graph->merge_nodes( $l, $w );
 330                     $collapsed{ $l->label } = $l;
 331                     # Now collapse any multiple edges to and from the node.
 332                     remove_duplicate_edges( $graph,
 333                                     $graph->prior_word( $l, $edgelabel ), $l );
 334                     remove_duplicate_edges( $graph, $l,
 335                                     $graph->next_word( $l, $edgelabel ) );
 336                 }
 337             }
 338             push( @remaining_nodes, [ $w, $var_label ] ) unless $matched;
 339         }
 340         push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
 341     }
 342 }
 343
 344 =item B<remove_duplicate_edges>
 345
 346 remove_duplicate_edges( $graph, $from, $to );
 347
 348 Given two nodes, reduce the number of edges between those nodes to
 349 one.  If neither edge represents a base text, combine their labels.
 350
 351 =cut
 352
 353 sub remove_duplicate_edges {
 354     my( $graph, $from, $to ) = @_;
 355     my @edges = $from->edges_to( $to );
 356     if( scalar @edges > 1 ) {
 357         my @base = grep { $_->label eq 'base text' } @edges;
 358         if ( scalar @base ) {
 359             # Remove the edges that are not base.
 360             foreach my $e ( @edges ) {
 361                 $graph->del_edge( $e )
 362                     unless $e eq $base[0];
 363             }
 364         } else {
 365             # Combine the edges into one.
 366             my $new_edge_name = join( ', ', map { $_->label() } @edges );
 367             my $new_edge = shift @edges;
 368             $new_edge->set_attribute( 'label', $new_edge_name );
 369             foreach my $e ( @edges ) {
 370                 $graph->del_edge( $e );
 371             }
 372         }
 373     }
 374 }
 375
 376 =item B<cmp_str>
 377
 378 Pretend you never saw this method.  Really it needs to not be hardcoded.
 379
 380 =cut
 381
 382 sub cmp_str {
 383     my( $node ) = @_;
 384     my $word = $node->label();
 385     $word = lc( $word );
 386     $word =~ s/\W//g;
 387     $word =~ s/v/u/g;
 388     $word =~ s/j/i/g;
 389     $word =~ s/cha/ca/g;
 390     $word =~ s/quatuor/quattuor/g;
 391     $word =~ s/ioannes/iohannes/g;
 392     return $word;
 393 }
 394
 395 =back
 396
 397 =head1 LICENSE
 398
 399 This package is free software and is provided "as is" without express
 400 or implied warranty.  You can redistribute it and/or modify it under
 401 the same terms as Perl itself.
 402
 403 =head1 AUTHOR
 404
 405 Tara L Andrews, aurum@cpan.org
 406
 407 =cut
 408
 409 1;