lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6
   7 =head1 NAME
   8
   9 Text::Tradition::Parser::BaseText
  10
  11 =head1 SYNOPSIS
  12
  13 use Text::Tradition::Parser::BaseText qw( merge_base );
  14 merge_base( $graph, 'reference.txt', @apparatus_entries )
  15
  16 =head1 DESCRIPTION
  17
  18 For an overview of the package, see the documentation for the
  19 Text::Tradition::Graph module.
  20
  21 This module is meant for use with certain of the other Parser classes
  22 - whenever a list of variants is given with reference to a base text,
  23 these must be joined into a single collation.  The parser should
  24 therefore make a list of variants and their locations, and BaseText
  25 will join those listed variants onto the reference text.
  26
  27 =head1 SUBROUTINES
  28
  29 =over
  30
  31 =item B<parse>
  32
  33 parse( $graph, %opts );
  34
  35 Takes an initialized graph and a set of options, which must include:
  36 - 'base' - the base text referenced by the variants
  37 - 'format' - the format of the variant list
  38 - 'data' - the variants, in the given format.
  39
  40 =cut
  41
  42 sub parse {
  43     my( $graph, %opts ) = @_;
  44
  45     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  46     load( $format_mod );
  47     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  48     merge_base( $graph, $opts{'base'}, @apparatus_entries );
  49 }
  50
  51 =item B<merge_base>
  52
  53 merge_base( $graph, 'reference.txt', @apparatus_entries )
  54
  55 Takes three arguments: a newly-initialized Text::Tradition::Graph
  56 object, a text file containing the reference text, and a list of
  57 variants (apparatus entries).  Adds the base text to the graph, and
  58 joins the variants to that.
  59
  60 The list of variants is an array of hash references; each hash takes
  61 the form
  62  { '_id' => line reference,
  63    'rdg_0' => lemma reading,
  64    'rdg_1' => first variant,
  65    ...  # and so on until all distinct readings are listed
  66    'WitnessA' => 'rdg_0',
  67    'WitnessB' => 'rdg_1',
  68    ...  # and so on until all witnesses are listed with their readings
  69  }
  70
  71 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  72 begin with an underscore is assumed to be a witness name.  Any 'meta'
  73 information to be passed must be passed in a key with a leading
  74 underscore in its name.
  75
  76 =cut
  77
  78 sub merge_base {
  79     my( $graph, $base_file, @app_entries ) = @_;
  80     my @base_line_starts = read_base( $base_file, $graph );
  81
  82     my %all_witnesses;
  83     foreach my $app ( @app_entries ) {
  84         my( $line, $num ) = split( /\./, $app->{_id} );
  85         # DEBUG with a short graph
  86         # last if $line > 2;
  87         # DEBUG for problematic entries
  88         my $scrutinize = "";
  89         my $first_line_node = $base_line_starts[ $line ];
  90         my $too_far = $base_line_starts[ $line+1 ];
  91
  92         my $lemma = $app->{rdg_0};
  93         my $seq = 1;
  94         # Is this the Nth occurrence of this reading in the line?
  95         if( $lemma =~ s/(_)?(\d)$// ) {
  96             $seq = $2;
  97         }
  98         my @lemma_words = split( /\s+/, $lemma );
  99
 100         # Now search for the lemma words within this line.
 101         my $lemma_start = $first_line_node;
 102         my $lemma_end;
 103         my %seen;
 104         while( $lemma_start ne $too_far ) {
 105             # Loop detection
 106             if( $seen{ $lemma_start->name() } ) {
 107                 warn "Detected loop at " . $lemma_start->name() .
 108                     ", ref $line,$num";
 109                 last;
 110             }
 111             $seen{ $lemma_start->name() } = 1;
 112
 113             # Try to match the lemma.
 114             my $unmatch = 0;
 115             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 116                 $lemma_words[0] . "...\n"
 117                 if "$line.$num" eq $scrutinize;
 118             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 119                 # Skip it if we need a match that is not the first.
 120                 if( --$seq < 1 ) {
 121                     # Now we have to compare the rest of the words here.
 122                     if( scalar( @lemma_words ) > 1 ) {
 123                         my $next_node = $graph->next_word( $lemma_start );
 124                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 125                             printf STDERR "Now matching %s against %s\n",
 126                                     cmp_str($next_node), $w
 127                                 if "$line.$num" eq $scrutinize;
 128                             if( $w ne cmp_str($next_node) ) {
 129                                 $unmatch = 1;
 130                                 last;
 131                             } else {
 132                                 $lemma_end = $next_node;
 133                                 $next_node = $graph->next_word( $lemma_end );
 134                             }
 135                         }
 136                     } else {
 137                         $lemma_end = $lemma_start;
 138                     }
 139                 } else {
 140                     $unmatch = 1;
 141                 }
 142             }
 143             last unless ( $unmatch || !defined( $lemma_end ) );
 144             $lemma_end = undef;
 145             $lemma_start = $graph->next_word( $lemma_start );
 146         }
 147
 148         unless( $lemma_end ) {
 149             warn "No match found for @lemma_words at $line.$num";
 150             next;
 151         } else {
 152             # These are no longer common nodes; unmark them as such.
 153             my @lemma_nodes = $graph->node_sequence( $lemma_start,
 154                                                      $lemma_end );
 155             map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
 156         }
 157
 158         # Now we have our lemma nodes; we add the variant nodes to the graph.
 159
 160         # Keep track of the start and end point of each reading for later
 161         # node collapse.
 162         my @readings = ( $lemma_start, $lemma_end );
 163
 164         # For each reading that is not rdg_0, we make a chain of nodes
 165         # and connect them to the anchor.  Edges are named after the mss
 166         # that are relevant.
 167         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 168             next if $k eq 'rdg_0'; # that's the lemma.
 169             my @variant = split( /\s+/, $app->{$k} );
 170             @variant = () if $app->{$k} eq '/'; # This is an omission.
 171             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 172
 173             unless( @mss ) {
 174                 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
 175                 next;
 176             }
 177
 178             # Determine the label name for the edges here.
 179             my $edge_name = join(', ', @mss );
 180             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 181
 182             # Make the variant into a set of nodes.
 183             my $ctr = 0;
 184             my $last_node = $graph->prior_word( $lemma_start );
 185             my $var_start;
 186             foreach my $vw ( @variant ) {
 187                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 188                 my $vwnode = $graph->add_node( $vwname );
 189                 $vwnode->set_attribute( 'label', $vw );
 190                 $vwnode->set_attribute( 'class', 'variant' );
 191                 $graph->add_edge( $last_node, $vwnode, $edge_name );
 192                 $var_start = $vwnode unless $var_start;
 193                 $last_node = $vwnode;
 194             }
 195             # Now hook it up at the end.
 196             $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
 197                                         $edge_name );
 198
 199             if( $var_start ) { # if it wasn't an empty reading
 200                 push( @readings, $var_start, $last_node );
 201             }
 202         }
 203
 204         # Now collate and collapse the identical nodes within the graph.
 205         collate_variants( $graph, @readings );
 206     }
 207
 208     ## Now in theory I have a graph.  I want to make it a little easier to
 209     ## read.  So I collapse nodes that have only one edge in and one edge
 210     ## out, and I do this by looking at the edges.
 211
 212 #     foreach my $edge ( $graph->edges() ) {
 213 #       my @out_edges = $edge->from()->outgoing();
 214 #       my @in_edges = $edge->to()->incoming();
 215
 216 #       next if $edge->from() eq $graph->start();
 217 #       next if $edge->to()->name() eq '#END#';
 218 #       next unless scalar( @out_edges ) == 1;
 219 #       next unless scalar( @in_edges ) == 1;
 220 #       next unless $out_edges[0] eq $in_edges[0];
 221 #       # In theory if we've got this far, we're safe, but just to
 222 #       # double-check...
 223 #       next unless $out_edges[0] eq $edge;
 224
 225 #       $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
 226 #     }
 227
 228     # Now walk the path for each witness, so that we can do the
 229     # position calculations.
 230     my $paths = {};
 231     foreach my $w ( keys %all_witnesses ) {
 232         my $back = undef;
 233         if( $w =~ /^(.*)\s*\(p\.\s*c\.\)/ ) {
 234             $back = $1;
 235         }
 236         my @wit_nodes = $graph->node_sequence( $graph->start,
 237                                                $graph->node( '#END#' ),
 238                                                $w, $back );
 239         my @wn_names = map { $_->name() } @wit_nodes;
 240         $paths->{$w} = \@wn_names;
 241     }
 242     $DB::single = 1;
 243     my @common_nodes = grep { $graph->is_common( $_ ) } $graph->nodes();
 244     $graph->make_positions( \@common_nodes, $paths );
 245 }
 246
 247 =item B<read_base>
 248
 249 my @line_beginnings = read_base( 'reference.txt', $graph );
 250
 251 Takes a text file and a (presumed empty) graph object, adds the words
 252 as simple linear nodes to the graph, and returns a list of nodes that
 253 represent the beginning of lines. This graph is now the starting point
 254 for application of apparatus entries in merge_base, e.g. from a CSV
 255 file or a Classical Text Editor file.
 256
 257 =cut
 258
 259 sub read_base {
 260     my( $base_file, $graph ) = @_;
 261
 262     # This array gives the first node for each line.  We put the
 263     # common starting point in line zero.
 264     my $last_node = $graph->start();
 265     my $lineref_array = [ $last_node ]; # There is no line zero.
 266
 267     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 268     while(<BASE>) {
 269         # Make the nodes, and connect them up for the base, but also
 270         # save the first node of each line in an array for the purpose.
 271         chomp;
 272         my @words = split;
 273         my $started = 0;
 274         my $wordref = 0;
 275         my $lineref = scalar @$lineref_array;
 276         foreach my $w ( @words ) {
 277             my $noderef = join( ',', $lineref, ++$wordref );
 278             my $node = $graph->add_node( $noderef );
 279             $node->set_attribute( 'label', $w );
 280             $node->set_attribute( 'class', 'common' );
 281             unless( $started ) {
 282                 push( @$lineref_array, $node );
 283                 $started = 1;
 284             }
 285             if( $last_node ) {
 286                 my $edge = $graph->add_edge( $last_node, $node, "base text" );
 287                 $edge->set_attribute( 'class', 'basetext' );
 288                 $last_node = $node;
 289             } # TODO there should be no else here...
 290         }
 291     }
 292     close BASE;
 293     # Ending point for all texts
 294     my $endpoint = $graph->add_node( '#END#' );
 295     $graph->add_edge( $last_node, $endpoint, "base text" );
 296     push( @$lineref_array, $endpoint );
 297
 298     return( @$lineref_array );
 299 }
 300
 301 =item B<collate_variants>
 302
 303 collate_variants( $graph, @readings )
 304
 305 Given a set of readings in the form
 306 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 307 walks through each to identify those nodes that are identical.  The
 308 graph is a Text::Tradition::Graph object; the elements of @readings are
 309 Graph::Easy::Node objects that appear on the graph.
 310
 311 TODO: Handle collapsed and non-collapsed transpositions.
 312
 313 =cut
 314
 315 sub collate_variants {
 316     my( $graph, @readings ) = @_;
 317     my $lemma_start = shift @readings;
 318     my $lemma_end = shift @readings;
 319     my $detranspose = 0;
 320
 321     # We need to calculate positions at this point, which is where
 322     # we are getting the implicit information from the apparatus.
 323
 324     # Start the list of distinct nodes with those nodes in the lemma.
 325     my @distinct_nodes;
 326     my $position = 0;
 327     while( $lemma_start ne $lemma_end ) {
 328         push( @distinct_nodes, [ $lemma_start, 'base text', $position++ ] );
 329         $lemma_start = $graph->next_word( $lemma_start );
 330     }
 331     push( @distinct_nodes, [ $lemma_end, 'base text', $position++ ] );
 332
 333
 334     while( scalar @readings ) {
 335         my( $var_start, $var_end ) = splice( @readings, 0, 2 );
 336
 337         # I want to look at the nodes in the variant and lemma, and
 338         # collapse nodes that are the same word.  This is mini-collation.
 339         # Each word in the 'main' list can only be collapsed once with a
 340         # word from the current reading.
 341         my %collapsed = ();
 342
 343         # Get the label. There will only be one outgoing edge to start
 344         # with, so this is safe.
 345         my @out = $var_start->outgoing();
 346         my $var_label = $out[0]->label();
 347
 348         my @variant_nodes;
 349         while( $var_start ne $var_end ) {
 350             push( @variant_nodes, $var_start );
 351             $var_start = $graph->next_word( $var_start, $var_label );
 352         }
 353         push( @variant_nodes, $var_end );
 354
 355         # Go through the variant nodes, and if we find a lemma node that
 356         # hasn't yet been collapsed with a node, equate them.  If we do
 357         # not, keep them to push onto the end of all_nodes.
 358         my @remaining_nodes;
 359         my $last_index = 0;
 360         my $curr_pos = 0;
 361         foreach my $w ( @variant_nodes ) {
 362             my $word = $w->label();
 363             my $matched = 0;
 364             foreach my $idx ( $last_index .. $#distinct_nodes ) {
 365                 my( $l, $edgelabel, $pos ) = @{$distinct_nodes[$idx]};
 366                 if( $word eq cmp_str( $l ) ) {
 367                     next if exists( $collapsed{ $l->label } )
 368                         && $collapsed{ $l->label } eq $l;
 369                     $matched = 1;
 370                     $last_index = $idx if $detranspose;
 371                     # Collapse the nodes.
 372                     printf STDERR "Merging nodes %s/%s and %s/%s\n",
 373                         $l->name, $l->label, $w->name, $w->label;
 374                     $graph->merge_nodes( $l, $w );
 375                     $collapsed{ $l->label } = $l;
 376                     # Now collapse any multiple edges to and from the node.
 377                     remove_duplicate_edges( $graph,
 378                                     $graph->prior_word( $l, $edgelabel ), $l );
 379                     remove_duplicate_edges( $graph, $l,
 380                                     $graph->next_word( $l, $edgelabel ) );
 381                     $curr_pos = $pos;
 382                     last;
 383                 }
 384             }
 385             push( @remaining_nodes, [ $w, $var_label, $curr_pos++ ] ) unless $matched;
 386         }
 387         push( @distinct_nodes, @remaining_nodes) if scalar( @remaining_nodes );
 388     }
 389
 390     # Now set the positions of all the nodes in this variation.
 391     #$DB::single = 1;
 392     print STDERR "Nodes and their positions are:\n";
 393     foreach my $n ( @distinct_nodes ) {
 394         printf STDERR "\t%s (position %s)\n", $n->[0]->label(), $n->[2];
 395     }
 396 }
 397
 398 =item B<remove_duplicate_edges>
 399
 400 remove_duplicate_edges( $graph, $from, $to );
 401
 402 Given two nodes, reduce the number of edges between those nodes to
 403 one.  If neither edge represents a base text, combine their labels.
 404
 405 =cut
 406
 407 sub remove_duplicate_edges {
 408     my( $graph, $from, $to ) = @_;
 409     my @edges = $from->edges_to( $to );
 410     if( scalar @edges > 1 ) {
 411         my @base = grep { $_->label eq 'base text' } @edges;
 412         if ( scalar @base ) {
 413             # Remove the edges that are not base.
 414             foreach my $e ( @edges ) {
 415                 $graph->del_edge( $e )
 416                     unless $e eq $base[0];
 417             }
 418         } else {
 419             # Combine the edges into one.
 420             my $new_edge_name = join( ', ', map { $_->label() } @edges );
 421             my $new_edge = shift @edges;
 422             $new_edge->set_attribute( 'label', $new_edge_name );
 423             foreach my $e ( @edges ) {
 424                 $graph->del_edge( $e );
 425             }
 426         }
 427     }
 428 }
 429
 430 =item B<cmp_str>
 431
 432 Pretend you never saw this method.  Really it needs to not be hardcoded.
 433
 434 =cut
 435
 436 sub cmp_str {
 437     my( $node ) = @_;
 438     my $word = $node->label();
 439     $word = lc( $word );
 440     $word =~ s/\W//g;
 441     $word =~ s/v/u/g;
 442     $word =~ s/j/i/g;
 443     $word =~ s/cha/ca/g;
 444     $word =~ s/quatuor/quattuor/g;
 445     $word =~ s/ioannes/iohannes/g;
 446     return $word;
 447 }
 448
 449 =back
 450
 451 =head1 LICENSE
 452
 453 This package is free software and is provided "as is" without express
 454 or implied warranty.  You can redistribute it and/or modify it under
 455 the same terms as Perl itself.
 456
 457 =head1 AUTHOR
 458
 459 Tara L Andrews, aurum@cpan.org
 460
 461 =cut
 462
 463 1;