lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Exporter 'import';
   6 use vars qw( @EXPORT_OK );
   7 @EXPORT_OK = qw( merge_base );
   8
   9 =head1 NAME
  10
  11 Text::Tradition::Parser::BaseText
  12
  13 =head1 SYNOPSIS
  14
  15 use Text::Tradition::Parser::BaseText qw( merge_base );
  16 merge_base( $graph, 'reference.txt', @apparatus_entries )
  17
  18 =head1 DESCRIPTION
  19
  20 For an overview of the package, see the documentation for the
  21 Text::Tradition::Graph module.
  22
  23 This module is meant for use with certain of the other Parser classes
  24 - whenever a list of variants is given with reference to a base text,
  25 these must be joined into a single collation.  The parser should
  26 therefore make a list of variants and their locations, and BaseText
  27 will join those listed variants onto the reference text.
  28
  29 =head1 SUBROUTINES
  30
  31 =over
  32
  33 =item B<merge_base>
  34
  35 merge_base( $graph, 'reference.txt', @apparatus_entries )
  36
  37 Takes three arguments: a newly-initialized Text::Tradition::Graph
  38 object, a text file containing the reference text, and a list of
  39 variants (apparatus entries).  Adds the base text to the graph, and
  40 joins the variants to that.
  41
  42 The list of variants is an array of hash references; each hash takes
  43 the form
  44  { '_id' => line reference,
  45    'rdg_0' => lemma reading,
  46    'rdg_1' => first variant,
  47    ...  # and so on until all distinct readings are listed
  48    'WitnessA' => 'rdg_0',
  49    'WitnessB' => 'rdg_1',
  50    ...  # and so on until all witnesses are listed with their readings
  51  }
  52
  53 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  54 begin with an underscore is assumed to be a witness name.  Any 'meta'
  55 information to be passed must be passed in a key with a leading
  56 underscore in its name.
  57
  58 =cut
  59
  60 sub merge_base {
  61     my( $graph, $base_file, @app_entries ) = @_;
  62     my @base_line_starts = read_base( $base_file, $graph );
  63
  64     foreach my $app ( @app_entries ) {
  65         my( $line, $num ) = split( /\./, $app->{_id} );
  66         # DEBUG with a short graph
  67         # last if $line > 2;
  68         # DEBUG for problematic entries
  69         # my $scrutinize = "21.8";
  70         my $first_line_node = $base_line_starts[ $line ];
  71         my $too_far = $base_line_starts[ $line+1 ];
  72
  73         my $lemma = $app->{rdg_0};
  74         my $seq = 1;
  75         # Is this the Nth occurrence of this reading in the line?
  76         if( $lemma =~ s/(_)?(\d)$// ) {
  77             $seq = $2;
  78         }
  79         my @lemma_words = split( /\s+/, $lemma );
  80
  81         # Now search for the lemma words within this line.
  82         my $lemma_start = $first_line_node;
  83         my $lemma_end;
  84         my %seen;
  85         while( $lemma_start ne $too_far ) {
  86             # Loop detection
  87             if( $seen{ $lemma_start->name() } ) {
  88                 warn "Detected loop at " . $lemma_start->name() .
  89                     ", ref $line,$num";
  90                 last;
  91             }
  92             $seen{ $lemma_start->name() } = 1;
  93
  94             # Try to match the lemma.
  95             my $unmatch = 0;
  96             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
  97                 $lemma_words[0] . "...\n"
  98                 if "$line.$num" eq $scrutinize;
  99             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 100                 # Skip it if we need a match that is not the first.
 101                 if( --$seq < 1 ) {
 102                     # Now we have to compare the rest of the words here.
 103                     if( scalar( @lemma_words ) > 1 ) {
 104                         my $next_node = $graph->next_word( $lemma_start );
 105                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 106                             printf STDERR "Now matching %s against %s\n",
 107                                     cmp_str($next_node), $w
 108                                 if "$line.$num" eq $scrutinize;
 109                             if( $w ne cmp_str($next_node) ) {
 110                                 $unmatch = 1;
 111                                 last;
 112                             } else {
 113                                 $lemma_end = $next_node;
 114                                 $next_node = $graph->next_word( $lemma_end );
 115                             }
 116                         }
 117                     } else {
 118                         $lemma_end = $lemma_start;
 119                     }
 120                 } else {
 121                     $unmatch = 1;
 122                 }
 123             }
 124             last unless ( $unmatch || !defined( $lemma_end ) );
 125             $lemma_end = undef;
 126             $lemma_start = $graph->next_word( $lemma_start );
 127         }
 128
 129         unless( $lemma_end ) {
 130             warn "No match found for @lemma_words at $line.$num";
 131             next;
 132         } else {
 133             # These are no longer common nodes; unmark them as such.
 134             my @lemma_nodes = $graph->node_sequence( $lemma_start,
 135                                                      $lemma_end );
 136             map { $_->set_attribute( 'class', 'lemma' ) } @lemma_nodes;
 137         }
 138
 139         # Now we have our lemma nodes; we add the variant nodes to the graph.
 140
 141         # For each reading that is not rdg_0, we make a chain of nodes
 142         # and connect them to the anchor.  Edges are named after the mss
 143         # that are relevant.
 144         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 145             next if $k eq 'rdg_0'; # that's the lemma.
 146             my @variant = split( /\s+/, $app->{$k} );
 147             @variant = () if $app->{$k} eq '/'; # This is an omission.
 148             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 149
 150             unless( @mss ) {
 151                 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
 152                 next;
 153             }
 154
 155             # Determine the label name for the edges here.
 156             my $edge_name = join(', ', @mss );
 157
 158             # Make the variant into a set of nodes.
 159             my $ctr = 0;
 160             my $last_node = $graph->prior_word( $lemma_start );
 161             my $var_start;
 162             foreach my $vw ( @variant ) {
 163                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 164                 my $vwnode = $graph->add_node( $vwname );
 165                 $vwnode->set_attribute( 'label', $vw );
 166                 $vwnode->set_attribute( 'class', 'variant' );
 167                 $graph->add_edge( $last_node, $vwnode, $edge_name );
 168                 $var_start = $vwnode unless $var_start;
 169                 $last_node = $vwnode;
 170             }
 171             # Now hook it up at the end.
 172             $graph->add_edge( $last_node, $graph->next_word( $lemma_end ),
 173                                         $edge_name );
 174
 175             # Now collate and collapse the identical nodes within the graph.
 176             collate_variant( $graph, $lemma_start, $lemma_end,
 177                              $var_start, $last_node );
 178
 179         }
 180     }
 181
 182     ## Now in theory I have a graph.  I want to make it a little easier to
 183     ## read.  So I collapse nodes that have only one edge in and one edge
 184     ## out, and I do this by looking at the edges.
 185
 186     foreach my $edge ( $graph->edges() ) {
 187         my @out_edges = $edge->from()->outgoing();
 188         my @in_edges = $edge->to()->incoming();
 189
 190         next unless scalar( @out_edges ) == 1;
 191         next unless scalar( @in_edges ) == 1;
 192         next unless $out_edges[0] eq $in_edges[0];
 193         # In theory if we've got this far, we're safe, but just to
 194         # double-check...
 195         next unless $out_edges[0] eq $edge;
 196
 197         $graph->merge_nodes( $edge->from(), $edge->to(), ' ' );
 198     }
 199 }
 200
 201 =item B<read_base>
 202
 203 my @line_beginnings = read_base( 'reference.txt', $graph );
 204
 205 Takes a text file and a (presumed empty) graph object, adds the words
 206 as simple linear nodes to the graph, and returns a list of nodes that
 207 represent the beginning of lines. This graph is now the starting point
 208 for application of apparatus entries in merge_base, e.g. from a CSV
 209 file or a Classical Text Editor file.
 210
 211 =cut
 212
 213 sub read_base {
 214     my( $base_file, $graph ) = @_;
 215
 216     # This array gives the first node for each line.  We put the
 217     # common starting point in line zero.
 218     my $last_node = $graph->start();
 219     my $lineref_array = [ $last_node ]; # There is no line zero.
 220
 221     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 222     while(<BASE>) {
 223         # Make the nodes, and connect them up for the base, but also
 224         # save the first node of each line in an array for the purpose.
 225         chomp;
 226         my @words = split;
 227         my $started = 0;
 228         my $wordref = 0;
 229         my $lineref = scalar @$lineref_array;
 230         foreach my $w ( @words ) {
 231             my $noderef = join( ',', $lineref, ++$wordref );
 232             my $node = $graph->add_node( $noderef );
 233             $node->set_attribute( 'label', $w );
 234             $node->set_attribute( 'class', 'common' );
 235             unless( $started ) {
 236                 push( @$lineref_array, $node );
 237                 $started = 1;
 238             }
 239             if( $last_node ) {
 240                 $graph->add_edge( $last_node, $node, "base text" );
 241                 $last_node = $node;
 242             } # TODO there should be no else here...
 243         }
 244     }
 245     close BASE;
 246     # Ending point for all texts
 247     my $endpoint = $graph->add_node( '#END#' );
 248     $graph->add_edge( $last_node, $endpoint, "base text" );
 249     push( @$lineref_array, $endpoint );
 250
 251     return( @$lineref_array );
 252 }
 253
 254 =item B<collate_variant>
 255
 256 collate_variant( $graph, $lemma_start, $lemma_end, $var_start, $var_end );
 257
 258 Given a lemma and a variant as start- and endpoints on the graph,
 259 walks through each to identify those nodes that are identical.  The
 260 graph is a Text::Tradition::Graph object; the other arguments are
 261 Graph::Easy::Node objects that appear on the graph.
 262
 263 TODO: Handle collapsed and non-collapsed transpositions.
 264
 265 =cut
 266
 267 sub collate_variant {
 268     my( $graph, $lemma_start, $lemma_end, $var_start, $var_end ) = @_;
 269     # If var_start is undef, then the variant is an omission and
 270     # there's nothing to collate. Return.
 271     return unless $var_start;
 272
 273     # I want to look at the nodes in the variant and lemma, and
 274     # collapse nodes that are the same word.  This is mini-collation.
 275     my %collapsed = ();
 276     # There will only be one outgoing edge at first, so this is safe.
 277     my @out = $var_start->outgoing();
 278     my $var_label = $out[0]->label();
 279
 280     my @lemma_nodes;
 281     while( $lemma_start ne $lemma_end ) {
 282         push( @lemma_nodes, $lemma_start );
 283         $lemma_start = $graph->next_word( $lemma_start );
 284     }
 285     push( @lemma_nodes, $lemma_end );
 286
 287     my @variant_nodes;
 288     while( $var_start ne $var_end ) {
 289         push( @variant_nodes, $var_start );
 290         $var_start = $graph->next_word( $var_start, $var_label );
 291     }
 292     push( @variant_nodes, $var_end );
 293
 294     # Go through the variant nodes, and if we find a lemma node that
 295     # hasn't yet been collapsed with a node, equate them.
 296
 297     foreach my $w ( @variant_nodes ) {
 298         my $word = $w->label();
 299         foreach my $l ( @lemma_nodes ) {
 300             if( $word eq cmp_str( $l ) ) {
 301                 next if exists( $collapsed{ $l->label } )
 302                     && $collapsed{ $l->label } eq $l;
 303                 # Collapse the nodes.
 304                 printf STDERR "Merging nodes %s/%s and %s/%s\n",
 305                     $l->name, $l->label, $w->name, $w->label;
 306                 $graph->merge_nodes( $l, $w );
 307                 $collapsed{ $l->label } = $l;
 308                 # Now collapse any multiple edges to and from the node.
 309                 # Rely on the presence of the 'base text' edge.
 310                 remove_duplicate_edges( $graph, $graph->prior_word( $l ), $l );
 311                 remove_duplicate_edges( $graph, $l, $graph->next_word( $l ) );
 312             }
 313         }
 314     }
 315 }
 316
 317 =item B<remove_duplicate_edges>
 318
 319 remove_duplicate_edges( $graph, $from, $to );
 320
 321 Given two nodes, reduce the number of edges between those nodes to
 322 one.  If neither edge represents a base text, combine their labels.
 323
 324 =cut
 325
 326 sub remove_duplicate_edges {
 327     my( $graph, $from, $to ) = @_;
 328     my @edges = $from->edges_to( $to );
 329     if( scalar @edges > 1 ) {
 330         my @base = grep { $_->label eq 'base text' } @edges;
 331         if ( scalar @base ) {
 332             # Remove the edges that are not base.
 333             foreach my $e ( @edges ) {
 334                 $graph->del_edge( $e )
 335                     unless $e eq $base[0];
 336             }
 337         } else {
 338             # Combine the edges into one.
 339             my $new_edge_name = join( ', ', map { $_->label() } @edges );
 340             my $new_edge = shift @edges;
 341             $new_edge->set_attribute( 'label', $new_edge_name );
 342             foreach my $e ( @edges ) {
 343                 $graph->del_edge( $e );
 344             }
 345         }
 346     }
 347 }
 348
 349 =item B<cmp_str>
 350
 351 Pretend you never saw this method.  Really it needs to not be hardcoded.
 352
 353 =cut
 354
 355 sub cmp_str {
 356     my( $node ) = @_;
 357     my $word = $node->label();
 358     $word = lc( $word );
 359     $word =~ s/\W//g;
 360     $word =~ s/v/u/g;
 361     $word =~ s/j/i/g;
 362     $word =~ s/cha/ca/g;
 363     $word =~ s/quatuor/quattuor/g;
 364     $word =~ s/ioannes/iohannes/g;
 365     return $word;
 366 }
 367
 368 =back
 369
 370 =head1 LICENSE
 371
 372 This package is free software and is provided "as is" without express
 373 or implied warranty.  You can redistribute it and/or modify it under
 374 the same terms as Perl itself.
 375
 376 =head1 AUTHOR
 377
 378 Tara L Andrews, aurum@cpan.org
 379
 380 =cut
 381
 382 1;