lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6
   7 =head1 NAME
   8
   9 Text::Tradition::Parser::BaseText
  10
  11 =head1 SYNOPSIS
  12
  13 use Text::Tradition::Parser::BaseText qw( merge_base );
  14 merge_base( $graph, 'reference.txt', @apparatus_entries )
  15
  16 =head1 DESCRIPTION
  17
  18 For an overview of the package, see the documentation for the
  19 Text::Tradition::Graph module.
  20
  21 This module is meant for use with certain of the other Parser classes
  22 - whenever a list of variants is given with reference to a base text,
  23 these must be joined into a single collation.  The parser should
  24 therefore make a list of variants and their locations, and BaseText
  25 will join those listed variants onto the reference text.
  26
  27 =head1 SUBROUTINES
  28
  29 =over
  30
  31 =item B<parse>
  32
  33 parse( $graph, %opts );
  34
  35 Takes an initialized graph and a set of options, which must include:
  36 - 'base' - the base text referenced by the variants
  37 - 'format' - the format of the variant list
  38 - 'data' - the variants, in the given format.
  39
  40 =cut
  41
  42 sub parse {
  43     my( $tradition, %opts ) = @_;
  44
  45     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  46     load( $format_mod );
  47     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  48     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  49 }
  50
  51 =item B<merge_base>
  52
  53 merge_base( $graph, 'reference.txt', @apparatus_entries )
  54
  55 Takes three arguments: a newly-initialized Text::Tradition::Graph
  56 object, a text file containing the reference text, and a list of
  57 variants (apparatus entries).  Adds the base text to the graph, and
  58 joins the variants to that.
  59
  60 The list of variants is an array of hash references; each hash takes
  61 the form
  62  { '_id' => line reference,
  63    'rdg_0' => lemma reading,
  64    'rdg_1' => first variant,
  65    ...  # and so on until all distinct readings are listed
  66    'WitnessA' => 'rdg_0',
  67    'WitnessB' => 'rdg_1',
  68    ...  # and so on until all witnesses are listed with their readings
  69  }
  70
  71 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  72 begin with an underscore is assumed to be a witness name.  Any 'meta'
  73 information to be passed must be passed in a key with a leading
  74 underscore in its name.
  75
  76 =cut
  77
  78 sub merge_base {
  79     my( $collation, $base_file, @app_entries ) = @_;
  80     my @base_line_starts = read_base( $base_file, $collation );
  81
  82     my %all_witnesses;
  83     foreach my $app ( @app_entries ) {
  84         my( $line, $num ) = split( /\./, $app->{_id} );
  85         # DEBUG with a short graph
  86         # last if $line > 2;
  87         # DEBUG for problematic entries
  88         my $scrutinize = "7.3";
  89         my $first_line_reading = $base_line_starts[ $line ];
  90         my $too_far = $base_line_starts[ $line+1 ];
  91
  92         my $lemma = $app->{rdg_0};
  93         my $seq = 1;
  94         # Is this the Nth occurrence of this reading in the line?
  95         if( $lemma =~ s/(_)?(\d)$// ) {
  96             $seq = $2;
  97         }
  98         my @lemma_words = split( /\s+/, $lemma );
  99
 100         # Now search for the lemma words within this line.
 101         my $lemma_start = $first_line_reading;
 102         my $lemma_end;
 103         my %seen;
 104         while( $lemma_start ne $too_far ) {
 105             # Loop detection
 106             if( $seen{ $lemma_start->name() } ) {
 107                 warn "Detected loop at " . $lemma_start->name() .
 108                     ", ref $line,$num";
 109                 last;
 110             }
 111             $seen{ $lemma_start->name() } = 1;
 112
 113             # Try to match the lemma.
 114             my $unmatch = 0;
 115             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 116                 $lemma_words[0] . "...\n"
 117                 if "$line.$num" eq $scrutinize;
 118             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 119                 # Skip it if we need a match that is not the first.
 120                 if( --$seq < 1 ) {
 121                     # Now we have to compare the rest of the words here.
 122                     if( scalar( @lemma_words ) > 1 ) {
 123                         my $next_reading =
 124                             $collation->next_reading( $lemma_start );
 125                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 126                             printf STDERR "Now matching %s against %s\n",
 127                                     cmp_str($next_reading), $w
 128                                 if "$line.$num" eq $scrutinize;
 129                             if( $w ne cmp_str($next_reading) ) {
 130                                 $unmatch = 1;
 131                                 last;
 132                             } else {
 133                                 $lemma_end = $next_reading;
 134                                 $next_reading =
 135                                     $collation->next_reading( $lemma_end );
 136                             }
 137                         }
 138                     } else {
 139                         $lemma_end = $lemma_start;
 140                     }
 141                 } else {
 142                     $unmatch = 1;
 143                 }
 144             }
 145             last unless ( $unmatch || !defined( $lemma_end ) );
 146             $lemma_end = undef;
 147             $lemma_start = $collation->next_reading( $lemma_start );
 148         }
 149
 150         unless( $lemma_end ) {
 151             warn "No match found for @lemma_words at $line.$num";
 152             next;
 153         } else {
 154             # These are no longer common readings; unmark them as such.
 155             my @lemma_readings = $collation->reading_sequence( $lemma_start,
 156                                                      $lemma_end );
 157             map { $_->set_attribute( 'class', 'lemma' ) } @lemma_readings;
 158         }
 159
 160         # Now we have our lemma readings; we add the variant readings
 161         # to the collation.
 162
 163         # Keep track of the start and end point of each reading for later
 164         # reading collapse.
 165         my @readings = ( $lemma_start, $lemma_end );
 166
 167         # For each reading that is not rdg_0, we make a chain of readings
 168         # and connect them to the anchor.  Edges are named after the mss
 169         # that are relevant.
 170         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 171             if( $k eq 'rdg_0' ) { # that's the lemma
 172                 # The lemma is already in the graph, but we need to look for
 173                 # any explicit post-correctione readings and add the
 174                 # relevant path.
 175                 my @mss = grep { $app->{$_} eq $k } keys( %$app );
 176                 foreach my $m ( @mss ) {
 177                     my $base = _is_post_corr( $m );
 178                     next unless $base;
 179                     my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
 180                     foreach my $i ( 0 .. $#lem-1 ) {
 181                         $collation->add_path( $lem[$i], $lem[$i++], $m );
 182                     }
 183                 }
 184             }
 185             my @variant = split( /\s+/, $app->{$k} );
 186             @variant = () if $app->{$k} eq '/'; # This is an omission.
 187             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 188
 189             unless( @mss ) {
 190                 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
 191                 next;
 192             }
 193
 194             # Keep track of what witnesses we have seen.
 195             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 196
 197             # Make the variant into a set of readings.
 198             my $ctr = 0;
 199             my $last_reading = $collation->prior_reading( $lemma_start );
 200             my $var_start;
 201             foreach my $vw ( @variant ) {
 202                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 203                 my $vwreading = $collation->add_reading( $vwname );
 204                 $vwreading->text( $vw );
 205                 $vwreading->make_variant();
 206                 foreach ( @mss ) {
 207                     $collation->add_path( $last_reading, $vwreading, $_ );
 208                 }
 209                 $var_start = $vwreading unless $var_start;
 210                 $last_reading = $vwreading;
 211             }
 212             # Now hook it up at the end.
 213             foreach ( @mss ) {
 214                 $collation->add_path( $last_reading,
 215                                       $collation->next_reading( $lemma_end ),
 216                                       $_ );
 217             }
 218
 219             if( $var_start ) { # if it wasn't an empty reading
 220                 push( @readings, $var_start, $last_reading );
 221             }
 222         }
 223
 224         # Now collate and collapse the identical readings within the collation.
 225         collate_variants( $collation, @readings );
 226     }
 227
 228     # Now make the witness objects
 229     foreach my $w ( keys %all_witnesses ) {
 230         my $base = _is_post_corr( $w );
 231         if( $base ) {
 232             my $pctag = substr( $w, length( $base ) );
 233             my $existing_wit = $collation->tradition->witness( $base );
 234             unless( $existing_wit ) {
 235                 $existing_wit = $collation->tradition->add_witness( sigil => $base );
 236             }
 237             $existing_wit->post_correctione( $pctag );
 238         } else {
 239             $collation->tradition->add_witness( sigil => $w )
 240                 unless $collation->tradition->witness( $w );
 241         }
 242     }
 243
 244     # Now walk paths and calculate positions.
 245     my @common_readings =
 246         $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
 247     $collation->calculate_positions( @common_readings );
 248 }
 249
 250 =item B<read_base>
 251
 252 my @line_beginnings = read_base( 'reference.txt', $collation );
 253
 254 Takes a text file and a (presumed empty) collation object, adds the
 255 words as simple linear readings to the collation, and returns a
 256 list of readings that represent the beginning of lines. This collation
 257 is now the starting point for application of apparatus entries in
 258 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 259
 260 =cut
 261
 262 sub read_base {
 263     my( $base_file, $collation ) = @_;
 264
 265     # This array gives the first reading for each line.  We put the
 266     # common starting point in line zero.
 267     my $last_reading = $collation->start();
 268     my $lineref_array = [ $last_reading ]; # There is no line zero.
 269
 270     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 271     while(<BASE>) {
 272         # Make the readings, and connect them up for the base, but
 273         # also save the first reading of each line in an array for the
 274         # purpose.
 275         # TODO use configurable reading separator
 276         chomp;
 277         my @words = split;
 278         my $started = 0;
 279         my $wordref = 0;
 280         my $lineref = scalar @$lineref_array;
 281         foreach my $w ( @words ) {
 282             my $readingref = join( ',', $lineref, ++$wordref );
 283             my $reading = $collation->add_reading( $readingref );
 284             $reading->text( $w );
 285             $reading->make_common();
 286             unless( $started ) {
 287                 push( @$lineref_array, $reading );
 288                 $started = 1;
 289             }
 290             if( $last_reading ) {
 291                 my $path = $collation->add_path( $last_reading, $reading,
 292                                                  "base text" );
 293                 $path->set_attribute( 'class', 'basetext' );
 294                 $last_reading = $reading;
 295             } # TODO there should be no else here...
 296         }
 297     }
 298     close BASE;
 299     # Ending point for all texts
 300     my $endpoint = $collation->add_reading( '#END#' );
 301     $collation->add_path( $last_reading, $endpoint, "base text" );
 302     push( @$lineref_array, $endpoint );
 303
 304     return( @$lineref_array );
 305 }
 306
 307 =item B<collate_variants>
 308
 309 collate_variants( $collation, @readings )
 310
 311 Given a set of readings in the form
 312 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 313 walks through each to identify those readings that are identical.  The
 314 collation is a Text::Tradition::Collation object; the elements of
 315 @readings are Text::Tradition::Collation::Reading objects that appear
 316 on the collation graph.
 317
 318 TODO: Handle collapsed and non-collapsed transpositions.
 319
 320 =cut
 321
 322 sub collate_variants {
 323     my( $collation, @readings ) = @_;
 324     my $lemma_start = shift @readings;
 325     my $lemma_end = shift @readings;
 326     my $detranspose = 1;
 327
 328     # Start the list of distinct readings with those readings in the lemma.
 329     my @distinct_readings;
 330     while( $lemma_start ne $lemma_end ) {
 331         push( @distinct_readings, [ $lemma_start, 'base text' ] );
 332         $lemma_start = $collation->next_reading( $lemma_start );
 333     }
 334     push( @distinct_readings, [ $lemma_end, 'base text' ] );
 335
 336
 337     while( scalar @readings ) {
 338         my( $var_start, $var_end ) = splice( @readings, 0, 2 );
 339
 340         # I want to look at the readings in the variant and lemma, and
 341         # collapse readings that are the same word.  This is mini-collation.
 342         # Each word in the 'main' list can only be collapsed once with a
 343         # word from the current reading.
 344         my %collapsed = ();
 345
 346         # Get the variant witnesses.  They will all be going along the
 347         # same path, so just use the first one as representative for
 348         # the purpose of following the path.
 349         my @var_wits = map { $_->label } $var_start->outgoing();
 350         my $rep_wit = $var_wits[0];
 351
 352         my @variant_readings;
 353         while( $var_start ne $var_end ) {
 354             push( @variant_readings, $var_start );
 355             $var_start = $collation->next_reading( $var_start, $rep_wit );
 356         }
 357         push( @variant_readings, $var_end );
 358
 359         # Go through the variant readings, and if we find a lemma reading that
 360         # hasn't yet been collapsed with a reading, equate them.  If we do
 361         # not, keep them to push onto the end of all_readings.
 362         # TODO replace this with proper mini-collation
 363         my @remaining_readings;
 364         my $last_index = 0;
 365         my $curr_pos = 0;
 366         foreach my $w ( @variant_readings ) {
 367             my $word = $w->label();
 368             my $matched = 0;
 369             foreach my $idx ( $last_index .. $#distinct_readings ) {
 370                 my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
 371                 if( $word eq cmp_str( $l ) ) {
 372                     next if exists( $collapsed{ $l->label } )
 373                         && $collapsed{ $l->label } eq $l;
 374                     $matched = 1;
 375                     $last_index = $idx if $detranspose;
 376                     # Collapse the readings.
 377                     printf STDERR "Merging readings %s/%s and %s/%s\n",
 378                         $l->name, $l->label, $w->name, $w->label;
 379                     $collation->merge_readings( $l, $w );
 380                     $collapsed{ $l->label } = $l;
 381                     # Now collapse any multiple paths to and from the reading.
 382                     remove_duplicate_paths( $collation,
 383                                     $collation->prior_reading( $l, $rep_wit ), $l );
 384                     remove_duplicate_paths( $collation, $l,
 385                                     $collation->next_reading( $l, $rep_wit ) );
 386                     last;
 387                 }
 388             }
 389             push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
 390         }
 391         push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
 392     }
 393 }
 394
 395 =item B<remove_duplicate_paths>
 396
 397 remove_duplicate_paths( $collation, $from, $to );
 398
 399 Given two readings, reduce the number of paths between those readings to
 400 a set of unique paths.
 401
 402 =cut
 403
 404 # TODO wonder if this is necessary
 405 sub remove_duplicate_paths {
 406     my( $collation, $from, $to ) = @_;
 407     my %seen_paths;
 408     foreach my $p ( $from->edges_to( $to ) ) {
 409         if( exists $seen_paths{$p->name} ) {
 410             $collation->del_path( $p );
 411         } else {
 412             $seen_paths{$p->name} = 1;
 413         }
 414     }
 415 }
 416
 417 # Helper function. Given a witness sigil, if it is a post-correctione
 418 # sigil,return the base witness.  If not, return a false value.
 419 sub _is_post_corr {
 420     my( $sigil ) = @_;
 421     if( $sigil =~ /^(.*?)(\s*\(p\.\s*c\.\))$/ ) {
 422         return $1;
 423     }
 424     return undef;
 425 }
 426
 427 =item B<cmp_str>
 428
 429 Pretend you never saw this method.  Really it needs to not be hardcoded.
 430
 431 =cut
 432
 433 sub cmp_str {
 434     my( $reading ) = @_;
 435     my $word = $reading->label();
 436     $word = lc( $word );
 437     $word =~ s/\W//g;
 438     $word =~ s/v/u/g;
 439     $word =~ s/j/i/g;
 440     $word =~ s/cha/ca/g;
 441     $word =~ s/quatuor/quattuor/g;
 442     $word =~ s/ioannes/iohannes/g;
 443     return $word;
 444 }
 445
 446 =back
 447
 448 =head1 LICENSE
 449
 450 This package is free software and is provided "as is" without express
 451 or implied warranty.  You can redistribute it and/or modify it under
 452 the same terms as Perl itself.
 453
 454 =head1 AUTHOR
 455
 456 Tara L Andrews, aurum@cpan.org
 457
 458 =cut
 459
 460 1;