lib/Text/Tradition/Parser/BaseText.pm

   1 package Text::Tradition::Parser::BaseText;
   2
   3 use strict;
   4 use warnings;
   5 use Module::Load;
   6
   7 =head1 NAME
   8
   9 Text::Tradition::Parser::BaseText
  10
  11 =head1 SYNOPSIS
  12
  13 use Text::Tradition::Parser::BaseText qw( merge_base );
  14 merge_base( $graph, 'reference.txt', @apparatus_entries )
  15
  16 =head1 DESCRIPTION
  17
  18 For an overview of the package, see the documentation for the
  19 Text::Tradition::Graph module.
  20
  21 This module is meant for use with certain of the other Parser classes
  22 - whenever a list of variants is given with reference to a base text,
  23 these must be joined into a single collation.  The parser should
  24 therefore make a list of variants and their locations, and BaseText
  25 will join those listed variants onto the reference text.
  26
  27 =head1 SUBROUTINES
  28
  29 =over
  30
  31 =item B<parse>
  32
  33 parse( $graph, %opts );
  34
  35 Takes an initialized graph and a set of options, which must include:
  36 - 'base' - the base text referenced by the variants
  37 - 'format' - the format of the variant list
  38 - 'data' - the variants, in the given format.
  39
  40 =cut
  41
  42 sub parse {
  43     my( $tradition, %opts ) = @_;
  44
  45     my $format_mod = 'Text::Tradition::Parser::' . $opts{'format'};
  46     load( $format_mod );
  47     my @apparatus_entries = $format_mod->can('read')->( $opts{'data'} );
  48     merge_base( $tradition->collation, $opts{'base'}, @apparatus_entries );
  49 }
  50
  51 =item B<merge_base>
  52
  53 merge_base( $graph, 'reference.txt', @apparatus_entries )
  54
  55 Takes three arguments: a newly-initialized Text::Tradition::Graph
  56 object, a text file containing the reference text, and a list of
  57 variants (apparatus entries).  Adds the base text to the graph, and
  58 joins the variants to that.
  59
  60 The list of variants is an array of hash references; each hash takes
  61 the form
  62  { '_id' => line reference,
  63    'rdg_0' => lemma reading,
  64    'rdg_1' => first variant,
  65    ...  # and so on until all distinct readings are listed
  66    'WitnessA' => 'rdg_0',
  67    'WitnessB' => 'rdg_1',
  68    ...  # and so on until all witnesses are listed with their readings
  69  }
  70
  71 Any hash key that is not of the form /^rdg_\d+$/ and that does not
  72 begin with an underscore is assumed to be a witness name.  Any 'meta'
  73 information to be passed must be passed in a key with a leading
  74 underscore in its name.
  75
  76 =cut
  77
  78 my $SHORT = 20;
  79
  80 sub merge_base {
  81     my( $collation, $base_file, @app_entries ) = @_;
  82     my @base_line_starts = read_base( $base_file, $collation );
  83
  84     my %all_witnesses;
  85     foreach my $app ( @app_entries ) {
  86         my( $line, $num ) = split( /\./, $app->{_id} );
  87         # DEBUG with a short graph
  88         last if $SHORT && $line > $SHORT;
  89         # DEBUG for problematic entries
  90         my $scrutinize = "";
  91         my $first_line_reading = $base_line_starts[ $line ];
  92         my $too_far = $base_line_starts[ $line+1 ];
  93
  94         my $lemma = $app->{rdg_0};
  95         my $seq = 1;
  96         # Is this the Nth occurrence of this reading in the line?
  97         if( $lemma =~ s/(_)?(\d)$// ) {
  98             $seq = $2;
  99         }
 100         my @lemma_words = split( /\s+/, $lemma );
 101
 102         # Now search for the lemma words within this line.
 103         my $lemma_start = $first_line_reading;
 104         my $lemma_end;
 105         my %seen;
 106         while( $lemma_start ne $too_far ) {
 107             # Loop detection
 108             if( $seen{ $lemma_start->name() } ) {
 109                 warn "Detected loop at " . $lemma_start->name() .
 110                     ", ref $line,$num";
 111                 last;
 112             }
 113             $seen{ $lemma_start->name() } = 1;
 114
 115             # Try to match the lemma.
 116             my $unmatch = 0;
 117             print STDERR "Matching " . cmp_str( $lemma_start) . " against " .
 118                 $lemma_words[0] . "...\n"
 119                 if "$line.$num" eq $scrutinize;
 120             if( cmp_str( $lemma_start ) eq $lemma_words[0] ) {
 121                 # Skip it if we need a match that is not the first.
 122                 if( --$seq < 1 ) {
 123                     # Now we have to compare the rest of the words here.
 124                     if( scalar( @lemma_words ) > 1 ) {
 125                         my $next_reading =
 126                             $collation->next_reading( $lemma_start );
 127                         foreach my $w ( @lemma_words[1..$#lemma_words] ) {
 128                             printf STDERR "Now matching %s against %s\n",
 129                                     cmp_str($next_reading), $w
 130                                 if "$line.$num" eq $scrutinize;
 131                             if( $w ne cmp_str($next_reading) ) {
 132                                 $unmatch = 1;
 133                                 last;
 134                             } else {
 135                                 $lemma_end = $next_reading;
 136                                 $next_reading =
 137                                     $collation->next_reading( $lemma_end );
 138                             }
 139                         }
 140                     } else {
 141                         $lemma_end = $lemma_start;
 142                     }
 143                 } else {
 144                     $unmatch = 1;
 145                 }
 146             }
 147             last unless ( $unmatch || !defined( $lemma_end ) );
 148             $lemma_end = undef;
 149             $lemma_start = $collation->next_reading( $lemma_start );
 150         }
 151
 152         unless( $lemma_end ) {
 153             warn "No match found for @lemma_words at $line.$num";
 154             next;
 155         } else {
 156             # These are no longer common readings; unmark them as such.
 157             my @lemma_readings = $collation->reading_sequence( $lemma_start,
 158                                                      $lemma_end );
 159             map { $_->make_variant } @lemma_readings;
 160         }
 161
 162         # Now we have our lemma readings; we add the variant readings
 163         # to the collation.
 164
 165         # Keep track of the start and end point of each reading for later
 166         # reading collapse.
 167         my @readings = ( $lemma_start, $lemma_end );
 168
 169         # For each reading that is not rdg_0, we make a chain of readings
 170         # and connect them to the anchor.  Edges are named after the mss
 171         # that are relevant.
 172         foreach my $k ( grep { /^rdg/ } keys( %$app ) ) {
 173             if( $k eq 'rdg_0' ) { # that's the lemma
 174                 # The lemma is already in the graph, but we need to look for
 175                 # any explicit post-correctione readings and add the
 176                 # relevant path.
 177                 my @mss = grep { $app->{$_} eq $k } keys( %$app );
 178                 # Keep track of what witnesses we have seen.
 179                 @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 180                 foreach my $m ( @mss ) {
 181                     my $base = _is_post_corr( $m );
 182                     next unless $base;
 183                     my @lem = $collation->reading_sequence( $lemma_start, $lemma_end );
 184                     $collation->add_path( $collation->prior_reading( $lem[0] ), $lem[0], $m );
 185                     foreach my $i ( 0 .. $#lem-1 ) {
 186                         $collation->add_path( $lem[$i], $lem[++$i], $m );
 187                     }
 188                     $collation->add_path( $lem[-1], $collation->next_reading( $lem[-1] ), $m );
 189                 }
 190                 next;
 191             }
 192             my @variant = split( /\s+/, $app->{$k} );
 193             @variant = () if $app->{$k} eq '/'; # This is an omission.
 194             my @mss = grep { $app->{$_} eq $k } keys( %$app );
 195
 196             unless( @mss ) {
 197                 print STDERR "Skipping '@variant' at $line.$num: no mss\n";
 198                 next;
 199             }
 200
 201             # Keep track of what witnesses we have seen.
 202             @all_witnesses{ @mss } = ( 1 ) x scalar( @mss );
 203
 204             # Make the variant into a set of readings.
 205             my $ctr = 0;
 206             my $last_reading = $collation->prior_reading( $lemma_start );
 207             my $var_start;
 208             foreach my $vw ( @variant ) {
 209                 my $vwname = "$k/$line.$num.$ctr"; $ctr++;
 210                 my $vwreading = $collation->add_reading( $vwname );
 211                 $vwreading->text( $vw );
 212                 $vwreading->make_variant();
 213                 foreach ( @mss ) {
 214                     $collation->add_path( $last_reading, $vwreading, $_ );
 215                 }
 216                 $var_start = $vwreading unless $var_start;
 217                 $last_reading = $vwreading;
 218             }
 219             # Now hook it up at the end.
 220             foreach ( @mss ) {
 221                 $collation->add_path( $last_reading,
 222                                       $collation->next_reading( $lemma_end ),
 223                                       $_ );
 224             }
 225
 226             if( $var_start ) { # if it wasn't an empty reading
 227                 push( @readings, $var_start, $last_reading );
 228             }
 229         }
 230
 231         # Now collate and collapse the identical readings within the collation.
 232         collate_variants( $collation, @readings );
 233     }
 234
 235     # Now make the witness objects
 236     foreach my $w ( keys %all_witnesses ) {
 237         my $base = _is_post_corr( $w );
 238         if( $base ) {
 239             my $pctag = substr( $w, length( $base ) );
 240             my $existing_wit = $collation->tradition->witness( $base );
 241             unless( $existing_wit ) {
 242                 $existing_wit = $collation->tradition->add_witness( sigil => $base );
 243             }
 244             $existing_wit->post_correctione( $pctag );
 245         } else {
 246             $collation->tradition->add_witness( sigil => $w )
 247                 unless $collation->tradition->witness( $w );
 248         }
 249     }
 250
 251     # Now walk paths and calculate positions.
 252     my @common_readings =
 253         $collation->walk_and_expand_base( $collation->reading( '#END#' ) );
 254     $collation->calculate_positions( @common_readings );
 255 }
 256
 257 =item B<read_base>
 258
 259 my @line_beginnings = read_base( 'reference.txt', $collation );
 260
 261 Takes a text file and a (presumed empty) collation object, adds the
 262 words as simple linear readings to the collation, and returns a
 263 list of readings that represent the beginning of lines. This collation
 264 is now the starting point for application of apparatus entries in
 265 merge_base, e.g. from a CSV file or a Classical Text Editor file.
 266
 267 =cut
 268
 269 sub read_base {
 270     my( $base_file, $collation ) = @_;
 271
 272     # This array gives the first reading for each line.  We put the
 273     # common starting point in line zero.
 274     my $last_reading = $collation->start();
 275     my $lineref_array = [ $last_reading ]; # There is no line zero.
 276
 277     open( BASE, $base_file ) or die "Could not open file $base_file: $!";
 278     while(<BASE>) {
 279         # Make the readings, and connect them up for the base, but
 280         # also save the first reading of each line in an array for the
 281         # purpose.
 282         # TODO use configurable reading separator
 283         chomp;
 284         my @words = split;
 285         my $started = 0;
 286         my $wordref = 0;
 287         my $lineref = scalar @$lineref_array;
 288         last if $SHORT && $lineref > $SHORT;
 289         foreach my $w ( @words ) {
 290             my $readingref = join( ',', $lineref, ++$wordref );
 291             my $reading = $collation->add_reading( $readingref );
 292             $reading->text( $w );
 293             $reading->make_common();
 294             unless( $started ) {
 295                 push( @$lineref_array, $reading );
 296                 $started = 1;
 297             }
 298             if( $last_reading ) {
 299                 my $path = $collation->add_path( $last_reading, $reading,
 300                                                  $collation->baselabel );
 301                 $path->set_attribute( 'class', 'basetext' );
 302                 $last_reading = $reading;
 303             } # TODO there should be no else here...
 304         }
 305     }
 306     close BASE;
 307     # Ending point for all texts
 308     my $endpoint = $collation->add_reading( '#END#' );
 309     $collation->add_path( $last_reading, $endpoint, $collation->baselabel );
 310     push( @$lineref_array, $endpoint );
 311
 312     return( @$lineref_array );
 313 }
 314
 315 =item B<collate_variants>
 316
 317 collate_variants( $collation, @readings )
 318
 319 Given a set of readings in the form
 320 ( lemma_start, lemma_end, rdg1_start, rdg1_end, ... )
 321 walks through each to identify those readings that are identical.  The
 322 collation is a Text::Tradition::Collation object; the elements of
 323 @readings are Text::Tradition::Collation::Reading objects that appear
 324 on the collation graph.
 325
 326 TODO: Handle collapsed and non-collapsed transpositions.
 327
 328 =cut
 329
 330 sub collate_variants {
 331     my( $collation, @readings ) = @_;
 332     my $lemma_start = shift @readings;
 333     my $lemma_end = shift @readings;
 334     my $detranspose = 1;
 335
 336     # Start the list of distinct readings with those readings in the lemma.
 337     my @distinct_readings;
 338     while( $lemma_start ne $lemma_end ) {
 339         push( @distinct_readings, [ $lemma_start, $collation->baselabel ] );
 340         $lemma_start = $collation->next_reading( $lemma_start );
 341     }
 342     push( @distinct_readings, [ $lemma_end, $collation->baselabel ] );
 343
 344
 345     while( scalar @readings ) {
 346         my( $var_start, $var_end ) = splice( @readings, 0, 2 );
 347
 348         # I want to look at the readings in the variant and lemma, and
 349         # collapse readings that are the same word.  This is mini-collation.
 350         # Each word in the 'main' list can only be collapsed once with a
 351         # word from the current reading.
 352         my %collapsed = ();
 353
 354         # Get the variant witnesses.  They will all be going along the
 355         # same path, so just use the first one as representative for
 356         # the purpose of following the path.
 357         my @var_wits = map { $_->label } $var_start->outgoing();
 358         my $rep_wit = $var_wits[0];
 359
 360         my @variant_readings;
 361         while( $var_start ne $var_end ) {
 362             push( @variant_readings, $var_start );
 363             $var_start = $collation->next_reading( $var_start, $rep_wit );
 364         }
 365         push( @variant_readings, $var_end );
 366
 367         # Go through the variant readings, and if we find a lemma reading that
 368         # hasn't yet been collapsed with a reading, equate them.  If we do
 369         # not, keep them to push onto the end of all_readings.
 370         # TODO replace this with proper mini-collation
 371         my @remaining_readings;
 372         my $last_index = 0;
 373         my $curr_pos = 0;
 374         foreach my $w ( @variant_readings ) {
 375             my $word = $w->label();
 376             my $matched = 0;
 377             foreach my $idx ( $last_index .. $#distinct_readings ) {
 378                 my( $l, $pathlabel ) = @{$distinct_readings[$idx]};
 379                 if( $word eq cmp_str( $l ) ) {
 380                     next if exists( $collapsed{ $l->label } )
 381                         && $collapsed{ $l->label } eq $l;
 382                     $matched = 1;
 383                     $last_index = $idx if $detranspose;
 384                     # Collapse the readings.
 385                     printf STDERR "Merging readings %s/%s and %s/%s\n",
 386                         $l->name, $l->label, $w->name, $w->label;
 387                     $collation->merge_readings( $l, $w );
 388                     $collapsed{ $l->label } = $l;
 389                     # Now collapse any multiple paths to and from the reading.
 390                     remove_duplicate_paths( $collation,
 391                                     $collation->prior_reading( $l, $rep_wit ), $l );
 392                     remove_duplicate_paths( $collation, $l,
 393                                     $collation->next_reading( $l, $rep_wit ) );
 394                     last;
 395                 }
 396             }
 397             push( @remaining_readings, [ $w, $rep_wit ] ) unless $matched;
 398         }
 399         push( @distinct_readings, @remaining_readings) if scalar( @remaining_readings );
 400     }
 401 }
 402
 403 =item B<remove_duplicate_paths>
 404
 405 remove_duplicate_paths( $collation, $from, $to );
 406
 407 Given two readings, reduce the number of paths between those readings to
 408 a set of unique paths.
 409
 410 =cut
 411
 412 # TODO wonder if this is necessary
 413 sub remove_duplicate_paths {
 414     my( $collation, $from, $to ) = @_;
 415     my %seen_paths;
 416     foreach my $p ( $from->edges_to( $to ) ) {
 417         if( exists $seen_paths{$p->name} ) {
 418             $collation->del_path( $p );
 419         } else {
 420             $seen_paths{$p->name} = 1;
 421         }
 422     }
 423 }
 424
 425 # Helper function. Given a witness sigil, if it is a post-correctione
 426 # sigil,return the base witness.  If not, return a false value.
 427 sub _is_post_corr {
 428     my( $sigil ) = @_;
 429     if( $sigil =~ /^(.*?)(\s*\(?p\.\s*c\.\)?)$/ ) {
 430         return $1;
 431     }
 432     return undef;
 433 }
 434
 435 =item B<cmp_str>
 436
 437 Pretend you never saw this method.  Really it needs to not be hardcoded.
 438
 439 =cut
 440
 441 sub cmp_str {
 442     my( $reading ) = @_;
 443     my $word = $reading->label();
 444     $word = lc( $word );
 445     $word =~ s/\W//g;
 446     $word =~ s/v/u/g;
 447     $word =~ s/j/i/g;
 448     $word =~ s/cha/ca/g;
 449     $word =~ s/quatuor/quattuor/g;
 450     $word =~ s/ioannes/iohannes/g;
 451     return $word;
 452 }
 453
 454 =back
 455
 456 =head1 LICENSE
 457
 458 This package is free software and is provided "as is" without express
 459 or implied warranty.  You can redistribute it and/or modify it under
 460 the same terms as Perl itself.
 461
 462 =head1 AUTHOR
 463
 464 Tara L Andrews, aurum@cpan.org
 465
 466 =cut
 467
 468 1;