lib/Text/Tradition/Language/Base.pm

   1 package Text::Tradition::Language::Base;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ encode_utf8 decode_utf8 /;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 use IPC::Run qw/ run /;
   9 use Lingua::TagSet::Multext;
  10 use Lingua::TagSet::TreeTagger;
  11 use Module::Load;
  12 use Text::Tradition::Collation::Reading::Lexeme;
  13 use Text::Tradition::Collation::Reading::WordForm;
  14 use TryCatch;
  15
  16 @EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct
  17         multext_struct lfs_morph_tags /;
  18
  19 =head1 NAME
  20
  21 Text::Tradition::Language::Base - Base subroutines for lemmatization of words
  22
  23 =head1 DESCRIPTION
  24
  25 Common routines for applying morphological tagging to a Text::Tradition. Used
  26 with callbacks from the named language packages.
  27
  28 =head1 SUBROUTINES
  29
  30 =head2 lemmatize_treetagger( $tradition )
  31
  32 Evaluates the tradition with the given options, and returns the results.
  33
  34 =cut
  35
  36 sub lemmatize_treetagger {
  37         my( $tradition, %opts ) = @_;
  38
  39         # Given a tradition, lemmatize it witness by witness and see what we get.
  40         my $c = $tradition->collation;
  41         # First, clear out all existing lexemes from the readings.
  42         my %witness_paths = _clear_reading_lexemes( $tradition );
  43
  44         foreach my $sig ( keys %witness_paths ) {
  45                 # Get the text as a sequence of readings and as a string
  46                 my %witopts = (
  47                         'path' => $witness_paths{$sig},
  48                         %opts
  49                         );
  50                 _lemmatize_treetagger_sequence( %witopts );
  51         }
  52 }
  53
  54 sub _clear_reading_lexemes {
  55         my $tradition = shift;
  56                 my $c = $tradition->collation;
  57         # Clear out all existing lexemes from the readings. Save the path as long
  58         # as we went to the trouble of generating it.
  59         my %witness_paths;
  60         foreach my $wit ( $tradition->witnesses ) {
  61                 my @sigla = ( $wit->sigil );
  62                 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
  63                 foreach my $sig ( @sigla ) {
  64                         my @path = grep { !$_->is_meta }
  65                                 $c->reading_sequence( $c->start, $c->end, $sig );
  66                         map { $_->clear_lexemes } @path;
  67                         $witness_paths{$sig} = \@path;
  68                 }
  69         }
  70         return %witness_paths;
  71 }
  72
  73 =head2 reading_lookup( $rdg[, $rdg, ...] )
  74
  75 Looks up one or more readings using the Flemm package, and returns the
  76 possible results.  This uses the same logic as L<lemmatize> above for the
  77 entire tradition, but can also be used to (re-)analyze individual readings.
  78
  79 =cut
  80
  81 sub reading_lookup_treetagger {
  82         my %opts = @_;
  83         $opts{'replace'} = 1;
  84         return _lemmatize_treetagger_sequence( %opts );
  85 }
  86
  87 sub _lemmatize_treetagger_sequence {
  88         my %opts = @_;
  89         my @path = @{$opts{'path'}};
  90         my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
  91         if( $tagresult ) {
  92                 # Map the tagged words onto the original readings, splitting
  93                 # them up into lexemes where necessary.
  94                 # NOTE we can have multiple lexemes in a reading, but not
  95                 # multiple readings to a lexeme.
  96                 my @tags = split( /\n/, $tagresult );
  97                 my @lexemes;
  98                 my $curr_rdg = shift @path;
  99                 my @curr_lexemes;
 100                 my $unused_rdg_part;
 101                 foreach my $tag ( @tags ) {
 102                         # Get the original word
 103                         my( $lexeme, @rest ) = split( /\t/, $tag );
 104                         # Lemmatize the whole
 105                         # TODO error trap this
 106                         my @forms = $opts{'callback'}( $tag );
 107
 108                         my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
 109                                 'string' => $lexeme, 'language' => $opts{'language'},
 110                                 'wordform_matchlist' => \@forms );
 111                         # Find the next non-meta reading
 112                         while( $curr_rdg && $curr_rdg->is_meta ) {
 113                                 $curr_rdg = shift @path;
 114                         }
 115                         unless( $curr_rdg ) {
 116                                 warn "Ran out of readings in sequence at $lexeme";
 117                                 last;
 118                         }
 119                         my $curr_rdg_text = $curr_rdg->has_normal_form
 120                                 ? $curr_rdg->normal_form : $curr_rdg->text;
 121                         if( $unused_rdg_part &&
 122                                 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 123                                 # Nth part of curr_rdg
 124                                 $unused_rdg_part = $2;
 125                                 push( @curr_lexemes, $lexobj );
 126                         } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 127                                 # Flag an error if there is already an unused reading part.
 128                                 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
 129                                         if $unused_rdg_part;
 130                                 $unused_rdg_part = $2; # will be empty if the whole reading matched
 131                                 push( @curr_lexemes, $lexobj );
 132                         } else {
 133                                 # We do not cope with the idea of a lexeme being
 134                                 # spread across multiple readings.
 135                                 warn "Word sequence changed unexpectedly in text";
 136                                 # See if we can find a matching reading
 137                                 my @lookahead;
 138                                 my $matched;
 139                                 while( my $nr = shift @path ) {
 140                                         my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
 141                                         if( $nrtext =~ /^\Q$lexeme\E/ ) {
 142                                                 $curr_rdg = $lookahead[-1] if @lookahead;
 143                                                 $matched = 1;
 144                                                 last;
 145                                         } else {
 146                                                 push( @lookahead, $nr );
 147                                         }
 148                                 }
 149                                 # No match? Restore the state we had
 150                                 unless( $matched ) {
 151                                         unshift( @path, @lookahead );
 152                                 }
 153                                 # Trigger a move
 154                                 $unused_rdg_part = '';
 155                         }
 156
 157                         unless( $unused_rdg_part ) {
 158                                 # Record the lexemes for the given reading.
 159                                 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
 160                                 #       join( ' ', map { $_->string } @curr_lexemes ),
 161                                 #       $curr_rdg->id, $curr_rdg->text );
 162                                 _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
 163                                 $curr_rdg = shift @path;
 164                                 @curr_lexemes = ();
 165                         }
 166                 }
 167         }
 168 }
 169
 170 sub _update_reading_lexemes {
 171         my( $replace, $reading, @lexemes ) = @_;
 172         if( $reading->has_lexemes && !$replace ) {
 173                 # We need to merge what is in @lexemes with what we have already.
 174                 my @oldlex = $reading->lexemes;
 175                 my $cmp1 = join( '||', map { $_->string } @oldlex );
 176                 my $cmp2 = join( '||', map { $_->string } @lexemes );
 177                 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
 178                         # The lexeme strings are the same, so merge the possible
 179                         # word forms from new to old.
 180                         foreach my $i ( 0 .. $#lexemes ) {
 181                                 my $ol = $oldlex[$i];
 182                                 my $nl = $lexemes[$i];
 183                                 my %ofw;
 184                                 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
 185                                 foreach my $form ( $nl->matching_forms ) {
 186                                         unless( $ofw{$form->to_string} ) {
 187                                                 # print STDERR "Adding form " . $form->to_string .
 188                                                 #       " to lexeme " . $nl->string . " at $reading\n";
 189                                                 $ol->add_matching_form( $form );
 190                                                 $ol->is_disambiguated(0);
 191                                         }
 192                                 }
 193                         }
 194                 } else {
 195                         warn "Lexeme layout for $reading changed; replacing the lot";
 196                         $reading->clear_lexemes;
 197                         $reading->add_lexeme( @lexemes );
 198                 }
 199         } else {
 200                 $reading->clear_lexemes if $replace;
 201                 $reading->add_lexeme( @lexemes );
 202         }
 203 }
 204
 205 # Utility function so that we can cheat and use it when we need both the path
 206 # and its text.
 207 sub _text_from_path {
 208         my( $normalize, @path ) = @_;
 209         my $pathtext = '';
 210         my $last;
 211         foreach my $r ( @path ) {
 212                 unless ( $r->join_prior || !$last || $last->join_next ) {
 213                         $pathtext .= ' ';
 214                 }
 215                 $pathtext .= ( $normalize && $r->has_normal_form )
 216                         ? $r->normal_form : $r->text;
 217                 $last = $r;
 218         }
 219         return $pathtext;
 220 }
 221
 222 # Utility function that actually calls the tree tagger.
 223 sub _treetag_string {
 224         my( $text, $lang ) = @_;
 225         my $wittext = encode_utf8( $text );
 226         # Then see if we have TreeTagger
 227         try {
 228                 load( 'Lingua::TreeTagger' );
 229         } catch {
 230                 warn "Cannot run TreeTagger without Lingua::TreeTagger module";
 231                 return '';
 232         }
 233         # OK, we can run it then.
 234         # First upgrade to UTF8 for necessary languages.
 235         my @utf8_supported = qw/ French /;
 236         my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
 237         if( grep { $_ eq $lang } @utf8_supported ) {
 238                 $ttopts{'use_utf8'} = 1;
 239         }
 240         # Now instantiate and run the tagger.
 241         my $tagger = Lingua::TreeTagger->new( %ttopts );
 242         my $tagresult = $tagger->tag_text( \$text );
 243
 244         # TODO maybe send the tokens back rather than the interpreted string...
 245         return $tagresult->as_text();
 246 }
 247
 248 ## HACK function to correct for TagSet::TreeTagger brokenness
 249 sub treetagger_struct {
 250         my $pos = shift;
 251         $pos =~ s/PREP/PRP/;
 252         return Lingua::TagSet::TreeTagger->tag2structure( $pos );
 253 }
 254
 255 sub multext_struct {
 256         my $pos = shift;
 257         # No known hacks needed
 258         return Lingua::TagSet::Multext->tag2structure( $pos );
 259 }
 260
 261 =head2 lfs_morph_tags
 262
 263 Return a data structure describing the available parts of speech and their attributes
 264 from the Lingua::Features::Structure class currently defined.
 265
 266 =cut
 267
 268 sub lfs_morph_tags {
 269         load('Lingua::Features::StructureType');
 270         my $tagset = { 'structures' => [], 'features' => {} };
 271         foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
 272                 my $tsstruct = { 'id' => $lfs->id, 'use_features' => [] };
 273                 foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
 274                         my $ftype = $lfs->feature_type( $ftid );
 275                         my $tfstruct = { 'id' => $ftid, 'values' => [] };
 276                         foreach my $fval( $ftype->values ) {
 277                                 push( @{$tfstruct->{'values'}},
 278                                         { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
 279                         }
 280                         push( @{$tsstruct->{'use_features'}}, $ftid );
 281                         $tagset->{'features'}->{$ftid} = $tfstruct;
 282                 }
 283                 push( @{$tagset->{'structures'}}, $tsstruct );
 284         }
 285         return $tagset;
 286 }
 287
 288 sub _by_structid {
 289         my( $a, $b ) = @_;
 290         return -1 if $a eq 'cat';
 291         return 1 if $b eq 'cat';
 292         return $a cmp $b;
 293 }
 294
 295 1;
 296
 297 =head2 TODO
 298
 299 =over
 300
 301 =item * Handle package dependencies more gracefully
 302
 303 =back
 304
 305 =head1 LICENSE
 306
 307 This package is free software and is provided "as is" without express
 308 or implied warranty.  You can redistribute it and/or modify it under
 309 the same terms as Perl itself.
 310
 311 =head1 AUTHOR
 312
 313 Tara L Andrews E<lt>aurum@cpan.orgE<gt>