morphology/lib/Text/Tradition/Language/Base.pm

   1 package Text::Tradition::Language::Base;
   2
   3 use strict;
   4 use warnings;
   5 use Encode qw/ encode_utf8 decode_utf8 /;
   6 use Exporter 'import';
   7 use vars qw/ @EXPORT_OK /;
   8 use IPC::Run qw/ run /;
   9 use Module::Load;
  10 use Text::Tradition::Collation::Reading::Lexeme;
  11 use Text::Tradition::Collation::Reading::WordForm;
  12 use TryCatch;
  13 use Unicode::Normalize;
  14
  15 @EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags
  16         unicode_regularize /;
  17
  18 =head1 NAME
  19
  20 Text::Tradition::Language::Base - Base subroutines for lemmatization of words
  21
  22 =head1 DESCRIPTION
  23
  24 Common routines for applying morphological tagging to a Text::Tradition. Used
  25 with callbacks from the named language packages.
  26
  27 =head1 SUBROUTINES
  28
  29 =head2 lemmatize_treetagger( $tradition )
  30
  31 Evaluates the tradition with the given options, and returns the results.
  32
  33 =cut
  34
  35 sub lemmatize_treetagger {
  36         my( $tradition, %opts ) = @_;
  37
  38         # Given a tradition, lemmatize it witness by witness and see what we get.
  39         my $c = $tradition->collation;
  40         # First, clear out all existing lexemes from the readings.
  41         my %witness_paths = _clear_reading_lexemes( $tradition );
  42
  43         foreach my $sig ( keys %witness_paths ) {
  44                 # Get the text as a sequence of readings and as a string
  45                 my %witopts = (
  46                         'path' => $witness_paths{$sig},
  47                         %opts
  48                         );
  49                 _lemmatize_treetagger_sequence( %witopts );
  50         }
  51 }
  52
  53 sub _clear_reading_lexemes {
  54         my $tradition = shift;
  55                 my $c = $tradition->collation;
  56         # Clear out all existing lexemes from the readings. Save the path as long
  57         # as we went to the trouble of generating it.
  58         my %witness_paths;
  59         foreach my $wit ( $tradition->witnesses ) {
  60                 my @sigla = ( $wit->sigil );
  61                 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
  62                 foreach my $sig ( @sigla ) {
  63                         my @path = grep { !$_->is_meta }
  64                                 $c->reading_sequence( $c->start, $c->end, $sig );
  65                         map { $_->clear_lexemes } @path;
  66                         $witness_paths{$sig} = \@path;
  67                 }
  68         }
  69         return %witness_paths;
  70 }
  71
  72 =head2 reading_lookup( $rdg[, $rdg, ...] )
  73
  74 Looks up one or more readings using the Flemm package, and returns the
  75 possible results.  This uses the same logic as L<lemmatize> above for the
  76 entire tradition, but can also be used to (re-)analyze individual readings.
  77
  78 =cut
  79
  80 sub reading_lookup_treetagger {
  81         my %opts = @_;
  82         $opts{'replace'} = 1;
  83         return _lemmatize_treetagger_sequence( %opts );
  84 }
  85
  86 sub _lemmatize_treetagger_sequence {
  87         my %opts = @_;
  88         my @path = @{$opts{'path'}};
  89         my $tagresult = _treetag_string( _text_from_path( 1, @path ), $opts{'language'} );
  90         if( $tagresult ) {
  91                 # Map the tagged words onto the original readings, splitting
  92                 # them up into lexemes where necessary.
  93                 # NOTE we can have multiple lexemes in a reading, but not
  94                 # multiple readings to a lexeme.
  95                 my @tags = split( /\n/, $tagresult );
  96                 my @lexemes;
  97                 my $curr_rdg = shift @path;
  98                 my @curr_lexemes;
  99                 my $unused_rdg_part;
 100                 foreach my $tag ( @tags ) {
 101                         # Get the original word
 102                         my( $lexeme, @rest ) = split( /\t/, $tag );
 103                         # Lemmatize the whole
 104                         # TODO error trap this
 105                         my @forms = $opts{'callback'}( $tag );
 106
 107                         my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
 108                                 'string' => $lexeme, 'language' => $opts{'language'},
 109                                 'wordform_matchlist' => \@forms );
 110                         # Find the next non-meta reading
 111                         while( $curr_rdg && $curr_rdg->is_meta ) {
 112                                 $curr_rdg = shift @path;
 113                         }
 114                         unless( $curr_rdg ) {
 115                                 warn "Ran out of readings in sequence at $lexeme";
 116                                 last;
 117                         }
 118                         my $curr_rdg_text = $curr_rdg->normal_form;
 119                         if( $unused_rdg_part &&
 120                                 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 121                                 # Nth part of curr_rdg
 122                                 $unused_rdg_part = $2;
 123                                 push( @curr_lexemes, $lexobj );
 124                         } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 125                                 # Flag an error if there is already an unused reading part.
 126                                 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
 127                                         if $unused_rdg_part;
 128                                 $unused_rdg_part = $2; # will be empty if the whole reading matched
 129                                 push( @curr_lexemes, $lexobj );
 130                         } else {
 131                                 # We do not cope with the idea of a lexeme being
 132                                 # spread across multiple readings.
 133                                 warn "Word sequence changed unexpectedly in text";
 134                                 # See if we can find a matching reading
 135                                 my @lookahead;
 136                                 my $matched;
 137                                 while( my $nr = shift @path ) {
 138                                         my $nrtext = $nr->normal_form;
 139                                         if( $nrtext =~ /^\Q$lexeme\E/ ) {
 140                                                 $curr_rdg = $lookahead[-1] if @lookahead;
 141                                                 $matched = 1;
 142                                                 last;
 143                                         } else {
 144                                                 push( @lookahead, $nr );
 145                                         }
 146                                 }
 147                                 # No match? Restore the state we had
 148                                 unless( $matched ) {
 149                                         unshift( @path, @lookahead );
 150                                 }
 151                                 # Trigger a move
 152                                 $unused_rdg_part = '';
 153                         }
 154
 155                         unless( $unused_rdg_part ) {
 156                                 # Record the lexemes for the given reading.
 157                                 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
 158                                 #       join( ' ', map { $_->string } @curr_lexemes ),
 159                                 #       $curr_rdg->id, $curr_rdg->text );
 160                                 _update_reading_lexemes( $opts{replace}, $curr_rdg, @curr_lexemes );
 161                                 $curr_rdg = shift @path;
 162                                 @curr_lexemes = ();
 163                         }
 164                 }
 165         }
 166 }
 167
 168 sub _update_reading_lexemes {
 169         my( $replace, $reading, @lexemes ) = @_;
 170         if( $reading->has_lexemes && !$replace ) {
 171                 # We need to merge what is in @lexemes with what we have already.
 172                 my @oldlex = $reading->lexemes;
 173                 my $cmp1 = join( '||', map { $_->string } @oldlex );
 174                 my $cmp2 = join( '||', map { $_->string } @lexemes );
 175                 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
 176                         # The lexeme strings are the same, so merge the possible
 177                         # word forms from new to old.
 178                         foreach my $i ( 0 .. $#lexemes ) {
 179                                 my $ol = $oldlex[$i];
 180                                 my $nl = $lexemes[$i];
 181                                 my %ofw;
 182                                 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
 183                                 foreach my $form ( $nl->matching_forms ) {
 184                                         unless( $ofw{$form->to_string} ) {
 185                                                 # print STDERR "Adding form " . $form->to_string .
 186                                                 #       " to lexeme " . $nl->string . " at $reading\n";
 187                                                 $ol->add_matching_form( $form );
 188                                                 $ol->is_disambiguated(0);
 189                                         }
 190                                 }
 191                         }
 192                 } else {
 193                         warn "Lexeme layout for $reading changed; replacing the lot";
 194                         $reading->clear_lexemes;
 195                         $reading->add_lexeme( @lexemes );
 196                 }
 197         } else {
 198                 $reading->clear_lexemes if $replace;
 199                 $reading->add_lexeme( @lexemes );
 200         }
 201 }
 202
 203 # Utility function so that we can cheat and use it when we need both the path
 204 # and its text.
 205 sub _text_from_path {
 206         my( $normalize, @path ) = @_;
 207         my $pathtext = '';
 208         my $last;
 209         foreach my $r ( @path ) {
 210                 unless ( $r->join_prior || !$last || $last->join_next ) {
 211                         $pathtext .= ' ';
 212                 }
 213                 $pathtext .= $normalize ? $r->normal_form : $r->text;
 214                 $last = $r;
 215         }
 216         return $pathtext;
 217 }
 218
 219 # Utility function that actually calls the tree tagger.
 220 sub _treetag_string {
 221         my( $text, $lang ) = @_;
 222         my $wittext = encode_utf8( $text );
 223         # Then see if we have TreeTagger
 224         try {
 225                 load( 'Lingua::TreeTagger' );
 226         } catch {
 227                 warn "Cannot run TreeTagger without Lingua::TreeTagger module";
 228                 return '';
 229         }
 230         # OK, we can run it then.
 231         # First upgrade to UTF8 for necessary languages.
 232         my @utf8_supported = qw/ French Latin Greek /;
 233         my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
 234         if( grep { $_ eq $lang } @utf8_supported ) {
 235                 $ttopts{'use_utf8'} = 1;
 236         }
 237         # Now instantiate and run the tagger.
 238         my $tagger = Lingua::TreeTagger->new( %ttopts );
 239         my $tagresult = $tagger->tag_text( \$text );
 240
 241         # TODO maybe send the tokens back rather than the interpreted string...
 242         return $tagresult->as_text();
 243 }
 244
 245 =head2 lfs_morph_tags
 246
 247 Return a data structure describing the available parts of speech and their attributes
 248 from the Lingua::Features::Structure class currently defined.
 249
 250 =cut
 251
 252 sub lfs_morph_tags {
 253         load('Lingua::Features::StructureType');
 254         my $tagset = { 'structures' => [], 'features' => {} };
 255         foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
 256                 my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
 257                 foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
 258                         my $ftype = $lfs->feature_type( $ftid );
 259                         if( !$ftype && $lfs->base ) {
 260                                 $ftype = $lfs->base->feature_type( $ftid );
 261                         }
 262                         if( $ftype ) {
 263                                 push( @{$tsstruct->{'use_features'}}, $ftid );
 264                                 if( $ftid eq 'type' ) {
 265                                         # Type values change according to category
 266                                         $ftid .= " (" . $lfs->id . ")";
 267                                 }
 268                                 my $tfstruct = { 'id' => $ftid, 'values' => [] };
 269                                 foreach my $fval( $ftype->values ) {
 270                                         push( @{$tfstruct->{'values'}},
 271                                                 { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
 272                                 }
 273                                 $tagset->{'features'}->{$ftid} = $tfstruct;
 274                         }
 275                 }
 276                 push( @{$tagset->{'structures'}}, $tsstruct );
 277         }
 278         return $tagset;
 279 }
 280
 281 sub _by_structid {
 282         my( $a, $b ) = @_;
 283         return -1 if $a eq 'cat';
 284         return 1 if $b eq 'cat';
 285         return $a cmp $b;
 286 }
 287
 288 =head2 unicode_regularize( $word )
 289
 290 Returns a lowercased and accent-stripped version of the word.
 291
 292 =cut
 293
 294 sub unicode_regularize {
 295         my $word = shift;
 296         my @normalized;
 297         my @letters = split( '', lc( $word ) );
 298         foreach my $l ( @letters ) {
 299                 my $d = chr( ord( NFKD( $l ) ) );
 300                 next unless $d =~ /[[:alnum:]]/; # toss out e.g. Greek underdots
 301                 push( @normalized, $d );
 302         }
 303         return join( '', @normalized );
 304 }
 305
 306 1;
 307
 308 =head2 TODO
 309
 310 =over
 311
 312 =item * Handle package dependencies more gracefully
 313
 314 =back
 315