1 package Text::Tradition::Language::English;
5 use Encode qw/ encode_utf8 decode_utf8 /;
6 use IPC::Run qw/ run /;
7 use Lingua::TagSet::TreeTagger;
9 use Text::Tradition::Collation::Reading::Lexeme;
10 use Text::Tradition::Collation::Reading::WordForm;
13 my $MORPHDIR = '/Users/tla/Projects/morphology';
17 Text::Tradition::Language::English - language-specific module for English
21 Implements morphology lookup for English words in context. This module
22 depends on the TreeTagger software
23 (L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
24 (for now) expected to be installed in $MORPHDIR/TreeTagger.
28 =head2 lemmatize( $text )
30 Evaluates the string using the TreeTagger, and returns the results.
34 binmode STDOUT, ':utf8';
36 use_ok( 'Text::Tradition::Language::English' );
43 my $tradition = shift;
45 # Given a tradition, lemmatize it witness by witness and see what we get.
46 my $workdir = File::Temp->newdir();
47 my $c = $tradition->collation;
48 # First, clear out all existing lexemes from the readings. Save the
49 # path as long as we went to the trouble of generating it.
51 foreach my $wit ( $tradition->witnesses ) {
52 my @sigla = ( $wit->sigil );
53 push( @sigla, $wit->sigil . $c->ac_label ) if $wit->is_layered;
54 foreach my $sig ( @sigla ) {
55 my @path = grep { !$_->is_meta }
56 $c->reading_sequence( $c->start, $c->end, $sig );
57 map { $_->clear_lexemes } @path;
58 $witness_paths{$sig} = \@path;
62 foreach my $sig ( keys %witness_paths ) {
63 # Get the text as a sequence of readings and as a string
64 print STDERR "Morphologizing witness $sig\n";
65 _lemmatize_sequence( undef, @{$witness_paths{$sig}} );
69 sub _update_reading_lexemes {
70 my( $replace, $reading, @lexemes ) = @_;
71 if( $reading->has_lexemes && !$replace ) {
72 # We need to merge what is in @lexemes with what we have already.
73 my @oldlex = $reading->lexemes;
74 my $cmp1 = join( '||', map { $_->string } @oldlex );
75 my $cmp2 = join( '||', map { $_->string } @lexemes );
76 if ( @oldlex == @lexemes && $cmp1 eq $cmp2 ) {
77 # The lexeme strings are the same, so merge the possible
78 # word forms from new to old.
79 foreach my $i ( 0 .. $#lexemes ) {
81 my $nl = $lexemes[$i];
83 map { $ofw{$_->to_string} = 1 } $ol->matching_forms;
84 foreach my $form ( $nl->matching_forms ) {
85 unless( $ofw{$form->to_string} ) {
86 print STDERR "Adding form " . $form->to_string .
87 " to lexeme " . $nl->string . " at $reading\n";
88 $ol->add_matching_form( $form );
89 $ol->is_disambiguated(0);
94 warn "Lexeme layout for $reading changed; replacing the lot";
95 $reading->clear_lexemes;
96 $reading->add_lexeme( @lexemes );
99 $reading->clear_lexemes if $replace;
100 $reading->add_lexeme( @lexemes );
104 =head2 reading_lookup( $rdg[, $rdg, ...] )
106 Looks up one or more readings using the Flemm package, and returns the
107 possible results. This uses the same logic as L<lemmatize> above for the
108 entire tradition, but can also be used to (re-)analyze individual readings.
113 return _lemmatize_sequence( 1, @_ );
116 sub _lemmatize_sequence {
117 my( $replace, @path ) = @_;
118 my $tagresult = _treetag_string( _text_from_path( 1, @path ) );
120 # Map the tagged words onto the original readings, splitting
121 # them up into lexemes where necessary.
122 # NOTE we can have multiple lexemes in a reading, but not
123 # multiple readings to a lexeme.
124 my @tags = split( /\n/, $tagresult );
126 my $curr_rdg = shift @path;
129 foreach my $tag ( @tags ) {
130 # Get the original word
131 my( $lexeme, @rest ) = split( /\t/, $tag );
132 # Lemmatize the whole
133 my @forms = _parse_wordform( $tag );
134 my $lexobj = Text::Tradition::Collation::Reading::Lexeme->new(
135 'string' => $lexeme, 'language' => 'English',
136 'wordform_matchlist' => \@forms );
137 # Find the next non-meta reading
138 while( $curr_rdg && $curr_rdg->is_meta ) {
139 $curr_rdg = shift @path;
141 unless( $curr_rdg ) {
142 warn "Ran out of readings in sequence at $lexeme";
145 my $curr_rdg_text = $curr_rdg->has_normal_form
146 ? $curr_rdg->normal_form : $curr_rdg->text;
147 if( $unused_rdg_part &&
148 $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
149 # Nth part of curr_rdg
150 $unused_rdg_part = $2;
151 push( @curr_lexemes, $lexobj );
152 } elsif( $curr_rdg_text =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
153 # Flag an error if there is already an unused reading part.
154 warn "Skipped over unused text $unused_rdg_part at $curr_rdg"
156 $unused_rdg_part = $2; # will be empty if the whole reading matched
157 push( @curr_lexemes, $lexobj );
159 # We do not cope with the idea of a lexeme being
160 # spread across multiple readings.
161 warn "Word sequence changed unexpectedly in text";
162 # See if we can find a matching reading
165 while( my $nr = shift @path ) {
166 my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
167 if( $nrtext =~ /^\Q$lexeme\E/ ) {
168 $curr_rdg = $lookahead[-1] if @lookahead;
172 push( @lookahead, $nr );
175 # No match? Restore the state we had
177 unshift( @path, @lookahead );
180 $unused_rdg_part = '';
183 unless( $unused_rdg_part ) {
184 # Record the lexemes for the given reading.
185 #print STDERR sprintf( "Adding lexeme(s) %s to reading %s (%s)\n",
186 # join( ' ', map { $_->string } @curr_lexemes ),
187 # $curr_rdg->id, $curr_rdg->text );
188 _update_reading_lexemes( $replace, $curr_rdg, @curr_lexemes );
189 $curr_rdg = shift @path;
196 # Utility function so that we can cheat and use it when we need both the path
198 sub _text_from_path {
199 my( $normalize, @path ) = @_;
202 foreach my $r ( @path ) {
203 unless ( $r->join_prior || !$last || $last->join_next ) {
206 $pathtext .= ( $normalize && $r->has_normal_form )
207 ? $r->normal_form : $r->text;
213 # Utility function that actually calls the tree tagger.
214 sub _treetag_string {
216 my $wittext = encode_utf8( $text );
217 # Then see if we have TreeTagger
218 my $taggercmd = "$MORPHDIR/TreeTagger/cmd/tree-tagger-english";
219 unless( -f $taggercmd ) {
220 warn "Cannot do English word lemmatization without TreeTagger";
223 # OK, we can run it then.
224 my @cmd = ( $taggercmd );
225 my( $tagresult, $err ); # Capture the output and error
226 run( \@cmd, \$wittext, \$tagresult, \$err );
227 # TODO check for error
228 return decode_utf8( $tagresult );
231 # Utility function to turn a TreeTagger result into a WordForm
232 sub _parse_wordform {
233 my $tagresult = shift;
234 my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
235 my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
237 return Text::Tradition::Collation::Reading::WordForm->new(
238 'language' => 'English',
240 'morphology' => $morphobj,
243 warn "No morphology found for word: $_";
253 =item * Handle package dependencies more gracefully
255 =item * Refactor English/French use of TreeTagger into its own util package
261 This package is free software and is provided "as is" without express
262 or implied warranty. You can redistribute it and/or modify it under
263 the same terms as Perl itself.
267 Tara L Andrews E<lt>aurum@cpan.orgE<gt>