requires( 'YAML::XS' );
# For the morphology stuff
requires( 'Lingua::TagSet::Multext' );
-requires( 'Lingua::TagSet::TreeTagger' );
+requires( 'Lingua::TagSet::TreeTagger::French' );
+requires( 'Lingua::TagSet::TreeTagger::English' );
requires( 'Lingua::Features::Structure' );
build_requires( 'Test::Warn' );
# Modules needed for morphology but not trivially CPANnable
use Exporter 'import';
use vars qw/ @EXPORT_OK /;
use IPC::Run qw/ run /;
-use Lingua::TagSet::Multext;
-use Lingua::TagSet::TreeTagger;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;
-@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct
- multext_struct lfs_morph_tags /;
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
=head1 NAME
return $tagresult->as_text();
}
-## HACK function to correct for TagSet::TreeTagger brokenness
-sub treetagger_struct {
- my $pos = shift;
- $pos =~ s/PREP/PRP/;
- return Lingua::TagSet::TreeTagger->tag2structure( $pos );
-}
-
-sub multext_struct {
- my $pos = shift;
- # No known hacks needed
- return Lingua::TagSet::Multext->tag2structure( $pos );
-}
-
=head2 lfs_morph_tags
Return a data structure describing the available parts of speech and their attributes
load('Lingua::Features::StructureType');
my $tagset = { 'structures' => [], 'features' => {} };
foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
- my $tsstruct = { 'id' => $lfs->id, 'use_features' => [] };
+ my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
my $ftype = $lfs->feature_type( $ftid );
- my $tfstruct = { 'id' => $ftid, 'values' => [] };
- foreach my $fval( $ftype->values ) {
- push( @{$tfstruct->{'values'}},
- { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
+ if( !$ftype && $lfs->base ) {
+ $ftype = $lfs->base->feature_type( $ftid );
+ }
+ if( $ftype ) {
+ push( @{$tsstruct->{'use_features'}}, $ftid );
+ if( $ftid eq 'type' ) {
+ # Type values change according to category
+ $ftid .= " (" . $lfs->id . ")";
+ }
+ my $tfstruct = { 'id' => $ftid, 'values' => [] };
+ foreach my $fval( $ftype->values ) {
+ push( @{$tfstruct->{'values'}},
+ { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
+ }
+ $tagset->{'features'}->{$ftid} = $tfstruct;
}
- push( @{$tsstruct->{'use_features'}}, $ftid );
- $tagset->{'features'}->{$ftid} = $tfstruct;
}
push( @{$tagset->{'structures'}}, $tsstruct );
}
use strict;
use warnings;
+use Lingua::TagSet::TreeTagger::English;
use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger
lfs_morph_tags /;
use TryCatch;
sub _parse_wordform {
my $tagresult = shift;
my( $orig, $tag, $lemma ) = split( /\t/, $tagresult );
- my $morphobj = Lingua::TagSet::TreeTagger->tag2structure( $tag );
+ return () unless $tag =~ /\w/; # skip punct-only "tags"
+ my $morphobj = Lingua::TagSet::TreeTagger::English->tag2structure( $tag );
if( $morphobj ) {
- return Text::Tradition::Collation::Reading::WordForm->new(
+ return ( Text::Tradition::Collation::Reading::WordForm->new(
'language' => 'English',
'lemma' => $lemma,
'morphology' => $morphobj,
- );
+ ) );
} else {
- warn "No morphology found for word: $_";
+ warn "No morphology found for word: $tagresult";
+ return ();
}
}
use strict;
use warnings;
+use Lingua::TagSet::Multext;
+use Lingua::TagSet::TreeTagger::French;
use Module::Load qw/ load /;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct multext_struct lfs_morph_tags /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
use TryCatch;
=head1 NAME
# Test the lemmatization. How many readings now have morphological info?
# Do the lexemes match the reading?
my $ambig = 0;
+ my $flemmed = 0;
foreach my $r ( $tf->collation->readings ) {
next if $r->is_meta;
ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
$textstr =~ s/\s+//g;
is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
foreach my $l ( @lex ) {
+ # Check to see if Flemm actually ran
+ foreach my $wf ( $l->matching_forms ) {
+ $flemmed++ if $wf->morphology->get_feature('num');
+ }
next if $l->is_disambiguated;
- # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
- # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
$ambig++;
}
}
is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+ ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
# Try setting the normal form of a reading and re-analyzing
my $mr = $tf->collation->reading('r99.2');
is( $mr->language, 'French', "Reading has correct language setting" );
$mr->normal_form( "m'inspire" );
$mr->lemmatize;
- is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+ my @l = $mr->lexemes;
+ is( @l, 2, "Got two lexemes for new m'inspire reading" );
+ is( $l[0]->form->to_string,
+ '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
+ "New reading has correct first lexeme" );
}
=end testing
my( $pos, $morph ) = split( /:/, $tag );
my $morphobj;
if( $morph ) {
- $morphobj = multext_struct( $morph );
+ $morphobj = Lingua::TagSet::Multext->tag2structure( $morph );
} else {
# Use the TreeTagger info if there is no Flemm morphology.
- $morphobj = treetagger_struct( $pos );
+ $morphobj = Lingua::TagSet::TreeTagger::French->tag2structure( $pos );
}
if( $morphobj ) {
my $wf = Text::Tradition::Collation::Reading::WordForm->new(
use strict;
use warnings;
use Module::Load;
-use Text::Tradition::Language::Base qw/ lemmatize_treetagger treetagger_struct
- lfs_morph_tags /;
+use Text::Tradition::Language::Base qw/ lemmatize_treetagger lfs_morph_tags /;
use TryCatch;
=head1 NAME
=head1 DESCRIPTION
Implements morphology lookup for French words in context. This module
-depends on the Morph::Perseus module for access to PhiloLogic database data.
-It also depends on the TreeTagger software
-(L<http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/>), which is
-(for now) expected to be installed in $MORPHDIR/TreeTagger.
+depends on the Lingua::Morph::Perseus module for access to PhiloLogic database data.
=head1 SUBROUTINES
use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );
-eval "use Morph::Perseus";
+eval "use Lingua::Morph::Perseus";
my $err = $@;
SKIP: {
- skip "Package Morph::Perseus not found" if $err;
+ skip "Package Lingua::Morph::Perseus not found" if $err;
my $trad = Text::Tradition->new(
'language' => 'Latin',
$ambig++;
}
}
- is( $ambig, 7, "Found 7 ambiguous forms as expected" );
+ is( $ambig, 4, "Found 4 ambiguous forms as expected" );
}
=end testing
=cut
sub morphology_tags {
- try {
- load 'Morph::Perseus::Structure';
- } catch {
- warn "Not using Perseus Latin tags";
- }
return lfs_morph_tags();
}
sub _morph_connect {
unless( $morph ) {
try {
- load 'Morph::Perseus';
- load 'Morph::Perseus::Structure';
- $morph = Morph::Perseus->connect( 'Latin' );
+ load 'Lingua::Morph::Perseus';
+ $morph = Lingua::Morph::Perseus->connect( 'Latin' );
} catch {
- warn "Cannot do Latin word lemmatization without Morph::Perseus: @_";
+ warn "Cannot do Latin word lemmatization without Lingua::Morph::Perseus: @_";
return;
}
}
# T carne -> carnis
# T melius -> bonus
-
my %excep = (
'absens' => 'absum',
'aperte' => 'apertus',
sub _perseus_lookup_tt {
my( $orig, $pos, $lemma ) = split( /\t/, $_[0] );
_morph_connect();
- return unlesss $morph;
- my $result = $morph->lookup( $orig );
+ return unless $morph;
# Discard results that don't match the lemma, unless lemma is unknown
- my @orig = @{$result->{'objects'}};
- my @ret;
+ my $lookupopts = {};
unless( $lemma eq '<unknown>' || $lemma =~ /^\W+$/ ) {
# TODO Perseus lemma might have a number on the end, yuck.
# multiple lemmata separated with |
$lemma =~ s/[^\w|]//g;
$lemma = $excep{$lemma} if exists $excep{$lemma};
$lemma =~ s/j/i/g;
- my %lems;
- my @forms =
- map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
- # Now match the lemmata from Treetagger to the lemmata and alt_ls
- # from Perseus.
- @ret = grep {
- my $x = $_->lemma;
- $x =~ s/\d+$//;
- my $y = $_->alt_lex || '';
- $y =~ s/\d+$//;
- $lems{$x} || $lems{$y};
- } @orig;
- warn "TreeTagger lemma $lemma matched no results from Perseus for $orig"
- if @orig && !@ret;
+ if( $lemma ) { # if we have anything left...
+ my %lems;
+ map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma );
+ $lookupopts->{'lemma'} = [ keys %lems ];
+ }
}
- @ret = @orig unless @ret;
+ $lookupopts->{'ttpos'} = $pos if $pos;
+ my $result = $morph->lexicon_lookup( $orig, $lookupopts );
+ # unless( !keys( %$lookupopts ) || $result->{'filtered'} ) {
+ # warn "Filter on $pos / $lemma returned no results; using all results";
+ # }
+ my @ret = @{$result->{'objects'}};
my %unique_wordforms;
foreach my $obj ( @ret ) {
my $wf = _wordform_from_row( $obj );
$unique_wordforms{$wf->to_string} = $wf;
}
- ## TODO Use TreeTagger info - requires serious hacking of Lingua::TagSet
- # Discard results that don't match the given TreeTagger POS, unless
- # that leaves zero results
-# my $ttstruct = treetagger_struct( $pos );
-# my @ttmatch = grep { $ttstruct->is_compatible( $_->morphology ) } @wordforms;
-# unless( @ttmatch ) {
-# warn "TreeTagger POS $pos matched no results from Perseus for $orig";
-# @ttmatch = @wordforms;
-# }
-# return @ttmatch;
return values( %unique_wordforms );
}
sub _wordform_from_row {
my( $rowobj ) = @_;
- my $mpstruct;
- try {
- # M::P::St will be loaded already if we got here
- $mpstruct = Morph::Perseus::Structure->from_tag( $rowobj->code );
- } catch {
- warn "Could not create morphology structure from "
- . $rowobj->code . ": $!";
- }
my $lemma = $rowobj->lemma;
$lemma =~ s/^(\D+)\d*$/$1/;
my $wf = Text::Tradition::Collation::Reading::WordForm->new(
'language' => 'Latin',
'lemma' => $lemma,
- 'morphology' => $mpstruct,
+ 'morphology' => $rowobj->morphology,
);
return $wf;
}
# Test the lemmatization. How many readings now have morphological info?
# Do the lexemes match the reading?
my $ambig = 0;
+ my $flemmed = 0;
foreach my $r ( $tf->collation->readings ) {
next if $r->is_meta;
ok( $r->has_lexemes, "Reading $r has one or more lexemes" );
$textstr =~ s/\s+//g;
is( $textstr, $lexstr, "Lexemes for reading $r match the reading" );
foreach my $l ( @lex ) {
+ # Check to see if Flemm actually ran
+ foreach my $wf ( $l->matching_forms ) {
+ $flemmed++ if $wf->morphology->get_feature('num');
+ }
next if $l->is_disambiguated;
- # printf( "Ambiguous lexeme %s for reading %s:\n\t%s\n", $l->string, $r->id,
- # join( "\n\t", map { $_->lemma . ': ' . $_->morphology->to_string } $l->matching_forms ) );
$ambig++;
}
}
is( $ambig, 102, "Found 102 ambiguous forms as expected" );
+ ok( $flemmed > 500, "Found enough Flemm info in wordforms" );
# Try setting the normal form of a reading and re-analyzing
my $mr = $tf->collation->reading('r99.2');
is( $mr->language, 'French', "Reading has correct language setting" );
$mr->normal_form( "m'inspire" );
$mr->lemmatize;
- is( $mr->lexemes, 2, "Got two lexemes for new m'inspire reading" );
+ my @l = $mr->lexemes;
+ is( @l, 2, "Got two lexemes for new m'inspire reading" );
+ is( $l[0]->form->to_string,
+ '"French // se|le|lui // cat@pron type@pers pers@1 num@sing case@acc|dat"',
+ "New reading has correct first lexeme" );
}
}
use Text::Tradition;
use_ok( 'Text::Tradition::Language::Latin' );
-eval "use Morph::Perseus";
+eval "use Lingua::Morph::Perseus";
my $err = $@;
SKIP: {
- skip "Package Morph::Perseus not found" if $err;
+ skip "Package Lingua::Morph::Perseus not found" if $err;
my $trad = Text::Tradition->new(
'language' => 'Latin',
$ambig++;
}
}
- is( $ambig, 7, "Found 7 ambiguous forms as expected" );
+ is( $ambig, 4, "Found 4 ambiguous forms as expected" );
}
}