use Exporter 'import';
use vars qw/ @EXPORT_OK /;
use IPC::Run qw/ run /;
-use Lingua::TagSet::Multext;
-use Lingua::TagSet::TreeTagger;
use Module::Load;
use Text::Tradition::Collation::Reading::Lexeme;
use Text::Tradition::Collation::Reading::WordForm;
use TryCatch;
-@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct
- multext_struct /;
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
=head1 NAME
warn "Ran out of readings in sequence at $lexeme";
last;
}
- my $curr_rdg_text = $curr_rdg->has_normal_form
- ? $curr_rdg->normal_form : $curr_rdg->text;
+ my $curr_rdg_text = $curr_rdg->normal_form;
if( $unused_rdg_part &&
$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
# Nth part of curr_rdg
my @lookahead;
my $matched;
while( my $nr = shift @path ) {
- my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+ my $nrtext = $nr->normal_form;
if( $nrtext =~ /^\Q$lexeme\E/ ) {
$curr_rdg = $lookahead[-1] if @lookahead;
$matched = 1;
unless ( $r->join_prior || !$last || $last->join_next ) {
$pathtext .= ' ';
}
- $pathtext .= ( $normalize && $r->has_normal_form )
- ? $r->normal_form : $r->text;
+ $pathtext .= $normalize ? $r->normal_form : $r->text;
$last = $r;
}
return $pathtext;
}
# OK, we can run it then.
# First upgrade to UTF8 for necessary languages.
- my @utf8_supported = qw/ French /;
+ my @utf8_supported = qw/ French Latin Greek /;
my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
if( grep { $_ eq $lang } @utf8_supported ) {
$ttopts{'use_utf8'} = 1;
return $tagresult->as_text();
}
-## HACK function to correct for TagSet::TreeTagger brokenness
-sub treetagger_struct {
- my $pos = shift;
- $pos =~ s/PREP/PRP/;
- return Lingua::TagSet::TreeTagger->tag2structure( $pos );
+=head2 lfs_morph_tags
+
+Return a data structure describing the available parts of speech and their attributes
+from the Lingua::Features::Structure class currently defined.
+
+=cut
+
+sub lfs_morph_tags {
+ load('Lingua::Features::StructureType');
+ my $tagset = { 'structures' => [], 'features' => {} };
+ foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
+ my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
+ foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
+ my $ftype = $lfs->feature_type( $ftid );
+ if( !$ftype && $lfs->base ) {
+ $ftype = $lfs->base->feature_type( $ftid );
+ }
+ if( $ftype ) {
+ push( @{$tsstruct->{'use_features'}}, $ftid );
+ if( $ftid eq 'type' ) {
+ # Type values change according to category
+ $ftid .= " (" . $lfs->id . ")";
+ }
+ my $tfstruct = { 'id' => $ftid, 'values' => [] };
+ foreach my $fval( $ftype->values ) {
+ push( @{$tfstruct->{'values'}},
+ { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
+ }
+ $tagset->{'features'}->{$ftid} = $tfstruct;
+ }
+ }
+ push( @{$tagset->{'structures'}}, $tsstruct );
+ }
+ return $tagset;
}
-sub multext_struct {
- my $pos = shift;
- # No known hacks needed
- return Lingua::TagSet::Multext->tag2structure( $pos );
+sub _by_structid {
+ my( $a, $b ) = @_;
+ return -1 if $a eq 'cat';
+ return 1 if $b eq 'cat';
+ return $a cmp $b;
}
1;