X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FLanguage%2FBase.pm;h=f0fe30410c157931f2aa9ffee12363d902fb7c73;hb=367e901bd7644e4f197cb157380cfa7dff12ae41;hp=05f81c0d8a94ad00b91009da83dcb98bc4c21aea;hpb=5271a01157263fd46b8d504e0ddb0b349a3fb9c4;p=scpubgit%2Fstemmatology.git diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm index 05f81c0..f0fe304 100644 --- a/lib/Text/Tradition/Language/Base.pm +++ b/lib/Text/Tradition/Language/Base.pm @@ -6,15 +6,12 @@ use Encode qw/ encode_utf8 decode_utf8 /; use Exporter 'import'; use vars qw/ @EXPORT_OK /; use IPC::Run qw/ run /; -use Lingua::TagSet::Multext; -use Lingua::TagSet::TreeTagger; use Module::Load; use Text::Tradition::Collation::Reading::Lexeme; use Text::Tradition::Collation::Reading::WordForm; use TryCatch; -@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct - multext_struct /; +@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /; =head1 NAME @@ -116,8 +113,7 @@ sub _lemmatize_treetagger_sequence { warn "Ran out of readings in sequence at $lexeme"; last; } - my $curr_rdg_text = $curr_rdg->has_normal_form - ? $curr_rdg->normal_form : $curr_rdg->text; + my $curr_rdg_text = $curr_rdg->normal_form; if( $unused_rdg_part && $unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) { # Nth part of curr_rdg @@ -137,7 +133,7 @@ sub _lemmatize_treetagger_sequence { my @lookahead; my $matched; while( my $nr = shift @path ) { - my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text; + my $nrtext = $nr->normal_form; if( $nrtext =~ /^\Q$lexeme\E/ ) { $curr_rdg = $lookahead[-1] if @lookahead; $matched = 1; @@ -212,8 +208,7 @@ sub _text_from_path { unless ( $r->join_prior || !$last || $last->join_next ) { $pathtext .= ' '; } - $pathtext .= ( $normalize && $r->has_normal_form ) - ? $r->normal_form : $r->text; + $pathtext .= $normalize ? $r->normal_form : $r->text; $last = $r; } return $pathtext; @@ -232,7 +227,7 @@ sub _treetag_string { } # OK, we can run it then. # First upgrade to UTF8 for necessary languages. - my @utf8_supported = qw/ French /; + my @utf8_supported = qw/ French Latin Greek /; my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] ); if( grep { $_ eq $lang } @utf8_supported ) { $ttopts{'use_utf8'} = 1; @@ -245,17 +240,47 @@ sub _treetag_string { return $tagresult->as_text(); } -## HACK function to correct for TagSet::TreeTagger brokenness -sub treetagger_struct { - my $pos = shift; - $pos =~ s/PREP/PRP/; - return Lingua::TagSet::TreeTagger->tag2structure( $pos ); +=head2 lfs_morph_tags + +Return a data structure describing the available parts of speech and their attributes +from the Lingua::Features::Structure class currently defined. + +=cut + +sub lfs_morph_tags { + load('Lingua::Features::StructureType'); + my $tagset = { 'structures' => [], 'features' => {} }; + foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) { + my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] }; + foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) { + my $ftype = $lfs->feature_type( $ftid ); + if( !$ftype && $lfs->base ) { + $ftype = $lfs->base->feature_type( $ftid ); + } + if( $ftype ) { + push( @{$tsstruct->{'use_features'}}, $ftid ); + if( $ftid eq 'type' ) { + # Type values change according to category + $ftid .= " (" . $lfs->id . ")"; + } + my $tfstruct = { 'id' => $ftid, 'values' => [] }; + foreach my $fval( $ftype->values ) { + push( @{$tfstruct->{'values'}}, + { 'short' => $fval, 'long' => $ftype->value_name( $fval ) } ); + } + $tagset->{'features'}->{$ftid} = $tfstruct; + } + } + push( @{$tagset->{'structures'}}, $tsstruct ); + } + return $tagset; } -sub multext_struct { - my $pos = shift; - # No known hacks needed - return Lingua::TagSet::Multext->tag2structure( $pos ); +sub _by_structid { + my( $a, $b ) = @_; + return -1 if $a eq 'cat'; + return 1 if $b eq 'cat'; + return $a cmp $b; } 1;