X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=blobdiff_plain;f=lib%2FText%2FTradition%2FLanguage%2FBase.pm;h=f0fe30410c157931f2aa9ffee12363d902fb7c73;hb=367e901bd7644e4f197cb157380cfa7dff12ae41;hp=05f81c0d8a94ad00b91009da83dcb98bc4c21aea;hpb=5271a01157263fd46b8d504e0ddb0b349a3fb9c4;p=scpubgit%2Fstemmatology.git

diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm
index 05f81c0..f0fe304 100644
--- a/lib/Text/Tradition/Language/Base.pm
+++ b/lib/Text/Tradition/Language/Base.pm
@@ -6,15 +6,12 @@ use Encode qw/ encode_utf8 decode_utf8 /;
 use Exporter 'import';
 use vars qw/ @EXPORT_OK /;
 use IPC::Run qw/ run /;
-use Lingua::TagSet::Multext;
-use Lingua::TagSet::TreeTagger;
 use Module::Load;
 use Text::Tradition::Collation::Reading::Lexeme;
 use Text::Tradition::Collation::Reading::WordForm;
 use TryCatch;
 
-@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger treetagger_struct
-	multext_struct /;
+@EXPORT_OK = qw/ lemmatize_treetagger reading_lookup_treetagger lfs_morph_tags /;
 
 =head1 NAME
 
@@ -116,8 +113,7 @@ sub _lemmatize_treetagger_sequence {
 				warn "Ran out of readings in sequence at $lexeme";
 				last;
 			}
-			my $curr_rdg_text = $curr_rdg->has_normal_form 
-				? $curr_rdg->normal_form : $curr_rdg->text;
+			my $curr_rdg_text = $curr_rdg->normal_form;
 			if( $unused_rdg_part &&
 				$unused_rdg_part =~ /^\Q$lexeme\E(\s*)(.*)$/ ) {
 				# Nth part of curr_rdg
@@ -137,7 +133,7 @@ sub _lemmatize_treetagger_sequence {
 				my @lookahead;
 				my $matched;
 				while( my $nr = shift @path ) {
-					my $nrtext = $nr->has_normal_form ? $nr->normal_form : $nr->text;
+					my $nrtext = $nr->normal_form;
 					if( $nrtext =~ /^\Q$lexeme\E/ ) {
 						$curr_rdg = $lookahead[-1] if @lookahead;
 						$matched = 1;
@@ -212,8 +208,7 @@ sub _text_from_path {
 		unless ( $r->join_prior || !$last || $last->join_next ) {
 			$pathtext .= ' ';
 		} 
-		$pathtext .= ( $normalize && $r->has_normal_form ) 
-			? $r->normal_form : $r->text;
+		$pathtext .= $normalize ? $r->normal_form : $r->text;
 		$last = $r;
 	}
 	return $pathtext;
@@ -232,7 +227,7 @@ sub _treetag_string {
 	}
 	# OK, we can run it then.
 	# First upgrade to UTF8 for necessary languages.
-	my @utf8_supported = qw/ French /;
+	my @utf8_supported = qw/ French Latin Greek /;
 	my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] );
 	if( grep { $_ eq $lang } @utf8_supported ) {
 		$ttopts{'use_utf8'} = 1;
@@ -245,17 +240,47 @@ sub _treetag_string {
 	return $tagresult->as_text();
 }
 
-## HACK function to correct for TagSet::TreeTagger brokenness
-sub treetagger_struct {
-	my $pos = shift;
-	$pos =~ s/PREP/PRP/;
-	return Lingua::TagSet::TreeTagger->tag2structure( $pos );
+=head2 lfs_morph_tags
+
+Return a data structure describing the available parts of speech and their attributes
+from the Lingua::Features::Structure class currently defined.
+
+=cut
+
+sub lfs_morph_tags {
+	load('Lingua::Features::StructureType');
+	my $tagset = { 'structures' => [], 'features' => {} };
+	foreach my $lfs ( sort { _by_structid( $a->id, $b->id ) } Lingua::Features::StructureType->types() ) {
+		my $tsstruct = { 'id' => $lfs->id, 'desc' => $lfs->desc, 'use_features' => [] };
+		foreach my $ftid ( Lingua::Features::StructureType->type($lfs->id)->features ) {
+			my $ftype = $lfs->feature_type( $ftid );
+			if( !$ftype && $lfs->base ) {
+				$ftype = $lfs->base->feature_type( $ftid );
+			}
+			if( $ftype ) {
+				push( @{$tsstruct->{'use_features'}}, $ftid );
+				if( $ftid eq 'type' ) {
+					# Type values change according to category
+					$ftid .= " (" . $lfs->id . ")";
+				}
+				my $tfstruct = { 'id' => $ftid, 'values' => [] };
+				foreach my $fval( $ftype->values ) {
+					push( @{$tfstruct->{'values'}}, 
+						{ 'short' => $fval, 'long' => $ftype->value_name( $fval ) } );
+				}
+				$tagset->{'features'}->{$ftid} = $tfstruct;
+			}
+		}
+		push( @{$tagset->{'structures'}}, $tsstruct );
+	}
+	return $tagset;
 }
 
-sub multext_struct {
-	my $pos = shift;
-	# No known hacks needed
-	return Lingua::TagSet::Multext->tag2structure( $pos );
+sub _by_structid {
+	my( $a, $b ) = @_;
+	return -1 if $a eq 'cat';
+	return 1 if $b eq 'cat';
+	return $a cmp $b;
 }
 
 1;