From: Tara L Andrews Date: Fri, 15 Jun 2012 13:39:43 +0000 (+0200) Subject: now using Perseus-generated Latin treetagger params X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?p=scpubgit%2Fstemmatology.git;a=commitdiff_plain;h=ad2a2c4722a8c6bc324a8a19bee0990c7992c070 now using Perseus-generated Latin treetagger params --- diff --git a/lib/Text/Tradition/Language/Base.pm b/lib/Text/Tradition/Language/Base.pm index dab12f2..d0d1caf 100644 --- a/lib/Text/Tradition/Language/Base.pm +++ b/lib/Text/Tradition/Language/Base.pm @@ -229,7 +229,7 @@ sub _treetag_string { } # OK, we can run it then. # First upgrade to UTF8 for necessary languages. - my @utf8_supported = qw/ French /; + my @utf8_supported = qw/ French Latin Greek /; my %ttopts = ( 'language' => $lang, 'options' => [ qw/ -token -lemma / ] ); if( grep { $_ eq $lang } @utf8_supported ) { $ttopts{'use_utf8'} = 1; diff --git a/lib/Text/Tradition/Language/Latin.pm b/lib/Text/Tradition/Language/Latin.pm index 1d278b5..5643351 100644 --- a/lib/Text/Tradition/Language/Latin.pm +++ b/lib/Text/Tradition/Language/Latin.pm @@ -108,46 +108,7 @@ sub morphology_tags { } } } - - # TODO special case: - # passive verbs (-or) - # T sapientia -> sapientia - # T primus -> unus - # T occulta -> occultus (with occulo in next field, hmm...) - # T carne -> carnis - # T melius -> bonus - - my %excep = ( - 'absens' => 'absum', - 'aperte' => 'apertus', - 'evolvo' => 'exvolvo', - 'inquiam' => 'inquam', - 'intelligo' => 'intellego', - 'itaque' => 'ita', - 'iuste' => 'iustus', - 'longe' => 'longus', - 'male' => 'malus|malum', - 'multum' => 'multus', - 'nec' => 'neque', - 'nos' => 'ego', - 'occultum' => 'occultus', - 'peregrinans' => 'peregrinor', - 'perfectus' => 'perficio', - 'potius' => 'potis', - 'praesente' => 'praesens', - 'prius' => 'prior', - 'quotidianus' => 'cottidianus', - 'se' => 'sui', - 'septem' => 'septimus', - 'Spiritum' => 'spiritus', - 'viriliter' => 'virilis', # TODO special case -iter? - 'vos' => 'tu', - - 'datum' => 'do|data|datus', - 'forte' => 'fors|fortis', - 'vere' => 'verum|verus', - ); - + sub _perseus_lookup_tt { my( $orig, $pos, $lemma ) = split( /\t/, $_[0] ); _morph_connect(); @@ -155,16 +116,9 @@ sub morphology_tags { # Discard results that don't match the lemma, unless lemma is unknown my $lookupopts = {}; unless( $lemma eq '' || $lemma =~ /^\W+$/ ) { - # TODO Perseus lemma might have a number on the end, yuck. - # multiple lemmata separated with | - $lemma =~ s/[^\w|]//g; - $lemma = $excep{$lemma} if exists $excep{$lemma}; - $lemma =~ s/j/i/g; - if( $lemma ) { # if we have anything left... - my %lems; - map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); - $lookupopts->{'lemma'} = [ keys %lems ]; - } + my %lems; + map { $lems{$_} = 1; $lems{lc($_)} = 1 } split( /\|/, $lemma ); + $lookupopts->{'lemma'} = [ keys %lems ]; } $lookupopts->{'ttpos'} = $pos if $pos;