From: SADAHIRO Tomoyuki Date: Tue, 24 Jul 2001 01:51:32 +0000 (+0900) Subject: UnicodeCD::charinfo X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=a6fa416b21f337e919933a60a08f55591b9017ff;p=p5sagit%2Fp5-mst-13.2.git UnicodeCD::charinfo Message-Id: <20010724015114.CF4D.BQW10602@nifty.com> p4raw-id: //depot/perl@11481 --- diff --git a/lib/UnicodeCD.pm b/lib/UnicodeCD.pm index c1ca6b4..4f4c19d 100644 --- a/lib/UnicodeCD.pm +++ b/lib/UnicodeCD.pm @@ -134,14 +134,129 @@ sub _getcode { return; } +sub han_charname { + my $arg = shift; + my $code = _getcode($arg); + croak __PACKAGE__, "::charinfo: unknown code '$arg'" + unless defined $code; + croak __PACKAGE__, "::han_charname: outside CJK Unified Ideographs '$arg'" + unless 0x3400 <= $code && $code <= 0x4DB5 + || 0x4E00 <= $code && $code <= 0x9FA5 + || 0x20000 <= $code && $code <= 0x2A6D6; + sprintf "CJK UNIFIED IDEOGRAPH-%04X", $code; +} + +my @JamoL = ( # Leading Consonant (HANGUL CHOSEONG) + "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", + "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H", + ); + +my @JamoV = ( # Medium Vowel (HANGUL JUNGSEONG) + "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", + "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", + "YU", "EU", "YI", "I", + ); + +my @JamoT = ( # Trailing Consonant (HANGUL JONGSEONG) + "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", + "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", + "S", "SS", "NG", "J", "C", "K", "T", "P", "H", + ); + +my %HangulConst = ( + SBase => 0xAC00, + LBase => 0x1100, + VBase => 0x1161, + TBase => 0x11A7, + LCount => 19, # scalar @JamoL + VCount => 21, # scalar @JamoV + TCount => 28, # scalar @JamoT + NCount => 588, # VCount * TCount + SCount => 11172, # LCount * NCount + Final => 0xD7A3, # SBase -1 + SCount + ); + +sub hangul_charname { + my $arg = shift; + my $code = _getcode($arg); + croak __PACKAGE__, "::charinfo: unknown code '$arg'" + unless defined $code; + croak __PACKAGE__, "::hangul_charname: outside Hangul Syllables '$arg'" + unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final}; + my $SIndex = $code - $HangulConst{SBase}; + my $LIndex = int( $SIndex / $HangulConst{NCount}); + my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount}); + my $TIndex = $SIndex % $HangulConst{TCount}; + return join('', + "HANGUL SYLLABLE ", + $JamoL[$LIndex], + $JamoV[$VIndex], + $JamoT[$TIndex], + ); +} + +sub hangul_decomp { + my $arg = shift; + my $code = _getcode($arg); + croak __PACKAGE__, "::charinfo: unknown code '$arg'" + unless defined $code; + croak __PACKAGE__, "::hangul_decomp: outside Hangul Syllables '$arg'" + unless $HangulConst{SBase} <= $code && $code <= $HangulConst{Final}; + my $SIndex = $code - $HangulConst{SBase}; + my $LIndex = int( $SIndex / $HangulConst{NCount}); + my $VIndex = int(($SIndex % $HangulConst{NCount}) / $HangulConst{TCount}); + my $TIndex = $SIndex % $HangulConst{TCount}; + + return join(" ", + sprintf("%04X", $HangulConst{LBase} + $LIndex), + sprintf("%04X", $HangulConst{VBase} + $VIndex), + $TIndex ? + sprintf("%04X", $HangulConst{TBase} + $TIndex) : (), + ); +} + +my @CharinfoRanges = ( +# block name +# [ first, last, coderef to name, coderef to decompose ], +# CJK Ideographs Extension A + [ 0x3400, 0x4DB5, \&han_charname, undef ], +# CJK Ideographs + [ 0x4E00, 0x9FA5, \&han_charname, undef ], +# Hangul Syllables + [ 0xAC00, 0xD7A3, \&hangul_charname, \&hangul_decomp ], +# Non-Private Use High Surrogates + [ 0xD800, 0xDB7F, undef, undef ], +# Private Use High Surrogates + [ 0xDB80, 0xDBFF, undef, undef ], +# Low Surrogates + [ 0xDC00, 0xDFFF, undef, undef ], +# The Private Use Area + [ 0xE000, 0xF8FF, undef, undef ], +# CJK Ideographs Extension B + [ 0x20000, 0x2A6D6, \&han_charname, undef ], +# Plane 15 Private Use Area + [ 0xF0000, 0xFFFFD, undef, undef ], +# Plane 16 Private Use Area + [ 0x100000, 0x10FFFD, undef, undef ], +); + sub charinfo { my $arg = shift; my $code = _getcode($arg); croak __PACKAGE__, "::charinfo: unknown code '$arg'" unless defined $code; my $hexk = sprintf("%04X", $code); - - openunicode(\$UNICODEFH, "Unicode.txt"); + my($rcode,$rname,$rdec); + foreach my $range (@CharinfoRanges){ + if($range->[0] <= $code && $code <= $range->[1]){ + $rcode = $hexk; + $rname = $range->[2] ? $range->[2]->($code) : ''; + $rdec = $range->[3] ? $range->[3]->($code) : ''; + $hexk = sprintf("%04X",$range->[0]); # replace by the first + last; + } + } + openunicode(\$UNICODEFH, "Unicode.sort"); # sorted if (defined $UNICODEFH) { use Search::Dict; if (look($UNICODEFH, "$hexk;") >= 0) { @@ -158,6 +273,11 @@ sub charinfo { if ($prop{code} eq $hexk) { $prop{block} = charblock($code); $prop{script} = charscript($code); + if(defined $rname){ + $prop{code} = $rcode; + $prop{name} = $rname; + $prop{decomposition} = $rdec; + } return \%prop; } } diff --git a/lib/UnicodeCD.t b/lib/UnicodeCD.t index 07c572c..6e92284 100644 --- a/lib/UnicodeCD.t +++ b/lib/UnicodeCD.t @@ -3,7 +3,7 @@ use UnicodeCD; use Test; use strict; -BEGIN { plan tests => 111 }; +BEGIN { plan tests => 111 + 17 * 3}; use UnicodeCD 'charinfo'; @@ -93,6 +93,70 @@ ok($charinfo->{title}, ''); ok($charinfo->{block}, 'Hebrew'); ok($charinfo->{script}, 'Hebrew'); +# an open syllable in Hangul + +$charinfo = charinfo(0xAC00); + +ok($charinfo->{code}, 'AC00'); +ok($charinfo->{name}, 'HANGUL SYLLABLE GA'); +ok($charinfo->{category}, 'Lo'); +ok($charinfo->{combining}, '0'); +ok($charinfo->{bidi}, 'L'); +ok($charinfo->{decomposition}, '1100 1161'); +ok($charinfo->{decimal}, ''); +ok($charinfo->{digit}, ''); +ok($charinfo->{numeric}, ''); +ok($charinfo->{mirrored}, 'N'); +ok($charinfo->{unicode10}, ''); +ok($charinfo->{comment}, ''); +ok($charinfo->{upper}, ''); +ok($charinfo->{lower}, ''); +ok($charinfo->{title}, ''); +ok($charinfo->{block}, 'Hangul Syllables'); +ok($charinfo->{script}, 'Hangul'); + +# a close syllable in Hangul + +$charinfo = charinfo(0xAE00); + +ok($charinfo->{code}, 'AE00'); +ok($charinfo->{name}, 'HANGUL SYLLABLE GEUL'); +ok($charinfo->{category}, 'Lo'); +ok($charinfo->{combining}, '0'); +ok($charinfo->{bidi}, 'L'); +ok($charinfo->{decomposition}, '1100 1173 11AF'); +ok($charinfo->{decimal}, ''); +ok($charinfo->{digit}, ''); +ok($charinfo->{numeric}, ''); +ok($charinfo->{mirrored}, 'N'); +ok($charinfo->{unicode10}, ''); +ok($charinfo->{comment}, ''); +ok($charinfo->{upper}, ''); +ok($charinfo->{lower}, ''); +ok($charinfo->{title}, ''); +ok($charinfo->{block}, 'Hangul Syllables'); +ok($charinfo->{script}, 'Hangul'); + +$charinfo = charinfo(0x1D400); + +ok($charinfo->{code}, '1D400'); +ok($charinfo->{name}, 'MATHEMATICAL BOLD CAPITAL A'); +ok($charinfo->{category}, 'Lu'); +ok($charinfo->{combining}, '0'); +ok($charinfo->{bidi}, 'L'); +ok($charinfo->{decomposition}, ' 0041'); +ok($charinfo->{decimal}, ''); +ok($charinfo->{digit}, ''); +ok($charinfo->{numeric}, ''); +ok($charinfo->{mirrored}, 'N'); +ok($charinfo->{unicode10}, ''); +ok($charinfo->{comment}, ''); +ok($charinfo->{upper}, ''); +ok($charinfo->{lower}, ''); +ok($charinfo->{title}, ''); +ok($charinfo->{block}, 'Mathematical Alphanumeric Symbols'); +ok($charinfo->{script}, undef); + use UnicodeCD qw(charblock charscript); # 0x0590 is in the Hebrew block but unused.