Test miscounting.
[p5sagit/p5-mst-13.2.git] / lib / Lingua / KO / Hangul / Util.pm
CommitLineData
45394607 1package Lingua::KO::Hangul::Util;
2
3use 5.006;
4use strict;
5use warnings;
6
7require Exporter;
8
9our @ISA = qw(Exporter);
10our %EXPORT_TAGS = ();
11our @EXPORT_OK = ();
12our @EXPORT = qw(
13 decomposeHangul
14 composeHangul
15 getHangulName
16 parseHangulName
17);
18our $VERSION = '0.02';
19
20our @JamoL = ( # Initial (HANGUL CHOSEONG)
21 "G", "GG", "N", "D", "DD", "R", "M", "B", "BB",
22 "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H",
23 );
24
25our @JamoV = ( # Medial (HANGUL JUNGSEONG)
26 "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
27 "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI",
28 "YU", "EU", "YI", "I",
29 );
30
31our @JamoT = ( # Final (HANGUL JONGSEONG)
32 "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM",
33 "LB", "LS", "LT", "LP", "LH", "M", "B", "BS",
34 "S", "SS", "NG", "J", "C", "K", "T", "P", "H",
35 );
36
37our $BlockName = "HANGUL SYLLABLE ";
38
39use constant SBase => 0xAC00;
40use constant LBase => 0x1100;
41use constant VBase => 0x1161;
42use constant TBase => 0x11A7;
43use constant LCount => 19; # scalar @JamoL
44use constant VCount => 21; # scalar @JamoV
45use constant TCount => 28; # scalar @JamoT
46use constant NCount => 588; # VCount * TCount
47use constant SCount => 11172; # LCount * NCount
48use constant SFinal => 0xD7A3; # SBase -1 + SCount
49
50our(%CodeL, %CodeV, %CodeT);
51@CodeL{@JamoL} = 0 .. LCount-1;
52@CodeV{@JamoV} = 0 .. VCount-1;
53@CodeT{@JamoT} = 0 .. TCount-1;
54
55sub getHangulName {
56 my $code = shift;
57 return undef unless SBase <= $code && $code <= SFinal;
58 my $SIndex = $code - SBase;
59 my $LIndex = int( $SIndex / NCount);
60 my $VIndex = int(($SIndex % NCount) / TCount);
61 my $TIndex = $SIndex % TCount;
62 "$BlockName$JamoL[$LIndex]$JamoV[$VIndex]$JamoT[$TIndex]";
63}
64
65sub parseHangulName {
66 my $arg = shift;
67 return undef unless $arg =~ s/$BlockName//o;
68 return undef unless $arg =~ /^([^AEIOUWY]*)([AEIOUWY]+)([^AEIOUWY]*)$/;
69 return undef unless exists $CodeL{$1}
70 && exists $CodeV{$2}
71 && exists $CodeT{$3};
72 SBase + $CodeL{$1} * NCount + $CodeV{$2} * TCount + $CodeT{$3};
73}
74
75sub decomposeHangul {
76 my $code = shift;
77 return unless SBase <= $code && $code <= SFinal;
78 my $SIndex = $code - SBase;
79 my $LIndex = int( $SIndex / NCount);
80 my $VIndex = int(($SIndex % NCount) / TCount);
81 my $TIndex = $SIndex % TCount;
82 my @ret = (
83 LBase + $LIndex,
84 VBase + $VIndex,
85 $TIndex ? (TBase + $TIndex) : (),
86 );
87 wantarray ? @ret : pack('U*', @ret);
88}
89
90#
91# To Do:
92# s/(\p{JamoL}\p{JamoV})/toHangLV($1)/ge;
93# s/(\p{HangLV}\p{JamoT})/toHangLVT($1)/ge;
94#
95sub composeHangul {
96 my $str = shift;
97 return $str unless length $str;
98 my(@ret);
99
100 foreach my $ch (unpack('U*', $str)) # Makes list! The string be short!
101 {
102 push(@ret, $ch) and next unless @ret;
103
104 # 1. check to see if $ret[-1] is L and $ch is V.
105 my $LIndex = $ret[-1] - LBase;
106 if(0 <= $LIndex && $LIndex < LCount)
107 {
108 my $VIndex = $ch - VBase;
109 if(0 <= $VIndex && $VIndex < VCount)
110 {
111 $ret[-1] = SBase + ($LIndex * VCount + $VIndex) * TCount;
112 next; # discard $ch
113 }
114 }
115
116 # 2. check to see if $ret[-1] is LV and $ch is T.
117 my $SIndex = $ret[-1] - SBase;
118 if(0 <= $SIndex && $SIndex < SCount && $SIndex % TCount == 0)
119 {
120 my $TIndex = $ch - TBase;
121 if(0 <= $TIndex && $TIndex < TCount)
122 {
123 $ret[-1] += $TIndex;
124 next; # discard $ch
125 }
126 }
127
128 # 3. just append $ch
129 push(@ret, $ch);
130 }
131 wantarray ? @ret : pack('U*', @ret);
132}
133
1341;
135__END__
136
137=head1 NAME
138
139Lingua::KO::Hangul::Util - utility functions for Hangul Syllables
140
141=head1 SYNOPSIS
142
143 use Lingua::KO::Hangul::Util;
144
145 decomposeHangul(0xAC00);
146 # (0x1100,0x1161) or "\x{1100}\x{1161}"
147
148 composeHangul("\x{1100}\x{1161}");
149 # "\x{AC00}"
150
151 getHangulName(0xAC00);
152 # "HANGUL SYLLABLE GA"
153
154 parseHangulName("HANGUL SYLLABLE GA");
155 # 0xAC00
156
157=head1 DESCRIPTION
158
159A Hangul syllable consists of Hangul Jamo.
160
161Hangul Jamo are classified into three classes:
162
163 CHOSEONG (the initial sound) as a leading consonant (L),
164 JUNGSEONG (the medial sound) as a vowel (V),
165 JONGSEONG (the final sound) as a trailing consonant (T).
166
167Any Hangul syllable is a composition of
168
169 i) CHOSEONG + JUNGSEONG (L + V)
170
171 or
172
173 ii) CHOSEONG + JUNGSEONG + JONGSEONG (L + V + T).
174
175Names of Hangul Syllables have a format of C<"HANGUL SYLLABLE %s">.
176
177=head2 Composition and Decomposition
178
179=over 4
180
181=item C<$string_decomposed = decomposeHangul($codepoint)>
182
183=item C<@codepoints = decomposeHangul($codepoint)>
184
185Accepts unicode codepoint integer.
186
187If the specified codepoint is of a Hangul syllable,
188returns a list of codepoints (in a list context)
189or a UTF-8 string (in a scalar context)
190of its decomposition.
191
192 decomposeHangul(0xAC00) # U+AC00 is HANGUL SYLLABLE GA.
193 returns "\x{1100}\x{1161}" or (0x1100, 0x1161);
194
195 decomposeHangul(0xAE00) # U+AE00 is HANGUL SYLLABLE GEUL.
196 returns "\x{1100}\x{1173}\x{11AF}" or (0x1100, 0x1173, 0x11AF);
197
198Otherwise, returns false (empty string or empty list).
199
200 decomposeHangul(0x0041) # outside Hangul Syllables
201 returns empty string or empty list.
202
203=item C<$string_composed = composeHangul($src_string)>
204
205=item C<@codepoints_composed = composeHangul($src_string)>
206
207Any sequence of an initial Jamo C<L> and a medial Jamo C<V>
208is composed into a syllable C<LV>;
209then any sequence of a syllable C<LV> and a final Jamo C<T>
210is composed into a syllable C<LVT>.
211
212Any characters other than Hangul Jamo and Hangul Syllables
213are unaffected.
214
215 composeHangul("Hangul \x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}.")
216 returns "Hangul \x{AC00}\x{AE00}." or
217 (0x48,0x61,0x6E,0x67,0x75,0x6C,0x20,0xAC00,0xAE00,0x2E);
218
219=back
220
221=head2 Hangul Syllable Name
222
223=over 4
224
225=item C<$name = getHangulName($codepoint)>
226
227If the specified codepoint is of a Hangul syllable,
228returns its name; otherwise returns undef.
229
230 getHangulName(0xAC00) returns "HANGUL SYLLABLE GA";
231 getHangulName(0x0041) returns undef.
232
233=item C<$codepoint = parseHangulName($name)>
234
235If the specified name is of a Hangul syllable,
236returns its codepoint; otherwise returns undef.
237
238 parseHangulName("HANGUL SYLLABLE GEUL") returns 0xAE00;
239
240 parseHangulName("LATIN SMALL LETTER A") returns undef;
241
242 parseHangulName("HANGUL SYLLABLE PERL") returns undef;
243 # Regrettably, HANGUL SYLLABLE PERL does not exist :-)
244
245=back
246
247=head2 EXPORT
248
249By default,
250
251 decomposeHangul
252 composeHangul
253 getHangulName
254 parseHangulName
255
256=head1 AUTHOR
257
258SADAHIRO Tomoyuki
259
260 bqw10602@nifty.com
261 http://homepage1.nifty.com/nomenclator/perl/
262
263 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
264
265 This program is free software; you can redistribute it and/or
266 modify it under the same terms as Perl itself.
267
268=head1 SEE ALSO
269
270=over 4
271
272=item http://www.unicode.org/unicode/reports/tr15
273
274Annex 10: Hangul, in Unicode Normalization Forms (UAX #15).
275
276=back
277
278=cut