Commit | Line | Data |
45394607 |
1 | package Lingua::KO::Hangul::Util; |
2 | |
3 | use 5.006; |
4 | use strict; |
5 | use warnings; |
6 | |
7 | require Exporter; |
8 | |
9 | our @ISA = qw(Exporter); |
10 | our %EXPORT_TAGS = (); |
11 | our @EXPORT_OK = (); |
12 | our @EXPORT = qw( |
13 | decomposeHangul |
14 | composeHangul |
15 | getHangulName |
16 | parseHangulName |
17 | ); |
18 | our $VERSION = '0.02'; |
19 | |
20 | our @JamoL = ( # Initial (HANGUL CHOSEONG) |
21 | "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", |
22 | "S", "SS", "", "J", "JJ", "C", "K", "T", "P", "H", |
23 | ); |
24 | |
25 | our @JamoV = ( # Medial (HANGUL JUNGSEONG) |
26 | "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O", |
27 | "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", |
28 | "YU", "EU", "YI", "I", |
29 | ); |
30 | |
31 | our @JamoT = ( # Final (HANGUL JONGSEONG) |
32 | "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG", "LM", |
33 | "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", |
34 | "S", "SS", "NG", "J", "C", "K", "T", "P", "H", |
35 | ); |
36 | |
37 | our $BlockName = "HANGUL SYLLABLE "; |
38 | |
39 | use constant SBase => 0xAC00; |
40 | use constant LBase => 0x1100; |
41 | use constant VBase => 0x1161; |
42 | use constant TBase => 0x11A7; |
43 | use constant LCount => 19; # scalar @JamoL |
44 | use constant VCount => 21; # scalar @JamoV |
45 | use constant TCount => 28; # scalar @JamoT |
46 | use constant NCount => 588; # VCount * TCount |
47 | use constant SCount => 11172; # LCount * NCount |
48 | use constant SFinal => 0xD7A3; # SBase -1 + SCount |
49 | |
50 | our(%CodeL, %CodeV, %CodeT); |
51 | @CodeL{@JamoL} = 0 .. LCount-1; |
52 | @CodeV{@JamoV} = 0 .. VCount-1; |
53 | @CodeT{@JamoT} = 0 .. TCount-1; |
54 | |
55 | sub getHangulName { |
56 | my $code = shift; |
57 | return undef unless SBase <= $code && $code <= SFinal; |
58 | my $SIndex = $code - SBase; |
59 | my $LIndex = int( $SIndex / NCount); |
60 | my $VIndex = int(($SIndex % NCount) / TCount); |
61 | my $TIndex = $SIndex % TCount; |
62 | "$BlockName$JamoL[$LIndex]$JamoV[$VIndex]$JamoT[$TIndex]"; |
63 | } |
64 | |
65 | sub parseHangulName { |
66 | my $arg = shift; |
67 | return undef unless $arg =~ s/$BlockName//o; |
68 | return undef unless $arg =~ /^([^AEIOUWY]*)([AEIOUWY]+)([^AEIOUWY]*)$/; |
69 | return undef unless exists $CodeL{$1} |
70 | && exists $CodeV{$2} |
71 | && exists $CodeT{$3}; |
72 | SBase + $CodeL{$1} * NCount + $CodeV{$2} * TCount + $CodeT{$3}; |
73 | } |
74 | |
75 | sub decomposeHangul { |
76 | my $code = shift; |
77 | return unless SBase <= $code && $code <= SFinal; |
78 | my $SIndex = $code - SBase; |
79 | my $LIndex = int( $SIndex / NCount); |
80 | my $VIndex = int(($SIndex % NCount) / TCount); |
81 | my $TIndex = $SIndex % TCount; |
82 | my @ret = ( |
83 | LBase + $LIndex, |
84 | VBase + $VIndex, |
85 | $TIndex ? (TBase + $TIndex) : (), |
86 | ); |
87 | wantarray ? @ret : pack('U*', @ret); |
88 | } |
89 | |
90 | # |
91 | # To Do: |
92 | # s/(\p{JamoL}\p{JamoV})/toHangLV($1)/ge; |
93 | # s/(\p{HangLV}\p{JamoT})/toHangLVT($1)/ge; |
94 | # |
95 | sub composeHangul { |
96 | my $str = shift; |
97 | return $str unless length $str; |
98 | my(@ret); |
99 | |
100 | foreach my $ch (unpack('U*', $str)) # Makes list! The string be short! |
101 | { |
102 | push(@ret, $ch) and next unless @ret; |
103 | |
104 | # 1. check to see if $ret[-1] is L and $ch is V. |
105 | my $LIndex = $ret[-1] - LBase; |
106 | if(0 <= $LIndex && $LIndex < LCount) |
107 | { |
108 | my $VIndex = $ch - VBase; |
109 | if(0 <= $VIndex && $VIndex < VCount) |
110 | { |
111 | $ret[-1] = SBase + ($LIndex * VCount + $VIndex) * TCount; |
112 | next; # discard $ch |
113 | } |
114 | } |
115 | |
116 | # 2. check to see if $ret[-1] is LV and $ch is T. |
117 | my $SIndex = $ret[-1] - SBase; |
118 | if(0 <= $SIndex && $SIndex < SCount && $SIndex % TCount == 0) |
119 | { |
120 | my $TIndex = $ch - TBase; |
121 | if(0 <= $TIndex && $TIndex < TCount) |
122 | { |
123 | $ret[-1] += $TIndex; |
124 | next; # discard $ch |
125 | } |
126 | } |
127 | |
128 | # 3. just append $ch |
129 | push(@ret, $ch); |
130 | } |
131 | wantarray ? @ret : pack('U*', @ret); |
132 | } |
133 | |
134 | 1; |
135 | __END__ |
136 | |
137 | =head1 NAME |
138 | |
139 | Lingua::KO::Hangul::Util - utility functions for Hangul Syllables |
140 | |
141 | =head1 SYNOPSIS |
142 | |
143 | use Lingua::KO::Hangul::Util; |
144 | |
145 | decomposeHangul(0xAC00); |
146 | # (0x1100,0x1161) or "\x{1100}\x{1161}" |
147 | |
148 | composeHangul("\x{1100}\x{1161}"); |
149 | # "\x{AC00}" |
150 | |
151 | getHangulName(0xAC00); |
152 | # "HANGUL SYLLABLE GA" |
153 | |
154 | parseHangulName("HANGUL SYLLABLE GA"); |
155 | # 0xAC00 |
156 | |
157 | =head1 DESCRIPTION |
158 | |
159 | A Hangul syllable consists of Hangul Jamo. |
160 | |
161 | Hangul Jamo are classified into three classes: |
162 | |
163 | CHOSEONG (the initial sound) as a leading consonant (L), |
164 | JUNGSEONG (the medial sound) as a vowel (V), |
165 | JONGSEONG (the final sound) as a trailing consonant (T). |
166 | |
167 | Any Hangul syllable is a composition of |
168 | |
169 | i) CHOSEONG + JUNGSEONG (L + V) |
170 | |
171 | or |
172 | |
173 | ii) CHOSEONG + JUNGSEONG + JONGSEONG (L + V + T). |
174 | |
175 | Names of Hangul Syllables have a format of C<"HANGUL SYLLABLE %s">. |
176 | |
177 | =head2 Composition and Decomposition |
178 | |
179 | =over 4 |
180 | |
181 | =item C<$string_decomposed = decomposeHangul($codepoint)> |
182 | |
183 | =item C<@codepoints = decomposeHangul($codepoint)> |
184 | |
185 | Accepts unicode codepoint integer. |
186 | |
187 | If the specified codepoint is of a Hangul syllable, |
188 | returns a list of codepoints (in a list context) |
189 | or a UTF-8 string (in a scalar context) |
190 | of its decomposition. |
191 | |
192 | decomposeHangul(0xAC00) # U+AC00 is HANGUL SYLLABLE GA. |
193 | returns "\x{1100}\x{1161}" or (0x1100, 0x1161); |
194 | |
195 | decomposeHangul(0xAE00) # U+AE00 is HANGUL SYLLABLE GEUL. |
196 | returns "\x{1100}\x{1173}\x{11AF}" or (0x1100, 0x1173, 0x11AF); |
197 | |
198 | Otherwise, returns false (empty string or empty list). |
199 | |
200 | decomposeHangul(0x0041) # outside Hangul Syllables |
201 | returns empty string or empty list. |
202 | |
203 | =item C<$string_composed = composeHangul($src_string)> |
204 | |
205 | =item C<@codepoints_composed = composeHangul($src_string)> |
206 | |
207 | Any sequence of an initial Jamo C<L> and a medial Jamo C<V> |
208 | is composed into a syllable C<LV>; |
209 | then any sequence of a syllable C<LV> and a final Jamo C<T> |
210 | is composed into a syllable C<LVT>. |
211 | |
212 | Any characters other than Hangul Jamo and Hangul Syllables |
213 | are unaffected. |
214 | |
215 | composeHangul("Hangul \x{1100}\x{1161}\x{1100}\x{1173}\x{11AF}.") |
216 | returns "Hangul \x{AC00}\x{AE00}." or |
217 | (0x48,0x61,0x6E,0x67,0x75,0x6C,0x20,0xAC00,0xAE00,0x2E); |
218 | |
219 | =back |
220 | |
221 | =head2 Hangul Syllable Name |
222 | |
223 | =over 4 |
224 | |
225 | =item C<$name = getHangulName($codepoint)> |
226 | |
227 | If the specified codepoint is of a Hangul syllable, |
228 | returns its name; otherwise returns undef. |
229 | |
230 | getHangulName(0xAC00) returns "HANGUL SYLLABLE GA"; |
231 | getHangulName(0x0041) returns undef. |
232 | |
233 | =item C<$codepoint = parseHangulName($name)> |
234 | |
235 | If the specified name is of a Hangul syllable, |
236 | returns its codepoint; otherwise returns undef. |
237 | |
238 | parseHangulName("HANGUL SYLLABLE GEUL") returns 0xAE00; |
239 | |
240 | parseHangulName("LATIN SMALL LETTER A") returns undef; |
241 | |
242 | parseHangulName("HANGUL SYLLABLE PERL") returns undef; |
243 | # Regrettably, HANGUL SYLLABLE PERL does not exist :-) |
244 | |
245 | =back |
246 | |
247 | =head2 EXPORT |
248 | |
249 | By default, |
250 | |
251 | decomposeHangul |
252 | composeHangul |
253 | getHangulName |
254 | parseHangulName |
255 | |
256 | =head1 AUTHOR |
257 | |
258 | SADAHIRO Tomoyuki |
259 | |
260 | bqw10602@nifty.com |
261 | http://homepage1.nifty.com/nomenclator/perl/ |
262 | |
263 | Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved. |
264 | |
265 | This program is free software; you can redistribute it and/or |
266 | modify it under the same terms as Perl itself. |
267 | |
268 | =head1 SEE ALSO |
269 | |
270 | =over 4 |
271 | |
272 | =item http://www.unicode.org/unicode/reports/tr15 |
273 | |
274 | Annex 10: Hangul, in Unicode Normalization Forms (UAX #15). |
275 | |
276 | =back |
277 | |
278 | =cut |