Change 12349 didn't really get it right.
[p5sagit/p5-mst-13.2.git] / lib / Unicode / Collate.pm
CommitLineData
45394607 1package Unicode::Collate;
2
3use 5.006;
4use strict;
5use warnings;
6use Carp;
7use Lingua::KO::Hangul::Util;
8require Exporter;
9
d16e9e3d 10our $VERSION = '0.08';
45394607 11our $PACKAGE = __PACKAGE__;
12
13our @ISA = qw(Exporter);
14
15our %EXPORT_TAGS = ();
16our @EXPORT_OK = ();
17our @EXPORT = ();
18
19(our $Path = $INC{'Unicode/Collate.pm'}) =~ s/\.pm$//;
20our $KeyFile = "allkeys.txt";
21
22our %Combin; # combining class from Unicode::Normalize
23
24use constant Min2 => 0x20; # minimum weight at level 2
25use constant Min3 => 0x02; # minimum weight at level 3
26use constant UNDEFINED => 0xFF80; # special value for undefined CE
27
28##
29## constructor
30##
31sub new
32{
33 my $class = shift;
34 my $self = bless { @_ }, $class;
35
36 # alternate
37 $self->{alternate} =
38 ! exists $self->{alternate} ? 'shifted' :
39 ! defined $self->{alternate} ? '' : $self->{alternate};
40
41 # collation level
d16e9e3d 42 $self->{level} ||= ($self->{alternate} =~ /shift/ ? 4 : 3);
45394607 43
44 # normalization form
45 $self->{normalization} = 'D' if ! exists $self->{normalization};
46
47 eval "use Unicode::Normalize;" if defined $self->{normalization};
48
49 $self->{normalize} =
50 ! defined $self->{normalization} ? undef :
51 $self->{normalization} =~ /^(?:NF)?C$/ ? \&NFC :
52 $self->{normalization} =~ /^(?:NF)?D$/ ? \&NFD :
53 $self->{normalization} =~ /^(?:NF)?KC$/ ? \&NFKC :
54 $self->{normalization} =~ /^(?:NF)?KD$/ ? \&NFKD :
55 croak "$PACKAGE unknown normalization form name: $self->{normalization}";
56
57 *Combin = \%Unicode::Normalize::Combin if $self->{normalize} && ! %Combin;
58
59 # backwards
60 $self->{backwards} ||= [];
61 $self->{backwards} = [ $self->{backwards} ] if ! ref $self->{backwards};
62
63 # rearrange
64 $self->{rearrange} ||= []; # maybe not U+0000 (an ASCII)
65 $self->{rearrange} = [ $self->{rearrange} ] if ! ref $self->{rearrange};
66
67 # open the table file
68 my $file = defined $self->{table} ? $self->{table} : $KeyFile;
69 open my $fk, "<$Path/$file" or croak "File does not exist at $Path/$file";
70
71 while(<$fk>){
72 next if /^\s*#/;
73 if(/^\s*\@/){
74 if(/^\@version\s*(\S*)/){
75 $self->{version} ||= $1;
76 }
77 elsif(/^\@alternate\s+(.*)/){
78 $self->{alternate} ||= $1;
79 }
80 elsif(/^\@backwards\s+(.*)/){
81 push @{ $self->{backwards} }, $1;
82 }
83 elsif(/^\@rearrange\s+(.*)/){
84 push @{ $self->{rearrange} }, _getHexArray($1);
85 }
86 next;
87 }
88 $self->parseEntry($_);
89 }
90 close $fk;
91 if($self->{entry}){
92 $self->parseEntry($_) foreach split /\n/, $self->{entry};
93 }
94
95 # keys of $self->{rearrangeHash} are $self->{rearrange}.
96 $self->{rearrangeHash} = {};
97 @{ $self->{rearrangeHash} }{ @{ $self->{rearrange} } } = ();
98
99 return $self;
100}
101
102##
103## get $line, parse it, and write an entry in $self
104##
105sub parseEntry
106{
107 my $self = shift;
108 my $line = shift;
109 my($name, $ele, @key);
110
111 return if $line !~ /^\s*[0-9A-Fa-f]/;
112
113 # get name
114 $name = $1 if $line =~ s/#\s*(.*)//;
115 return if defined $self->{undefName} && $name =~ /$self->{undefName}/;
116
117 # get element
118 my($e, $k) = split /;/, $line;
119 my @e = _getHexArray($e);
120 $ele = pack('U*', @e);
121 return if defined $self->{undefChar} && $ele =~ /$self->{undefChar}/;
122
123 # get sort key
124 if(
125 defined $self->{ignoreName} && $name =~ /$self->{ignoreName}/ ||
126 defined $self->{ignoreChar} && $ele =~ /$self->{ignoreChar}/
127 )
128 {
d16e9e3d 129 $self->{entries}{$ele} = $self->{ignored}{$ele} = 1;
45394607 130 }
131 else
132 {
133 foreach my $arr ($k =~ /\[(\S+)\]/g) {
134 my $var = $arr =~ /\*/;
d16e9e3d 135 push @key, $self->altCE( $var, _getHexArray($arr) );
45394607 136 }
137 $self->{entries}{$ele} = \@key;
138 }
139 $self->{maxlength}{ord $ele} = scalar @e if @e > 1;
140}
141
142
143##
d16e9e3d 144## arrayref CE = altCE(bool variable?, list[num] weights)
45394607 145##
d16e9e3d 146sub altCE
45394607 147{
148 my $self = shift;
149 my $var = shift;
150 my @c = @_;
151
152 $self->{alternate} eq 'blanked' ?
d16e9e3d 153 $var ? [0,0,0] : [ @c[0..2] ] :
154 $self->{alternate} eq 'non-ignorable' ?
155 [ @c[0..2] ] :
45394607 156 $self->{alternate} eq 'shifted' ?
157 $var ? [0,0,0,$c[0] ] : [ @c[0..2], $c[0]+$c[1]+$c[2] ? 0xFFFF : 0 ] :
158 $self->{alternate} eq 'shift-trimmed' ?
159 $var ? [0,0,0,$c[0] ] : [ @c[0..2], 0 ] :
160 \@c;
161}
162
163##
d16e9e3d 164## string hex_sortkey = splitCE(string arg)
45394607 165##
166sub viewSortKey
167{
168 my $self = shift;
169 my $key = $self->getSortKey(@_);
170 my $view = join " ", map sprintf("%04X", $_), unpack 'n*', $key;
171 $view =~ s/ ?0000 ?/|/g;
172 "[$view]";
173}
174
d16e9e3d 175
45394607 176##
d16e9e3d 177## list[strings] elements = splitCE(string arg)
45394607 178##
d16e9e3d 179sub splitCE
45394607 180{
181 my $self = shift;
182 my $code = $self->{preprocess};
183 my $norm = $self->{normalize};
184 my $ent = $self->{entries};
45394607 185 my $max = $self->{maxlength};
45394607 186 my $rear = $self->{rearrangeHash};
187
188 my $str = ref $code ? &$code(shift) : shift;
189 $str = &$norm($str) if ref $norm;
190
191 my(@src, @buf);
192 @src = unpack('U*', $str);
193
194 # rearrangement
195 for(my $i = 0; $i < @src; $i++)
196 {
197 ($src[$i], $src[$i+1]) = ($src[$i+1], $src[$i])
198 if $rear->{ $src[$i] };
199 $i++;
200 }
201
202 for(my $i = 0; $i < @src; $i++)
203 {
204 my $ch;
205 my $u = $src[$i];
206
207 # non-characters
208 next if $u < 0 || 0x10FFFF < $u # out of range
209 || 0xD800 < $u && $u < 0xDFFF; # unpaired surrogates
210 my $four = $u & 0xFFFF;
211 next if $four == 0xFFFE || $four == 0xFFFF;
212
213 if($max->{$u}) # contract
214 {
215 for(my $j = $max->{$u}; $j >= 1; $j--)
216 {
217 next unless $i+$j-1 < @src;
218 $ch = pack 'U*', @src[$i .. $i+$j-1];
219 $i += $j-1, last if $ent->{$ch};
220 }
221 }
222 else { $ch = pack('U', $u) }
223
224 if(%Combin && defined $ch) # with Combining Char
225 {
226 for(my $j = $i+1; $j < @src && $Combin{ $src[$j] }; $j++)
227 {
228 my $comb = pack 'U', $src[$j];
229 next if ! $ent->{ $ch.$comb };
230 $ch .= $comb;
231 splice(@src, $j, 1);
232 last;
233 }
234 }
d16e9e3d 235 push @buf, $ch;
236 }
237 wantarray ? @buf : \@buf;
238}
45394607 239
d16e9e3d 240
241##
242## list[arrayrefs] weight = getWt(string element)
243##
244sub getWt
245{
246 my $self = shift;
247 my $ch = shift;
248 my $ent = $self->{entries};
249 my $ign = $self->{ignored};
250 my $cjk = $self->{overrideCJK};
251 my $hang = $self->{overrideHangul};
252 return if !defined $ch || $ign->{$ch}; # ignored
253 return @{ $ent->{$ch} } if $ent->{$ch};
254 my $u = unpack('U', $ch);
255 return
256 _isHangul($u)
257 ? $hang
258 ? &$hang($u)
259 : map(@{ $ent->{pack('U', $_)} }, decomposeHangul($u))
260 : _isCJK($u)
261 ? $cjk ? &$cjk($u) : map($self->altCE(0,@$_), _CJK($u))
262 : map($self->altCE(0,@$_), _derivCE($u));
263}
264
265##
266## int = index(string, substring)
267##
268sub index
269{
270 my $self = shift;
271 my $lev = $self->{level};
272 my $str = $self->splitCE(shift);
273 my $sub = $self->splitCE(shift);
274
275 return wantarray ? (0,0) : 0 if ! @$sub;
276 return wantarray ? () : -1 if ! @$str;
277
278 my @subWt = grep _ignorableAtLevel($_,$lev),
279 map $self->getWt($_), @$sub;
280
281 my(@strWt,@strPt);
282 my $count = 0;
283 for my $e (@$str){
284 my @tmp = grep _ignorableAtLevel($_,$lev), $self->getWt($e);
285 push @strWt, @tmp;
286 push @strPt, ($count) x @tmp;
287 $count += length $e;
288 while(@strWt >= @subWt){
289 if(_eqArray(\@strWt, \@subWt, $lev)){
290 my $pos = $strPt[0];
291 return wantarray ? ($pos, $count-$pos) : $pos;
292 }
293 shift @strWt;
294 shift @strPt;
295 }
296 }
297 return wantarray ? () : -1;
298}
299
300##
301## bool _eqArray(arrayref, arrayref, level)
302##
303sub _eqArray($$$)
304{
305 my $a = shift; # length $a >= length $b;
306 my $b = shift;
307 my $lev = shift;
308 for my $v (0..$lev-1){
309 for my $c (0..@$b-1){
310 return if $a->[$c][$v] != $b->[$c][$v];
311 }
45394607 312 }
d16e9e3d 313 return 1;
314}
315
316
317##
318## bool _ignorableAtLevel(CE, level)
319##
320sub _ignorableAtLevel($$)
321{
322 my $ce = shift;
323 return if ! defined $ce;
324 my $lv = shift;
325 ! grep { ! $ce->[$_] } 0..$lv-1;
326}
327
328
329##
330## string sortkey = getSortKey(string arg)
331##
332sub getSortKey
333{
334 my $self = shift;
335 my $lev = $self->{level};
336 my $rCE = $self->splitCE(shift); # get an arrayref
337
338 # weight arrays
339 my @buf = grep defined(), map $self->getWt($_), @$rCE;
45394607 340
341 # make sort key
342 my @ret = ([],[],[],[]);
343 foreach my $v (0..$lev-1){
344 foreach my $b (@buf){
345 push @{ $ret[$v] }, $b->[$v] if $b->[$v];
346 }
347 }
348 foreach (@{ $self->{backwards} }){
349 my $v = $_ - 1;
350 @{ $ret[$v] } = reverse @{ $ret[$v] };
351 }
352
353 # modification of tertiary weights
354 if($self->{upper_before_lower}){
355 foreach (@{ $ret[2] }){
356 if (0x8 <= $_ && $_ <= 0xC){ $_ -= 6 } # lower
357 elsif(0x2 <= $_ && $_ <= 0x6){ $_ += 6 } # upper
358 elsif($_ == 0x1C) { $_ += 1 } # square upper
359 elsif($_ == 0x1D) { $_ -= 1 } # square lower
360 }
361 }
362 if($self->{katakana_before_hiragana}){
363 foreach (@{ $ret[2] }){
364 if (0x0F <= $_ && $_ <= 0x13){ $_ -= 2 } # katakana
365 elsif(0x0D <= $_ && $_ <= 0x0E){ $_ += 5 } # hiragana
366 }
367 }
368 join "\0\0", map pack('n*', @$_), @ret;
369}
370
371
372##
d16e9e3d 373## int compare = cmp(string a, string b)
45394607 374##
375sub cmp
376{
377 my $obj = shift;
378 my $a = shift;
379 my $b = shift;
380 $obj->getSortKey($a) cmp $obj->getSortKey($b);
381}
382
383##
d16e9e3d 384## list[strings] sorted = sort(list[strings] arg)
45394607 385##
386sub sort
387{
388 my $obj = shift;
389
390 map { $_->[1] }
391 sort{ $a->[0] cmp $b->[0] }
392 map [ $obj->getSortKey($_), $_ ], @_;
393}
394
395##
d16e9e3d 396## list[arrayrefs] CE = _derivCE(int codepoint)
45394607 397##
398sub _derivCE
399{
400 my $code = shift;
401 my $a = UNDEFINED + ($code >> 15); # ok
402 my $b = ($code & 0x7FFF) | 0x8000; # ok
403# my $a = 0xFFC2 + ($code >> 15); # ng
404# my $b = $code & 0x7FFF | 0x1000; # ng
405 $b ? ([$a,2,1,$code],[$b,0,0,$code]) : [$a,2,1,$code];
406}
407
408##
409## "hhhh hhhh hhhh" to (dddd, dddd, dddd)
410##
411sub _getHexArray
412{
413 my $str = shift;
414 map hex(), $str =~ /([0-9a-fA-F]+)/g;
415}
416
417##
d16e9e3d 418## bool is_a_CJK_Unified_Ideograph = _isCJK(int codepoint)
45394607 419##
420sub _isCJK
421{
422 my $u = shift;
423 return 0x3400 <= $u && $u <= 0x4DB5
424 || 0x4E00 <= $u && $u <= 0x9FA5
425# || 0x20000 <= $u && $u <= 0x2A6D6;
426}
427
428##
d16e9e3d 429## list[arrayref] CE = _CJK(int codepoint_of_CJK)
45394607 430##
431sub _CJK
432{
433 my $u = shift;
434 $u > 0xFFFF ? _derivCE($u) : [$u,0x20,0x02,$u];
435}
436
437##
d16e9e3d 438## bool is_a_Hangul_Syllable = _isHangul(int codepoint)
45394607 439##
440sub _isHangul
441{
442 my $code = shift;
443 return 0xAC00 <= $code && $code <= 0xD7A3;
444}
445
4461;
447__END__
448
449=head1 NAME
450
451Unicode::Collate - use UCA (Unicode Collation Algorithm)
452
453=head1 SYNOPSIS
454
455 use Unicode::Collate;
456
457 #construct
458 $UCA = Unicode::Collate->new(%tailoring);
459
460 #sort
461 @sorted = $UCA->sort(@not_sorted);
462
463 #compare
464 $result = $UCA->cmp($a, $b); # returns 1, 0, or -1.
465
466=head1 DESCRIPTION
467
468=head2 Constructor and Tailoring
469
d16e9e3d 470The C<new> method returns a collator object.
471
45394607 472 $UCA = Unicode::Collate->new(
473 alternate => $alternate,
474 backwards => $levelNumber, # or \@levelNumbers
475 entry => $element,
476 normalization => $normalization_form,
477 ignoreName => qr/$ignoreName/,
478 ignoreChar => qr/$ignoreChar/,
479 katakana_before_hiragana => $bool,
480 level => $collationLevel,
481 overrideCJK => \&overrideCJK,
482 overrideHangul => \&overrideHangul,
483 preprocess => \&preprocess,
484 rearrange => \@charList,
485 table => $filename,
486 undefName => qr/$undefName/,
487 undefChar => qr/$undefChar/,
488 upper_before_lower => $bool,
489 );
490 # if %tailoring is false (empty),
491 # $UCA should do the default collation.
492
493=over 4
494
495=item alternate
496
497-- see 3.2.2 Alternate Weighting, UTR #10.
498
499 alternate => 'shifted', 'blanked', 'non-ignorable', or 'shift-trimmed'.
500
501By default (if specification is omitted), 'shifted' is adopted.
502
503=item backwards
504
505-- see 3.1.2 French Accents, UTR #10.
506
507 backwards => $levelNumber or \@levelNumbers
508
509Weights in reverse order; ex. level 2 (diacritic ordering) in French.
510If omitted, forwards at all the levels.
511
512=item entry
513
514-- see 3.1 Linguistic Features; 3.2.1 File Format, UTR #10.
515
516Overrides a default order or adds a new element
517
518 entry => <<'ENTRIES', # use the UCA file format
51900E6 ; [.0861.0020.0002.00E6] [.08B1.0020.0002.00E6] # ligature <ae> as <a e>
5200063 0068 ; [.0893.0020.0002.0063] # "ch" in traditional Spanish
5210043 0068 ; [.0893.0020.0008.0043] # "Ch" in traditional Spanish
522ENTRIES
523
524=item ignoreName
525
526=item ignoreChar
527
528-- see Completely Ignorable, 3.2.2 Alternate Weighting, UTR #10.
529
530Ignores the entry in the table.
531If an ignored collation element appears in the string to be collated,
532it is ignored as if the element had been deleted from there.
533
534E.g. when 'a' and 'e' are ignored,
535'element' is equal to 'lament' (or 'lmnt').
536
537=item level
538
539-- see 4.3 Form a sort key for each string, UTR #10.
540
541Set the maximum level.
542Any higher levels than the specified one are ignored.
543
544 Level 1: alphabetic ordering
545 Level 2: diacritic ordering
546 Level 3: case ordering
547 Level 4: tie-breaking (e.g. in the case when alternate is 'shifted')
548
549 ex.level => 2,
550
551=item normalization
552
553-- see 4.1 Normalize each input string, UTR #10.
554
555If specified, strings are normalized before preparation sort keys
556(the normalization is executed after preprocess).
557
558As a form name, one of the following names must be used.
559
560 'C' or 'NFC' for Normalization Form C
561 'D' or 'NFD' for Normalization Form D
562 'KC' or 'NFKC' for Normalization Form KC
563 'KD' or 'NFKD' for Normalization Form KD
564
565If omitted, the string is put into Normalization Form D.
566
567If undefined explicitly (as C<normalization =E<gt> undef>),
568any normalization is not carried out (this may make tailoring easier
569if any normalization is not desired).
570
571see B<CAVEAT>.
572
573=item overrideCJK
574
575=item overrideHangul
576
577-- see 7.1 Derived Collation Elements, UTR #10.
578
579By default, mapping of CJK Unified Ideographs
580uses the Unicode codepoint order
581and Hangul Syllables are decomposed into Hangul Jamo.
582
583The mapping of CJK Unified Ideographs
584or Hangul Syllables may be overrided.
585
586ex. CJK Unified Ideographs in the JIS codepoint order.
587
588 overrideCJK => sub {
589 my $u = shift; # get unicode codepoint
590 my $b = pack('n', $u); # to UTF-16BE
591 my $s = your_unicode_to_sjis_converter($b); # convert
592 my $n = unpack('n', $s); # convert sjis to short
593 [ $n, 1, 1 ]; # return collation element
594 },
595
596If you want to override the mapping of Hangul Syllables,
597the Normalization Forms D and KD are not appropriate
598(they will be decomposed before overriding).
599
600=item preprocess
601
602-- see 5.1 Preprocessing, UTR #10.
603
604If specified, the coderef is used to preprocess
605before the formation of sort keys.
606
607ex. dropping English articles, such as "a" or "the".
608Then, "the pen" is before "a pencil".
609
610 preprocess => sub {
611 my $str = shift;
612 $str =~ s/\b(?:an?|the)\s+//g;
613 $str;
614 },
615
616=item rearrange
617
618-- see 3.1.3 Rearrangement, UTR #10.
619
620Characters that are not coded in logical order and to be rearranged.
621By default,
622
623 rearrange => [ 0x0E40..0x0E44, 0x0EC0..0x0EC4 ],
624
625=item table
626
627-- see 3.2 Default Unicode Collation Element Table, UTR #10.
628
629You can use another element table if desired.
630The table file must be in your C<lib/Unicode/Collate> directory.
631
632By default, the file C<lib/Unicode/Collate/allkeys.txt> is used.
633
634=item undefName
635
636=item undefChar
637
638-- see 6.3.4 Reducing the Repertoire, UTR #10.
639
640Undefines the collation element as if it were unassigned in the table.
641This reduces the size of the table.
642If an unassigned character appears in the string to be collated,
643the sort key is made from its codepoint
644as a single-character collation element,
645as it is greater than any other assigned collation elements
646(in the codepoint order among the unassigned characters).
647But, it'd be better to ignore characters
648unfamiliar to you and maybe never used.
649
650=item katakana_before_hiragana
651
652=item upper_before_lower
653
654-- see 6.6 Case Comparisons; 7.3.1 Tertiary Weight Table, UTR #10.
655
656By default, lowercase is before uppercase
657and hiragana is before katakana.
658
659If the parameter is true, this is reversed.
660
661=back
662
663=head2 Other methods
664
665=over 4
666
667=item C<@sorted = $UCA-E<gt>sort(@not_sorted)>
668
669Sorts a list of strings.
670
671=item C<$result = $UCA-E<gt>cmp($a, $b)>
672
673Returns 1 (when C<$a> is greater than C<$b>)
674or 0 (when C<$a> is equal to C<$b>)
675or -1 (when C<$a> is lesser than C<$b>).
676
677=item C<$sortKey = $UCA-E<gt>getSortKey($string)>
678
679-- see 4.3 Form a sort key for each string, UTR #10.
680
681Returns a sort key.
682
683You compare the sort keys using a binary comparison
684and get the result of the comparison of the strings using UCA.
685
686 $UCA->getSortKey($a) cmp $UCA->getSortKey($b)
687
688 is equivalent to
689
690 $UCA->cmp($a, $b)
691
d16e9e3d 692=item C<$position = $UCA-E<gt>index($string, $substring)>
693
694=item C<($position, $length) = $UCA-E<gt>index($string, $substring)>
695
696-- see 6.8 Searching, UTR #10.
697
698If C<$substring> matches a part of C<$string>, returns
699the position of the first occurrence of the matching part in scalar context;
700in list context, returns a two-element list of
701the position and the length of the matching part.
702
703B<Notice> that the length of the matching part may differ from
704the length of C<$substring>.
705
706B<Note> that the position and the length are counted on the string
707after the process of preprocess, normalization, and rearrangement.
708Therefore, in case the specified string is not binary equal to
709the preprocessed/normalized/rearranged string, the position and the length
710may differ form those on the specified string. But it is guaranteed
711that, if matched, it returns a non-negative value as C<$position>.
712
713If C<$substring> does not match any part of C<$string>,
714returns C<-1> in scalar context and
715an empty list in list context.
716
717e.g. you say
718
719 my $UCA = Unicode::Collate->new( normalization => undef, level => 1 );
720 my $str = "Ich mu\x{00DF} studieren.";
721 my $sub = "m\x{00FC}ss";
722 my $match;
723 if(my @tmp = $UCA->index($str, $sub)){
724 $match = substr($str, $tmp[0], $tmp[1]);
725 }
726
727and get C<"mu\x{00DF}"> in C<$match> since C<"mu>E<223>C<">
728is primary equal to C<"m>E<252>C<ss">.
729
45394607 730=back
731
732=head2 EXPORT
733
734None by default.
735
736=head2 CAVEAT
737
738Use of the C<normalization> parameter requires
739the B<Unicode::Normalize> module.
740
741If you need not it (e.g. in the case when you need not
742handle any combining characters),
743assign C<normalization =E<gt> undef> explicitly.
744
745=head1 AUTHOR
746
747SADAHIRO Tomoyuki, E<lt>SADAHIRO@cpan.orgE<gt>
748
749 http://homepage1.nifty.com/nomenclator/perl/
750
751 Copyright(C) 2001, SADAHIRO Tomoyuki. Japan. All rights reserved.
752
753 This program is free software; you can redistribute it and/or
754 modify it under the same terms as Perl itself.
755
756=head1 SEE ALSO
757
758=over 4
759
760=item L<Lingua::KO::Hangul::Util>
761
762utility functions for Hangul Syllables
763
764=item L<Unicode::Normalize>
765
766normalized forms of Unicode text
767
768=item Unicode Collation Algorithm - Unicode TR #10
769
770http://www.unicode.org/unicode/reports/tr10/
771
772=back
773
774=cut