From: SADAHIRO Tomoyuki Date: Sun, 29 Mar 2009 14:50:35 +0000 (+0200) Subject: Update to Unicode::Normalize 1.03 X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=51683ce6d9770c21b79198196ae37e3474eceb5f;p=p5sagit%2Fp5-mst-13.2.git Update to Unicode::Normalize 1.03 1.03 Sun Mar 29 12:56:23 2009 - mkheader: check if no composition needs growing the string. - Makefile.PL: a tweak --- diff --git a/ext/Unicode-Normalize/Changes b/ext/Unicode-Normalize/Changes index 1f1a693..e9cb391 100644 --- a/ext/Unicode-Normalize/Changes +++ b/ext/Unicode-Normalize/Changes @@ -1,5 +1,9 @@ Revision history for Perl extension Unicode::Normalize. +1.03 Sun Mar 29 12:56:23 2009 + - mkheader: check if no composition needs growing the string. + - Makefile.PL: a tweak + 1.02 Tue Jun 5 22:46:45 2007 - XSUB: mkheader, _U_stringify() - avoid unpack('C*') on unicode. - test: short.t removed - pure perl is not inapprotiate for test of diff --git a/ext/Unicode-Normalize/Makefile.PL b/ext/Unicode-Normalize/Makefile.PL index ab4bd03..a04ca62 100644 --- a/ext/Unicode-Normalize/Makefile.PL +++ b/ext/Unicode-Normalize/Makefile.PL @@ -17,6 +17,7 @@ WriteMakefile( 'VERSION_FROM' => 'Normalize.pm', # finds $VERSION 'clean' => $clean, 'PREREQ_PM' => { + bytes => 0, Carp => 0, constant => 0, DynaLoader => 0, @@ -27,5 +28,4 @@ WriteMakefile( Test => 0, warnings => 0, }, - 'MAN3PODS' => {}, ); diff --git a/ext/Unicode-Normalize/Normalize.pm b/ext/Unicode-Normalize/Normalize.pm index cb63fbf..ad5ff82 100644 --- a/ext/Unicode-Normalize/Normalize.pm +++ b/ext/Unicode-Normalize/Normalize.pm @@ -13,7 +13,7 @@ use Carp; no warnings 'utf8'; -our $VERSION = '1.02'; +our $VERSION = '1.03'; our $PACKAGE = __PACKAGE__; require Exporter; @@ -453,6 +453,8 @@ normalization implemented by this module depends on your perl's version. 5.8.1-5.8.3 4.0.0 5.8.4-5.8.6 4.0.1 (normalization is same as 4.0.0) 5.8.7-5.8.8 4.1.0 + 5.10.0 5.0.0 + 5.8.9 5.1.0 =item Correction of decomposition mapping diff --git a/ext/Unicode-Normalize/Normalize.xs b/ext/Unicode-Normalize/Normalize.xs index 792f4a3..f4bbca7 100644 --- a/ext/Unicode-Normalize/Normalize.xs +++ b/ext/Unicode-Normalize/Normalize.xs @@ -33,9 +33,11 @@ /* utf8_hop() hops back before start. Maybe broken UTF-8 */ #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start" -/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC; - according to Versioning and Stability in UAX#15, no new composition - should come in future. */ +/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC. + If Unicode would add a new composition of A + B to C + where bytes::length(A) + bytes::length(B) < bytes::length(C), + this code should be fixed. + In this case, mkheader will prevent Unicode::Normalize from building. */ #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source" /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */ diff --git a/ext/Unicode-Normalize/README b/ext/Unicode-Normalize/README index d7d1635..8a5390c 100644 --- a/ext/Unicode-Normalize/README +++ b/ext/Unicode-Normalize/README @@ -1,4 +1,4 @@ -Unicode/Normalize version 1.02 +Unicode/Normalize version 1.03 =================================== Unicode::Normalize - Unicode Normalization Forms diff --git a/ext/Unicode-Normalize/mkheader b/ext/Unicode-Normalize/mkheader index f3ab5e2..b3e3c31 100644 --- a/ext/Unicode-Normalize/mkheader +++ b/ext/Unicode-Normalize/mkheader @@ -83,6 +83,30 @@ sub decomposeHangul { return @ret; } +########## length of a character ########## + +sub utf8len { + my $uv = shift; + return $uv < 0x80 ? 1 : + $uv < 0x800 ? 2 : + $uv < 0x10000 ? 3 : + $uv < 0x110000 ? 4 : + croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; +} + +sub utfelen { + my $uv = shift; + return $uv < 0xA0 ? 1 : + $uv < 0x400 ? 2 : + $uv < 0x4000 ? 3 : + $uv < 0x40000 ? 4 : + $uv < 0x110000 ? 5 : + croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff."; +} + +my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " . + "needs growing the string in %s! Quit. Please inform the author..."; + ########## getting full decomposion ########## { my($f, $fh); @@ -113,9 +137,9 @@ while ($Combin =~ /(.+)/g) { my @tab = split /\t/, $1; my $ini = hex $tab[0]; if ($tab[1] eq '') { - $Combin{ $ini } = $tab[2]; + $Combin{$ini} = $tab[2]; } else { - $Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]); + $Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]); } } @@ -123,54 +147,43 @@ while ($Decomp =~ /(.+)/g) { my @tab = split /\t/, $1; my $compat = $tab[2] =~ s/<[^>]+>//; my $dec = [ _getHexArray($tab[2]) ]; # decomposition - my $ini = hex($tab[0]); # initial decomposable character + my $ini = hex($tab[0]); + my $end = $tab[1] eq '' ? $ini : hex($tab[1]); + # ($ini .. $end) is the range of decomposable characters. my $listname = @$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS'; # %04x is bad since it'd place _3046 after _1d157. - if ($tab[1] eq '') { - $Compat{ $ini } = $dec; + foreach my $u ($ini .. $end) { + $Compat{$u} = $dec; if (! $compat) { - $Canon{ $ini } = $dec; + $Canon{$u} = $dec; if (@$dec == 2) { + if (utf8len($dec->[0]) + utf8len($dec->[1]) < utf8len($u)) { + croak sprintf $errExpand, $u, $dec->[0], $dec->[1], + "utf-8"; + } + if (utfelen($dec->[0]) + utfelen($dec->[1]) < utfelen($u)) { + croak sprintf $errExpand, $u, $dec->[0], $dec->[1], + "utf-ebcdic"; + } + if ($Combin{ $dec->[0] }) { - $NonStD{ $ini } = 1; + $NonStD{$u} = 1; } else { - $CompList{ $listname }{ $dec->[1] } = $ini; + $CompList{ $listname }{ $dec->[1] } = $u; $Comp1st{ $dec->[0] } = $listname; - $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini}; + $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; } } elsif (@$dec == 1) { - $Single{ $ini } = 1; + $Single{$u} = 1; } else { croak("Weird Canonical Decomposition of U+$tab[0]"); } } - } else { - foreach my $u ($ini .. hex($tab[1])) { - $Compat{ $u } = $dec; - - if (! $compat) { - $Canon{ $u } = $dec; - - if (@$dec == 2) { - if ($Combin{ $dec->[0] }) { - $NonStD{ $u } = 1; - } else { - $CompList{ $listname }{ $dec->[1] } = $u; - $Comp1st{ $dec->[0] } = $listname; - $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u}; - } - } elsif (@$dec == 1) { - $Single{ $u } = 1; - } else { - croak("Weird Canonical Decomposition of U+$tab[0]"); - } - } - } } }