From: SADAHIRO Tomoyuki <SADAHIRO@cpan.org>
Date: Sun, 29 Mar 2009 14:50:35 +0000 (+0200)
Subject: Update to Unicode::Normalize 1.03
X-Git-Url: http://git.shadowcat.co.uk/gitweb/gitweb.cgi?a=commitdiff_plain;h=51683ce6d9770c21b79198196ae37e3474eceb5f;p=p5sagit%2Fp5-mst-13.2.git

Update to Unicode::Normalize 1.03

1.03  Sun Mar 29 12:56:23 2009
  - mkheader: check that no composition needs to grow the string.
  - Makefile.PL: a tweak
---

diff --git a/ext/Unicode-Normalize/Changes b/ext/Unicode-Normalize/Changes
index 1f1a693..e9cb391 100644
--- a/ext/Unicode-Normalize/Changes
+++ b/ext/Unicode-Normalize/Changes
@@ -1,5 +1,9 @@
 Revision history for Perl extension Unicode::Normalize.
 
+1.03  Sun Mar 29 12:56:23 2009
+    - mkheader: check that no composition needs to grow the string.
+    - Makefile.PL: a tweak
+
 1.02  Tue Jun  5 22:46:45 2007
     - XSUB: mkheader, _U_stringify() - avoid unpack('C*') on unicode.
     - test: short.t removed - pure perl is not inapprotiate for test of
diff --git a/ext/Unicode-Normalize/Makefile.PL b/ext/Unicode-Normalize/Makefile.PL
index ab4bd03..a04ca62 100644
--- a/ext/Unicode-Normalize/Makefile.PL
+++ b/ext/Unicode-Normalize/Makefile.PL
@@ -17,6 +17,7 @@ WriteMakefile(
     'VERSION_FROM'	=> 'Normalize.pm', # finds $VERSION
     'clean'		=> $clean,
     'PREREQ_PM'	  	=> {
+	bytes		=> 0,
 	Carp		=> 0,
 	constant	=> 0,
 	DynaLoader	=> 0,
@@ -27,5 +28,4 @@ WriteMakefile(
 	Test		=> 0,
 	warnings	=> 0,
     },
-    'MAN3PODS'		=> {},
 );
diff --git a/ext/Unicode-Normalize/Normalize.pm b/ext/Unicode-Normalize/Normalize.pm
index cb63fbf..ad5ff82 100644
--- a/ext/Unicode-Normalize/Normalize.pm
+++ b/ext/Unicode-Normalize/Normalize.pm
@@ -13,7 +13,7 @@ use Carp;
 
 no warnings 'utf8';
 
-our $VERSION = '1.02';
+our $VERSION = '1.03';
 our $PACKAGE = __PACKAGE__;
 
 require Exporter;
@@ -453,6 +453,8 @@ normalization implemented by this module depends on your perl's version.
      5.8.1-5.8.3          4.0.0
      5.8.4-5.8.6          4.0.1 (normalization is same as 4.0.0)
      5.8.7-5.8.8          4.1.0
+       5.10.0             5.0.0
+       5.8.9              5.1.0
 
 =item Correction of decomposition mapping
 
diff --git a/ext/Unicode-Normalize/Normalize.xs b/ext/Unicode-Normalize/Normalize.xs
index 792f4a3..f4bbca7 100644
--- a/ext/Unicode-Normalize/Normalize.xs
+++ b/ext/Unicode-Normalize/Normalize.xs
@@ -33,9 +33,11 @@
 /* utf8_hop() hops back before start. Maybe broken UTF-8 */
 #define ErrHopBeforeStart "panic (Unicode::Normalize): hopping before start"
 
-/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC;
-   according to Versioning and Stability in UAX#15, no new composition
-   should come in future. */
+/* It should never happen as there is no instance in UTF-8 and UTF-EBCDIC.
+   If Unicode were to add a new composition of A + B to C
+   where bytes::length(A) + bytes::length(B) < bytes::length(C),
+   this code would need to be fixed.
+   In that case, mkheader will prevent Unicode::Normalize from building. */
 #define ErrLongerThanSrc "panic (Unicode::Normalize %s): longer than source"
 
 /* uvuni_to_utf8 wants UTF8_MAXBYTES free bytes available */
diff --git a/ext/Unicode-Normalize/README b/ext/Unicode-Normalize/README
index d7d1635..8a5390c 100644
--- a/ext/Unicode-Normalize/README
+++ b/ext/Unicode-Normalize/README
@@ -1,4 +1,4 @@
-Unicode/Normalize version 1.02
+Unicode/Normalize version 1.03
 ===================================
 
 Unicode::Normalize - Unicode Normalization Forms
diff --git a/ext/Unicode-Normalize/mkheader b/ext/Unicode-Normalize/mkheader
index f3ab5e2..b3e3c31 100644
--- a/ext/Unicode-Normalize/mkheader
+++ b/ext/Unicode-Normalize/mkheader
@@ -83,6 +83,30 @@ sub decomposeHangul {
     return @ret;
 }
 
+########## length of a character ##########
+
+sub utf8len {  # byte count (1..4) of code point $uv in UTF-8
+  my $uv = shift;  # code point as a number
+  return $uv < 0x80 ? 1 :  # each row: upper bound (exclusive) => length
+        $uv < 0x800 ? 2 :
+      $uv < 0x10000 ? 3 :
+     $uv < 0x110000 ? 4 :  # last accepted range ends at U+10FFFF
+  croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
+}
+
+sub utfelen {  # byte count (1..5) of code point $uv; used for the utf-ebcdic check
+  my $uv = shift;  # code point as a number
+  return $uv < 0xA0 ? 1 :  # each row: upper bound (exclusive) => length
+        $uv < 0x400 ? 2 :
+       $uv < 0x4000 ? 3 :
+      $uv < 0x40000 ? 4 :
+     $uv < 0x110000 ? 5 :  # last accepted range ends at U+10FFFF
+  croak "$PACKAGE: illegal char in the composite. codepoint max is 0x10ffff.";
+}
+
+my $errExpand = "$PACKAGE: Composition to U+%04X (from U+%04X and U+%04X) " .
+    "needs growing the string in %s! Quit. Please inform the author...";
+
 ########## getting full decomposion ##########
 {
     my($f, $fh);
@@ -113,9 +137,9 @@ while ($Combin =~ /(.+)/g) {
     my @tab = split /\t/, $1;
     my $ini = hex $tab[0];
     if ($tab[1] eq '') {
-	$Combin{ $ini } = $tab[2];
+	$Combin{$ini} = $tab[2];
     } else {
-	$Combin{ $_ } = $tab[2] foreach $ini .. hex($tab[1]);
+	$Combin{$_} = $tab[2] foreach $ini .. hex($tab[1]);
     }
 }
 
@@ -123,54 +147,43 @@ while ($Decomp =~ /(.+)/g) {
     my @tab = split /\t/, $1;
     my $compat = $tab[2] =~ s/<[^>]+>//;
     my $dec = [ _getHexArray($tab[2]) ]; # decomposition
-    my $ini = hex($tab[0]); # initial decomposable character
+    my $ini = hex($tab[0]);
+    my $end = $tab[1] eq '' ? $ini : hex($tab[1]);
+    # ($ini .. $end) is the range of decomposable characters.
 
     my $listname =
 	@$dec == 2 ? sprintf("${structname}_%06x", $dec->[0]) : 'USELESS';
 		# %04x is bad since it'd place _3046 after _1d157.
 
-    if ($tab[1] eq '') {
-	$Compat{ $ini } = $dec;
+    foreach my $u ($ini .. $end) {
+	$Compat{$u} = $dec;
 
 	if (! $compat) {
-	    $Canon{ $ini } = $dec;
+	    $Canon{$u} = $dec;
 
 	    if (@$dec == 2) {
+		if (utf8len($dec->[0]) + utf8len($dec->[1]) < utf8len($u)) {
+		    croak sprintf $errExpand, $u, $dec->[0], $dec->[1],
+				  "utf-8";
+		}
+		if (utfelen($dec->[0]) + utfelen($dec->[1]) < utfelen($u)) {
+		    croak sprintf $errExpand, $u, $dec->[0], $dec->[1],
+				  "utf-ebcdic";
+		}
+
 		if ($Combin{ $dec->[0] }) {
-		    $NonStD{ $ini } = 1;
+		    $NonStD{$u} = 1;
 		} else {
-		    $CompList{ $listname }{ $dec->[1] } = $ini;
+		    $CompList{ $listname }{ $dec->[1] } = $u;
 		    $Comp1st{ $dec->[0] } = $listname;
-		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$ini};
+		    $Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
 		}
 	    } elsif (@$dec == 1) {
-		$Single{ $ini } = 1;
+		$Single{$u} = 1;
 	    } else {
 		croak("Weird Canonical Decomposition of U+$tab[0]");
 	    }
 	}
-    } else {
-	foreach my $u ($ini .. hex($tab[1])) {
-	    $Compat{ $u } = $dec;
-
-	    if (! $compat) {
-		$Canon{ $u } = $dec;
-
-		if (@$dec == 2) {
-		    if ($Combin{ $dec->[0] }) {
-			$NonStD{ $u } = 1;
-		    } else {
-			$CompList{ $listname }{ $dec->[1] } = $u;
-			$Comp1st{ $dec->[0] } = $listname;
-			$Comp2nd{ $dec->[1] } = 1 if ! $Exclus{$u};
-		    }
-		} elsif (@$dec == 1) {
-		    $Single{ $u } = 1;
-		} else {
-		    croak("Weird Canonical Decomposition of U+$tab[0]");
-		}
-	    }
-	}
     }
 }