my %General;
my @General;
+sub gencat {
+ my ($Name, $GeneralH, $GeneralA, $Cat,
+ $name, $cat, $code, $op) = @_;
+
+ $op->($Name, $code, $name);
+ $op->($GeneralA, $code, $cat);
+
+ $op->($GeneralH->{$name} ||= [], $code, $name);
+
+ $op->($Cat->{$cat} ||= [], $code);
+ $op->($Cat->{substr($cat, 0, 1)}
+ ||= [], $code);
+ # 005F: SPACING UNDERSCORE
+ $op->($Cat->{Word} ||= [], $code)
+ if $cat =~ /^[LMN]/ or $code eq "005F";
+ $op->($Cat->{Alnum} ||= [], $code)
+ if $cat =~ /^[LMN]/;
+ $op->($Cat->{Alpha} ||= [], $code)
+ if $cat =~ /^[LM]/;
+ # 0009: HORIZONTAL TABULATION
+ # 000A: LINE FEED
+ # 000B: VERTICAL TABULATION
+ # 000C: FORM FEED
+ # 000D: CARRIAGE RETURN
+ # 0020: SPACE
+ $op->($Cat->{Space} ||= [], $code)
+ if $cat =~ /^Z/ ||
+ $code =~ /^(0009|000A|000B|000C|000D)$/;
+ $op->($Cat->{SpacePerl} ||= [], $code)
+ if $cat =~ /^Z/ ||
+ $code =~ /^(0009|000A|000C|000D)$/;
+ $op->($Cat->{Blank} ||= [], $code)
+ if $code =~ /^(0020|0009)$/ ||
+ $cat =~ /^Z[^lp]$/;
+ $op->($Cat->{Digit} ||= [], $code) if $cat eq "Nd";
+ $op->($Cat->{Upper} ||= [], $code) if $cat eq "Lu";
+ $op->($Cat->{Lower} ||= [], $code) if $cat eq "Ll";
+ $op->($Cat->{Title} ||= [], $code) if $cat eq "Lt";
+ $op->($Cat->{ASCII} ||= [], $code) if $code le "007F";
+ $op->($Cat->{Cntrl} ||= [], $code) if $cat =~ /^C/;
+ $op->($Cat->{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
+ $op->($Cat->{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
+ $op->($Cat->{Punct} ||= [], $code) if $cat =~ /^P/;
+ # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
+ $op->($Cat->{XDigit} ||= [], $code)
+ if $code =~ /^00(3[0-9]|[46][1-6])$/;
+
+}
+
if (open(my $Unicode, "Unicode.txt")) {
my @Name;
my @Bidi;
if ($name =~ /^<(.+), (First|Last)>$/) {
$name = $1;
- if ($2 eq 'First') {
- append($General{$name} ||= [], $code, $name);
- } else {
- extend($General{$name} , $code);
- }
+ gencat(\@Name, \%General, \@General, \%Cat,
+ $name, $cat, $code,
+ $2 eq 'First' ? \&append : \&extend);
unless (defined $In{$name}) {
$In{$name} = $InId++;
$InIn{$name} = $General{$name};
}
- append($Cat{$cat} ||= [], $code);
- append($Cat{substr($cat, 0, 1)}
- ||= [], $code);
} else {
- append(\@Name, $code, $name);
-
- append(\@General, $code, $cat);
-
- append($Cat{$cat} ||= [], $code);
- append($Cat{substr($cat, 0, 1)}
- ||= [], $code);
- # 005F: SPACING UNDERSCORE
- append($Cat{Word} ||= [], $code)
- if $cat =~ /^[LMN]/ or $code eq "005F";
- append($Cat{Alnum} ||= [], $code)
- if $cat =~ /^[LMN]/;
- append($Cat{Alpha} ||= [], $code)
- if $cat =~ /^[LM]/;
- # 0009: HORIZONTAL TABULATION
- # 000A: LINE FEED
- # 000B: VERTICAL TABULATION
- # 000C: FORM FEED
- # 000D: CARRIAGE RETURN
- # 0020: SPACE
- append($Cat{Space} ||= [], $code)
- if $cat =~ /^Z/ ||
- $code =~ /^(0009|000A|000B|000C|000D)$/;
- append($Cat{SpacePerl} ||= [], $code)
- if $cat =~ /^Z/ ||
- $code =~ /^(0009|000A|000C|000D)$/;
- append($Cat{Blank} ||= [], $code)
- if $code =~ /^(0020|0009)$/ ||
- $cat =~ /^Z[^lp]$/;
- append($Cat{Digit} ||= [], $code) if $cat eq "Nd";
- append($Cat{Upper} ||= [], $code) if $cat eq "Lu";
- append($Cat{Lower} ||= [], $code) if $cat eq "Ll";
- append($Cat{Title} ||= [], $code) if $cat eq "Lt";
- append($Cat{ASCII} ||= [], $code) if $code le "007F";
- append($Cat{Cntrl} ||= [], $code) if $cat =~ /^C/;
- append($Cat{Graph} ||= [], $code) if $cat =~ /^([LMNPS]|Co)/;
- append($Cat{Print} ||= [], $code) if $cat =~ /^([LMNPS]|Co|Zs)/;
- append($Cat{Punct} ||= [], $code) if $cat =~ /^P/;
- # 003[0-9]: DIGIT ZERO..NINE, 00[46][1-6]: A..F, a..f
- append($Cat{XDigit} ||= [], $code)
- if $code =~ /^00(3[0-9]|[46][1-6])$/;
-
+
+ gencat(\@Name, \%General, \@General, \%Cat,
+ $name, $cat, $code, \&append);
+
append($To{Upper} ||= [], $code, $upper) if $upper;
append($To{Lower} ||= [], $code, $lower) if $lower;
append($To{Title} ||= [], $code, $title) if $title;
#
# The mapping from General Category long forms to short forms is
# currently hardwired here since no simple data file in the UCD
-# seems to do that.
+# seems to do that. Unicode 3.2 will assumedly correct this.
#
my %Is = (
'Letter' => 'L',
- 'Uppercase Letter' => 'Lu',
- 'Lowercase Letter' => 'Ll',
- 'Titlecase Letter' => 'Lt',
- 'Modifier Letter' => 'Lm',
- 'Other Letter' => 'Lo',
+ 'Uppercase_Letter' => 'Lu',
+ 'Lowercase_Letter' => 'Ll',
+ 'Titlecase_Letter' => 'Lt',
+ 'Modifier_Letter' => 'Lm',
+ 'Other_Letter' => 'Lo',
'Mark' => 'M',
- 'Non-Spacing Mark' => 'Mn',
- 'Spacing Combining Mark' => 'Mc',
- 'Enclosing Mark' => 'Me',
+ 'Non_Spacing_Mark' => 'Mn',
+ 'Spacing_Mark' => 'Mc',
+ 'Enclosing_Mark' => 'Me',
'Separator' => 'Z',
- 'Space Separator' => 'Zs',
- 'Line Separator' => 'Zl',
- 'Paragraph Separator' => 'Zp',
+ 'Space_Separator' => 'Zs',
+ 'Line_Separator' => 'Zl',
+ 'Paragraph_Separator' => 'Zp',
'Number' => 'N',
- 'Decimal Digit Number' => 'Nd',
- 'Letter Number' => 'Nl',
- 'Other Number' => 'No',
+ 'Decimal_Number' => 'Nd',
+ 'Letter_Number' => 'Nl',
+ 'Other_Number' => 'No',
'Punctuation' => 'P',
- 'Connector Punctuation' => 'Pc',
- 'Dash Punctuation' => 'Pd',
- 'Open Punctuation' => 'Ps',
- 'Close Punctuation' => 'Pe',
- 'Initial Punctuation' => 'Pi',
- 'Final Punctuation' => 'Pf',
- 'Other Punctuation' => 'Po',
+ 'Connector_Punctuation' => 'Pc',
+ 'Dash_Punctuation' => 'Pd',
+ 'Open_Punctuation' => 'Ps',
+ 'Close_Punctuation' => 'Pe',
+ 'Initial_Punctuation' => 'Pi',
+ 'Final_Punctuation' => 'Pf',
+ 'Other_Punctuation' => 'Po',
'Symbol' => 'S',
- 'Math Symbol' => 'Sm',
- 'Currency Symbol' => 'Sc',
- 'Modifier Symbol' => 'Sk',
- 'Other Symbol' => 'So',
+ 'Math_Symbol' => 'Sm',
+ 'Currency_Symbol' => 'Sc',
+ 'Modifier_Symbol' => 'Sk',
+ 'Other_Symbol' => 'So',
'Other' => 'C',
'Control' => 'Cc',
'Format' => 'Cf',
'Surrogate' => 'Cs',
'Private Use' => 'Co',
- 'Not Assigned' => 'Cn',
- # 'Other' aliases
- 'Other Control' => 'Cc',
- 'Other Format' => 'Cf',
- 'Other Surrogate' => 'Cs',
- 'Other Private Use' => 'Co',
- 'Other Not Assigned' => 'Cn',
+ 'Unassigned' => 'Cn',
);
#
my ($ix, $code, $to) = @$prop;
my $tostr =
join "", map { sprintf "\\x{%s}", $_ } split ' ', $to;
- print $Case qq['$ix' => "$tostr",\n];
+ printf $Case qq['%04X' => "$tostr",\n], $ix;
}
print $Case <<EOT;
);
for my $code (sort { $a <=> $b } keys %Fold) {
my $foldstr =
join "", map { sprintf "\\x{%s}", $_ } split ' ', $Fold{$code};
- print $Fold qq['$code' => "$foldstr",\n];
+ printf $Fold qq['%04X' => "$foldstr",\n], $code;
}
print $Fold <<EOT;
);