# This file is built by mktables.PL from e.g. Unicode.txt.
# Any changes made here will be lost!
return <<'END';
+0621 U
0622 0625 R
0626 D
0627 R
06cf R
06d0 06d1 D
06d2 06d3 R
-06d5 U
+06d5 R
06fa 06fc D
0710 R
0712 0714 D
072a R
072b D
072c R
+200d C
END
# This file is built by mktables.PL from e.g. Unicode.txt.
# Any changes made here will be lost!
return <<'END';
+0621 <no shaping>
0622 0623 ALEF
0624 WAW
0625 ALEF
06cf WAW
06d0 06d1 YEH
06d2 06d3 YEH BARREE
-06d5 <no shaping>
+06d5 TEH MARBUTA
06fa SEEN
06fb SAD
06fc AIN
072a DALATH RISH
072b SHIN
072c TAW
+200d <no shaping>
END
-# ArabicShaping-3.txt
+# ArabicShaping-4.txt
#
# This file is a normative contributory data file in the
# Unicode Character Database.
# D dual-joining, U non-joining
# The fourth field defines the joining group.
#
+#
+# Note: Characters of joining type T and most characters of
+# joining type U are not explicitly listed in this file.
+#
+# Characters of joining type T can be derived by the following formula:
+# T = Mn + Cf - ZWNJ - ZWJ
+# where Mn and Cf are the general category values. In other words,
+# any non-spacing mark or any format control character, except
+# U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO WIDTH
+# JOINER (joining type C).
+#
+# For an explicit listing of characters of joining type T, see
+# the derived property file DerivedJoiningType.txt.
+#
+# There are currently no characters of type L defined in Unicode.
+#
+# Joining type U includes all characters which are neither joining
+# type T, nor explicitly marked in this file as types R, L, D, or C.
+#
# #############################################################
# Unicode; Schematic Name; Joining Type; Joining Group
# Arabic characters
+0621; HAMZA; U; <no shaping>
0622; MADDA ON ALEF; R; ALEF
0623; HAMZA ON ALEF; R; ALEF
0624; HAMZA ON WAW; R; WAW
06D1; YEH WITH 3 DOTS BELOW; D; YEH
06D2; YEH BARREE; R; YEH BARREE
06D3; HAMZA ON YEH BARREE; R; YEH BARREE
-06D5; AE; U; <no shaping>
+06D5; AE; R; TEH MARBUTA
06FA; SEEN WITH DOT BELOW AND 3 DOTS ABOVE; D; SEEN
06FB; DAD WITH DOT BELOW; D; SAD
06FC; GHAIN WITH DOT BELOW; D; AIN
072A; RISH; R; DALATH RISH
072B; SHIN; D; SHIN
072C; TAW; R; TAW
+
+# Other
+
+200D; ZERO WIDTH JOINER; C; <no shaping>
-# CaseFolding-3.txt
+# CaseFolding-4.txt
#
# Case Folding Properties
#
# case differences (according to UnicodeData.txt and SpecialCasing.txt)
# are eliminated.
#
-# The data supports both implemenations that require simple case foldings
-# (where string lengths don't change), and implemenations that allow full case folding
+# The data supports both implementations that require simple case foldings
+# (where string lengths don't change), and implementations that allow full case folding
# (where string lengths may grow). Note that where they can be supported, the
# full case foldings are superior: for example, they allow "MASSE" and "Maße" to match.
#
-# EastAsianWidth-4.txt
+# EastAsianWidth-5.txt
#
# East Asian Width Properties
#
00AB;N # LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
00AC;Na # NOT SIGN
00AD;A # SOFT HYPHEN
-00AE;N # REGISTERED SIGN
+00AE;A # REGISTERED SIGN
00AF;Na # MACRON
00B0;A # DEGREE SIGN
00B1;A # PLUS-MINUS SIGN
0148;A # LATIN SMALL LETTER N WITH CARON
0149;A # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
014A;A # LATIN CAPITAL LETTER ENG
-014B;N # LATIN SMALL LETTER ENG
+014B;A # LATIN SMALL LETTER ENG
014C;N # LATIN CAPITAL LETTER O WITH MACRON
014D;A # LATIN SMALL LETTER O WITH MACRON
014E;N # LATIN CAPITAL LETTER O WITH BREVE
02C1;N # MODIFIER LETTER REVERSED GLOTTAL STOP
02C2;N # MODIFIER LETTER LEFT ARROWHEAD
02C3;N # MODIFIER LETTER RIGHT ARROWHEAD
-02C4;N # MODIFIER LETTER UP ARROWHEAD
+02C4;A # MODIFIER LETTER UP ARROWHEAD
02C5;N # MODIFIER LETTER DOWN ARROWHEAD
02C6;N # MODIFIER LETTER CIRCUMFLEX ACCENT
02C7;A # CARON
02DC;N # SMALL TILDE
02DD;A # DOUBLE ACUTE ACCENT
02DE;N # MODIFIER LETTER RHOTIC HOOK
-02DF;N # MODIFIER LETTER CROSS ACCENT
+02DF;A # MODIFIER LETTER CROSS ACCENT
02E0;N # MODIFIER LETTER SMALL GAMMA
02E1;N # MODIFIER LETTER SMALL L
02E2;N # MODIFIER LETTER SMALL S
201F;N # DOUBLE HIGH-REVERSED-9 QUOTATION MARK
2020;A # DAGGER
2021;A # DOUBLE DAGGER
-2022;N # BULLET
+2022;A # BULLET
2023;N # TRIANGULAR BULLET
-2024;N # ONE DOT LEADER
+2024;A # ONE DOT LEADER
2025;A # TWO DOT LEADER
2026;A # HORIZONTAL ELLIPSIS
2027;A # HYPHENATION POINT
203B;A # REFERENCE MARK
203C;N # DOUBLE EXCLAMATION MARK
203D;N # INTERROBANG
-203E;N # OVERLINE
+203E;A # OVERLINE
203F;N # UNDERTIE
2040;N # CHARACTER TIE
2041;N # CARET INSERTION POINT
2113;A # SCRIPT SMALL L
2114;N # L B BAR SYMBOL
2115;N # DOUBLE-STRUCK CAPITAL N
-2116;N # NUMERO SIGN
+2116;A # NUMERO SIGN
2117;N # SOUND RECORDING COPYRIGHT
2118;N # SCRIPT CAPITAL P
2119;N # DOUBLE-STRUCK CAPITAL P
2138;N # DALET SYMBOL
2139;N # INFORMATION SOURCE
213A;N # ROTATED CAPITAL Q
-2153;N # VULGAR FRACTION ONE THIRD
+2153;A # VULGAR FRACTION ONE THIRD
2154;A # VULGAR FRACTION TWO THIRDS
2155;A # VULGAR FRACTION ONE FIFTH
2156;N # VULGAR FRACTION TWO FIFTHS
2159;N # VULGAR FRACTION ONE SIXTH
215A;N # VULGAR FRACTION FIVE SIXTHS
215B;A # VULGAR FRACTION ONE EIGHTH
-215C;N # VULGAR FRACTION THREE EIGHTHS
-215D;N # VULGAR FRACTION FIVE EIGHTHS
+215C;A # VULGAR FRACTION THREE EIGHTHS
+215D;A # VULGAR FRACTION FIVE EIGHTHS
215E;A # VULGAR FRACTION SEVEN EIGHTHS
215F;N # FRACTION NUMERATOR ONE
2160;A # ROMAN NUMERAL ONE
21B5;N # DOWNWARDS ARROW WITH CORNER LEFTWARDS
21B6;N # ANTICLOCKWISE TOP SEMICIRCLE ARROW
21B7;N # CLOCKWISE TOP SEMICIRCLE ARROW
-21B8;N # NORTH WEST ARROW TO LONG BAR
-21B9;N # LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR
+21B8;A # NORTH WEST ARROW TO LONG BAR
+21B9;A # LEFTWARDS ARROW TO BAR OVER RIGHTWARDS ARROW TO BAR
21BA;N # ANTICLOCKWISE OPEN CIRCLE ARROW
21BB;N # CLOCKWISE OPEN CIRCLE ARROW
21BC;N # LEFTWARDS HARPOON WITH BARB UPWARDS
21E4;N # LEFTWARDS ARROW TO BAR
21E5;N # RIGHTWARDS ARROW TO BAR
21E6;N # LEFTWARDS WHITE ARROW
-21E7;N # UPWARDS WHITE ARROW
+21E7;A # UPWARDS WHITE ARROW
21E8;N # RIGHTWARDS WHITE ARROW
21E9;N # DOWNWARDS WHITE ARROW
21EA;N # UPWARDS WHITE ARROW FROM BAR
273A;N # SIXTEEN POINTED ASTERISK
273B;N # TEARDROP-SPOKED ASTERISK
273C;N # OPEN CENTRE TEARDROP-SPOKED ASTERISK
-273D;N # HEAVY TEARDROP-SPOKED ASTERISK
+273D;A # HEAVY TEARDROP-SPOKED ASTERISK
273E;N # SIX PETALLED BLACK AND WHITE FLORETTE
273F;N # BLACK FLORETTE
2740;N # WHITE FLORETTE
<tbody>
<tr>
<td valign="top" width="144">Revision</td>
- <td valign="top">3.1.0</td>
+ <td valign="top">3.1.1</td>
</tr>
<tr>
<td valign="top" width="144">Authors</td>
</tr>
<tr>
<td valign="top" width="144">Date</td>
- <td valign="top">2001-02-28</td>
+ <td valign="top">2001-07-12</td>
</tr>
<tr>
<td valign="top" width="144">This Version</td>
<td valign="top"><a
- href="http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.html">http://www.unicode.org/Public/3.1-Update/PropList-3.1.0.html</a></td>
+ href="http://www.unicode.org/Public/3.1-Update1/PropList-3.1.1.html">http://www.unicode.org/Public/3.1-Update1/PropList-3.1.1.html</a></td>
</tr>
<tr>
<td valign="top" width="144">Previous Version</td>
functions for control of cursive joining and ligation.</td>
</tr>
<tr>
+ <th valign="top">ASCII_Hex_Digit</th>
+ <th valign="top">N</th>
+ <td valign="top">ASCII characters commonly used for the representation of
+ hexadecimal numbers.</td>
+ </tr>
+ <tr>
<th valign="top">Dash</th>
<th valign="top">I</th>
<td valign="top">Those punctuation characters explicitly called out as
-# PropList-3.1.0.txt
+# PropList-3.1.1.txt
#
# Unicode Character Database: Extended Properties
# For documentation, see PropList.html
-# Date: 2001-03-02 00:06:33.9 GMT [MD]
+# Date: 2001-07-12 14:15:00.0 PDT [KW]
# Note: Unassigned and Noncharacter codepoints are omitted,
# except when listing Noncharacter or Cn.
# ================================================
FF10..FF19 ; Hex_Digit # Nd [10] FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
FF21..FF26 ; Hex_Digit # L& [6] FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER F
FF41..FF46 ; Hex_Digit # L& [6] FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER F
-1D7CE..1D7FF ; Hex_Digit # Nd [50] MATHEMATICAL BOLD DIGIT ZERO..MATHEMATICAL MONOSPACE DIGIT NINE
-# Total code points: 94
+# Total code points: 44
+
+# ================================================
+
+0030..0039 ; ASCII_Hex_Digit # Nd [10] DIGIT ZERO..DIGIT NINE
+0041..0046 ; ASCII_Hex_Digit # L& [6] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F
+0061..0066 ; ASCII_Hex_Digit # L& [6] LATIN SMALL LETTER A..LATIN SMALL LETTER F
+
+# Total code points: 22
# ================================================
by which the Perl distribution tries to live. The renamings are listed
in the file 'rename'.
-The two big files, NormalizationTest.txt (1.7MB) and Unihan.txt (15.8MB)
-were not copied due to space considerations.
+The two big files, NormalizationTest.txt (2.0MB) and Unihan.txt (15.8MB)
+were not copied due to space considerations. Also not included are the
+derived files:
+
+ DerivedBidiClass.txt
+ DerivedBinaryProperties.txt
+ DerivedCombiningClass.txt
+ DerivedCoreProperties.txt
+ DerivedDecompositionType.txt
+ DerivedEastAsianWidth.txt
+ DerivedGeneralCategory.txt
+ DerivedJoiningGroup.txt
+ DerivedJoiningType.txt
+ DerivedLineBreak.txt
+ DerivedNormalizationProperties.txt
+ DerivedNumericType.txt
+ DerivedNumericValues.txt
+ DerivedProperties.html
The *.pl files are generated from these files by the 'mktables.PL' script.
-March 30, 2001
+August 10, 2001
This directory contains the Unicode Character Database
data files.
Currently, the Unicode Character Database files are at
the version level:
- Unicode Standard, Version 3.1.
+ Unicode Standard, Version 3.1.1
For information about the standard itself, see
UAX #27, Unicode 3.1. <http://www.unicode.org/unicode/reports/tr27/>
+and the Unicode 3.1.1 Update Notice.
+<http://www.unicode.org/versions/Unicode3.1.1.html>
Detailed documentation of the files constituting the
Unicode Character Database (contributory data files for
files.
Unihan.txt is a very large file. For convenience, the current
-Unicode 3.1 version of Unihan.txt is also available in
-three compressed formats in the Unicode 3.1 update directory.
-See: <http://www.unicode.org/Public/3.1-Update/> or
-<ftp://ftp.unicode.org/Public/3.1-Update/>
+Unicode 3.1.1 version of Unihan.txt is also available in
+two compressed formats in the Unicode 3.1.1 update directory.
+See: <http://www.unicode.org/Public/3.1-Update1/> or
+<ftp://ftp.unicode.org/Public/3.1-Update1/>
-Unihan-3.1.zip for Windows. (Use winzip)
-Unihan-3.1.Z for Unix. (Use uncompress)
-Unihan-3.1.gz for Unix. (Use gzip)
-
-The Unicode 3.1 update directory also contains a winzip
-version of all the other contributory data files for
-that update directory. That file is named:
-
-UCDwithoutUnihan.zip (Use winzip)
-
-UCDwithoutUnihan.zip is a convenient way to download
-most of the Unicode Character Database files, but to
-be complete, be sure to also get ArabicShaping.txt,
-BidiMirroring.txt, and Jamo.txt, which were unchanged
-between Unicode 3.0.1 and Unicode 3.1, and which were
-therefore not included in the 3.1-Update directory.
+Unihan-3.1.1.zip for Windows. (Use winzip)
+Unihan-3.1.1.txt.gz for Unix. (Use gzip or gunzip)
Note that the files are zipped in
exactly the same format they have on the server (with Unix
line endings). From a browser, right-clicking on
-UCDwithoutUnihan.zip will allow automatic download and unzipping on a
+Unihan-3.1.1.zip will allow automatic download and unzip on a
Windows system with winzip installed.
-# SpecialCasing-4.txt
+# SpecialCasing-5.txt
#
# Special Casing Properties
#
# one character, they are separated by spaces.
#
# The <condition_list> is optional. Where present, it consists of one or more locales or contexts,
-# separated by spaces.
-# A condition list overrides the normal behavior if all of the listed conditions are true.
-# Case distinctions in the condition list are not significant.
-# Conditions preceded by "NON_" represent the negation of the condition
+# separated by spaces. In these conditions:
+# - A condition list overrides the normal behavior if all of the listed conditions are true.
+# - Case distinctions in the condition list are not significant.
+# - Conditions preceded by "NOT_" represent the negation of the condition.
+# - A cased letter is any character with general category = Ll or Lo or Lt
+# - An ignorable sequence is a sequence of *zero* or more characters from
+# the set {HYPHEN, SOFT HYPHEN, general category = Mn}.
#
# A locale is defined as:
# <locale> := <ISO_639_code> ( "_" <ISO_3166_code> ( "_" <variant> )? )?
# <ISO_3166_code> := 2-letter ISO country code,
# <ISO_639_code> := 2-letter ISO language code
#
-# A context is one of the following choices:
-# FINAL: The letter is not followed by a letter of general category L* (e.g. Ll, Lt, Lu, Lm, or Lo).
-# MODERN: The mapping is only used for modern text.
-# AFTER_i: The last base character was "i" 0069
+# A context is a locale or one of the following choices:
+# CFINAL: The character is not followed by a sequence consisting of
+# an ignorable sequence and then a cased letter.
+# CINITIAL: The character is not preceded by a sequence consisting of
+# a cased letter and an ignorable sequence.
+# FINAL_SIGMA: CFINAL and NOT_CINITIAL
+# TYPE_i: The character is "i" (0069), "j" (006A),
+# or has a canonical decomposition that begins with an "i" or "j"
+# but has no combining characters above (i.e., i-ogonek (012F),
+# i-tilde-below (1E2D), or i-dot-below (1ECB)).
+# AFTER_i: The last preceding base character was TYPE_i, and
+# no combining character class 230 (above) has intervened.
+# MORE_ABOVE: The character is followed by one or more characters of
+# combining class 230 (ABOVE) in the combining character sequence
#
# Other than as used to separate elements, spaces are to be ignored.
#
# Special case for final form of sigma
-03A3; 03C2; 03A3; 03A3; FINAL; # GREEK CAPITAL LETTER SIGMA
+03A3; 03C2; 03A3; 03A3; FINAL_SIGMA; # GREEK CAPITAL LETTER SIGMA
# Note: the following cases for non-final are already in the UnicodeData file.
# Note: the following cases are not included, since they would case-fold in lowercasing
-# 03C3; 03C2; 03A3; 03A3; FINAL; # GREEK SMALL LETTER SIGMA
-# 03C2; 03C3; 03A3; 03A3; NON_FINAL; # GREEK SMALL LETTER FINAL SIGMA
+# 03C3; 03C2; 03A3; 03A3; FINAL_SIGMA; # GREEK SMALL LETTER SIGMA
+# 03C2; 03C3; 03A3; 03A3; NOT_FINAL_SIGMA; # GREEK SMALL LETTER FINAL SIGMA
# ================================================================================
# Locale-sensitive mappings
# ================================================================================
-# Lithuanian
+# Lithuanian
-0307; 0307; ; ; lt AFTER_i; # Remove DOT ABOVE after "i" with upper or titlecase
+# Lithuanian retains the dot in a lowercase i when followed by accents.
-# Turkish, Azeri
+# Remove DOT ABOVE after "i" with upper or titlecase
+
+0307; 0307; ; ; lt AFTER_i # COMBINING DOT ABOVE
+
+# Introduce an explicit dot above when lowercasing capital I's and J's
+# whenever there are more accents above
+# (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
+
+0049; 0069 0307; 0049; 0049; lt MORE_ABOVE # LATIN CAPITAL LETTER I
+004A; 006A 0307; 004A; 004A; lt MORE_ABOVE # LATIN CAPITAL LETTER J
+012E; 012F 0307; 012E; 012E; lt MORE_ABOVE # LATIN CAPITAL LETTER I WITH OGONEK
+00CC; 0069 0307 0300; 00CC; 00CC; lt # LATIN CAPITAL LETTER I WITH GRAVE
+00CD; 0069 0307 0301; 00CD; 00CD; lt # LATIN CAPITAL LETTER I WITH ACUTE
+0128; 0069 0307 0303; 0128; 0128; lt # LATIN CAPITAL LETTER I WITH TILDE
+
+# ================================================================================
+
+# Turkish and Azeri
+
+# I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
+# The following rules handle those cases.
+
+# Remove spurious dot above small i's when lowercasing, if there are no more accents above:
+
+0307; ; 0307; 0307; tr AFTER_i NOT_MORE_ABOVE # COMBINING DOT ABOVE
+0307; ; 0307; 0307; az AFTER_i NOT_MORE_ABOVE # COMBINING DOT ABOVE
+
+# Fix case pairs
0049; 0131; 0049; 0049; tr; # LATIN CAPITAL LETTER I
0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
-ArabicShaping.txt ArabShap.txt
-BidiMirroring.txt BidiMirr.txt
-#Blocks.txt Blocks.txt
-CaseFolding.txt CaseFold.txt
-CompositionExclusions.txt CompExcl.txt
-EastAsianWidth.txt EAWidth.txt
-#Index.txt Index.txt
-#Jamo.txt Jamo.txt
-LineBreak.txt LineBrk.txt
-#NamesList.html NamesList.html
-#NamesList.txt NamesList.txt
-#PropList.txt PropList.txt
-#PropList.html PropList.html
-#ReadMe.txt ReadMe.txt
-SpecialCasing.txt SpecCase.txt
-UnicodeCharacterDatabase.html UCD.html
-UnicodeData.html Unicode.html
-UnicodeData.txt Unicode.txt
+#!/bin/sh
+
+mv ArabicShaping.txt ArabShap.txt
+mv BidiMirroring.txt BidiMirr.txt
+#Blocks.txt Blocks.txt
+mv CaseFolding.txt CaseFold.txt
+mv CompositionExclusions.txt CompExcl.txt
+mv EastAsianWidth.txt EAWidth.txt
+#Index.txt Index.txt
+#Jamo.txt Jamo.txt
+mv LineBreak.txt LineBrk.txt
+#NamesList.html NamesList.html
+#NamesList.txt NamesList.txt
+#PropList.txt PropList.txt
+#PropList.html PropList.html
+#ReadMe.txt ReadMe.txt
+mv SpecialCasing.txt SpecCase.txt
+mv UnicodeCharacterDatabase.html UCD.html
+mv UnicodeData.html Unicode.html
+mv UnicodeData.txt Unicode.txt